Text classification & Sentiment analysis
--

Text classification – The aim of text classification is to automatically classify the text documents based on pretrained categories.

Applications:
--
1. Sentiment Analysis
2. Document classification
3. Spam – ham mail classification
4. Resume shortlisting
5. Document summarization

Problem
--
to do : Spam - ham classification using machine learning.

Solution
--
If you observe, your Gmail has a folder called “Spam.” It will basically
classify your emails into spam and ham so that you don’t have to read
unnecessary emails.

In [1]:
# Let’s follow the step-by-step method to build the classifier.

# Step 1 :  Data collection and understanding
# Please download data from the below link and save it in your working directory:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset#spam.csv

import pandas as pd
#Read the data
Email_Data = pd.read_csv("C:\Program Files\Python36\suven\Adv ML\datasets\datasets/spam.csv",encoding ='latin1')

#Data undestanding
Email_Data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [2]:
Email_Data = Email_Data[['v1', 'v2']]
Email_Data = Email_Data.rename(columns={"v1":"Target","v2":"Email"})
Email_Data.head()

Unnamed: 0,Target,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# step 2 : Text processing and feature engineering

# all imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
from textblob import TextBlob
from nltk.stem import PorterStemmer
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import sklearn.feature_extraction.text as text
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

In [4]:
#pre processing steps like lower case, stemming and lemmatization
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: " ".join(x.lower() for x in x.split()))
stop = stopwords.words('english')
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
st = PorterStemmer()

Email_Data['Email'] = Email_Data['Email'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
Email_Data.head()

Unnamed: 0,Target,Email
0,ham,"go jurong point, crazy.. avail bugi n great wo..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor... u c alreadi say...
4,ham,"nah think goe usf, live around though"


In [6]:
# step 3:
# Splitting data into train and validation
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(Email_Data['Email'], Email_Data['Target'])

# TFIDF feature generation for a maximum of 5000 features
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
# token_pattern : string
# Regular expression denoting what constitutes a “token”, 
# only used if analyzer == 'word'. The default regexp select tokens 
# of 2 or more alphanumeric characters 
# (punctuation is completely ignored and always treated as a token separator).
# TFIDF feature generation for a maximum of 5000 features
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

tfidf_vect.fit(Email_Data['Email'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)
xtrain_tfidf.data

array([0.47787054, 0.27951112, 0.37828744, ..., 0.35897604, 0.41781432,
       0.5023328 ])

In [13]:
tfidf_vect.vocabulary_
# tfidf_vect.get
print(xtrain_tfidf)

  (0, 3929)	0.4778705383733631
  (0, 2223)	0.2795111230624288
  (0, 1384)	0.37828744113012275
  (0, 1272)	0.5376632661032151
  (0, 593)	0.5112045625829047
  (1, 1963)	0.8070812177243439
  (1, 1760)	0.5904404356042955
  (2, 4901)	0.39739368066143715
  (2, 3139)	0.4720545393636423
  (2, 2554)	0.4653531360327522
  (2, 1632)	0.6345779961683339
  (3, 4988)	0.38880812052149
  (3, 4078)	0.6099628001677578
  (3, 3881)	0.31334400620450503
  (3, 1144)	0.32183173418324934
  (3, 602)	0.5244172923124943
  (4, 4914)	0.6806103075861311
  (4, 3007)	0.7326456232091421
  (5, 4731)	0.11458785022397502
  (5, 4320)	0.18453401061303262
  (5, 3599)	0.9226700530651631
  (5, 2223)	0.09269584850499432
  (5, 2184)	0.12322059078287013
  (5, 2125)	0.14575863889528984
  (5, 1237)	0.15280108781705792
  :	:
  (4176, 544)	0.3130316493157341
  (4177, 4977)	0.30032710604240054
  (4177, 3469)	0.1971737897992139
  (4177, 3210)	0.2628910618972912
  (4177, 3018)	0.18767279619098437
  (4177, 2833)	0.17507795277237972
  (4177

In [14]:
# step 4: 
# Model training
# This is the generalized function for training any given model:

def train_model(classifier, feature_vector_train, label,feature_vector_valid, is_neural_net=False):

 # fit the training dataset on the classifier
 classifier.fit(feature_vector_train, label)
 # predict the labels on validation dataset
 predictions = classifier.predict(feature_vector_valid)
 return metrics.accuracy_score(predictions, valid_y)

# Naive Bayes trainig
accuracy = train_model(naive_bayes.MultinomialNB(alpha=0.2),xtrain_tfidf, train_y, xvalid_tfidf)
print("Accuracy: ", accuracy)

#importance of alpha in Naive_bayes technique
# alpha is a smooth-ning parameter. 
# alpha > 1  indicates laplace smooth-ning
# alpha < 1  indicates Lidstone smooth-ning

Accuracy:  0.9870782483847811


In [16]:
# trying one mor classifier, so that we can compare its performance with Naive Bayes

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(),xtrain_tfidf, train_y, xvalid_tfidf)
print ("Accuracy: ", accuracy)

Accuracy:  0.9641062455132807


You can suppress the above warning :
https://machinelearningmastery.com/how-to-fix-futurewarning-messages-in-scikit-learn/

Recommended Reading 
https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

Naive Bayes is giving better results than the linear classifier. We can try many more classifiers and then choose the best one.

Carrying Out Sentiment Analysis
--
In this section, we are going to discuss how to understand the sentiment of
a particular sentence or statement. Sentiment analysis is one of the widely
used techniques across the industries to understand the sentiments of the
customers/users around the products/services. Sentiment analysis gives
the sentiment score of a sentence/statement tending toward positive or negative.

Problem
--
You want to do a sentiment analysis.

Solution
--
The simplest way to do this by using a TextBlob or vedar library.

How It Works
--
Let’s follow the steps in this section to do sentiment analysis using TextBlob. 
It will basically give 2 metrics.

• Polarity = Polarity lies in the range of [-1,1] where 1 means a positive statement and -1 means a negative statement.

• Subjectivity = Subjectivity refers that mostly it is a public opinion and not factual information [0,1], i.e 0 means public opinion and 1 means factual information

In [21]:
# Create the sample data

review = "I like this phone. screen quality and camera clarity is really good."
review2 = "This tv is not good. Bad quality, no clarity, worst experience"

# Cleaning and preprocessing
def processRow(row):
 import re
 import nltk
 from textblob import TextBlob
 from nltk.corpus import stopwords
 from nltk.stem import PorterStemmer
 from textblob import Word
 from nltk.util import ngrams
 import re
 from nltk.tokenize import word_tokenize
 tweet = row

#Lower case
 tweet.lower()

#Removes unicode strings like "\u002c"  -> ,(comma)
 tweet = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', tweet)
    
# Removes non-ascii characters. note : \x00 to \x7f is 00 to 255
# non-ascii characters like copyrigth symbol, trademark symbol
 tweet = re.sub(r'[^\x00-\x7f]',r'',tweet)
               
#convert any url to URL
 tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
               
#Convert any @Username to "AT_USER"
 tweet = re.sub('@[^\s]+','AT_USER',tweet)

#Remove additional white spaces
 tweet = re.sub('[\s]+', ' ', tweet)
 tweet = re.sub('[\n]+', ' ', tweet)

#Remove not alphanumeric symbols white spaces
 tweet = re.sub(r'[^\w]', ' ', tweet)

#Removes hastag in front of a word """
 tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

#Replace #word with word
 tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

#Remove :( or :)
 tweet = tweet.replace(':)','')
 tweet = tweet.replace(':(','')

#remove numbers
 tweet = ''.join([i for i in tweet if not i.isdigit()])

#remove multiple exclamation
 tweet = re.sub(r"(\!)\1+", ' ', tweet)

#remove multiple question marks
 tweet = re.sub(r"(\?)\1+", ' ', tweet)

#remove multistop
 tweet = re.sub(r"(\.)\1+", ' ', tweet)

#lemma
 from textblob import Word
 tweet =" ".join([Word(word).lemmatize() for word in tweet.split()])

#stemmer
#st = PorterStemmer()
#tweet=" ".join([st.stem(word) for word in tweet.split()])
#Removes emoticons from text
 tweet = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', '', tweet)

#trim
 tweet = tweet.strip('\'"')
               
 row = tweet
 return row
               
#call the function with your data
review = processRow(review)
review2 = processRow(review2)
print(review)
print(review2)

I like this phone screen quality and camera clarity is really good
This tv is not good Bad quality no clarity worst experience


In [19]:
# Get the sentiment scores

# import libraries
from textblob import TextBlob

#TextBlob has a pre trained sentiment prediction model
blob = TextBlob(review)
blob.sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [20]:
# Again using TextBlob, over review2
blob = TextBlob(review2)
blob.sentiment

Sentiment(polarity=-0.6833333333333332, subjectivity=0.7555555555555555)