Using Naive Bayes binary classification for positive or negative sentiment analysis

In [54]:
# Attach Google Drive to the Colab runtime; the dataset is kept on the drive.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import tools

In [0]:
# DataFrame for structuring and augmenting data
import pandas as pd

# Matplot for graphing
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn, for data splitting, metrics, and the NaiveBayes classifier itself
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# nltk, the Natural Language Toolkit, has many useful tools for working with words
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Tools
import re
import numpy as np
import os

In [56]:
# Sanity check: list the dataset folder to confirm the CSV is where we expect.
# The absolute path matches the mount point passed to drive.mount(), so this
# check does not depend on the kernel's current working directory.
print(os.listdir("/content/drive/My Drive/sentiment140"))

['training.1600000.processed.noemoticon.csv']


# Load the dataset and prepare the data frame

In [0]:
# Load only the two columns the model needs. Column names are supplied via
# `names` (so pandas does not treat the first row as a header) and `usecols`
# keeps just the label and the tweet text instead of loading and then dropping
# the rest. The absolute path matches the mount point passed to drive.mount(),
# so the load does not depend on the kernel's current working directory.
df = pd.read_csv(
    '/content/drive/My Drive/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding="ISO-8859-1",
    names=["target", "ids", "date", "flag", "user", "text"],
    usecols=["target", "text"],
)

# Rescale the labels by 1/4 (vectorized) so the rows selected below compare
# equal to 1.0 / 0.0.
# NOTE(review): presumably the raw labels are 0 (negative) and 4 (positive) - confirm.
df['target'] = df['target'] / 4

# Checking values: total of 1.6 million, half are positive, the other half negative.
# Keep only rows labelled 1.0 or 0.0, positives first.
positive = df[df['target'] == 1.0]
negative = df[df['target'] == 0.0]
df = pd.concat([positive, negative])
print(df.shape, df['target'].unique().shape, df[df['target'] == 1].shape)

# Filter the data: clean out unwanted tokens

In [59]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords') #download the common stopwords list, ex: of,and,the,a, etc...
stemmer=PorterStemmer()    #Stemmer attempts to take a word and classify it with
def filtr(st):
    st=st.lower()
    rs=[x for x in st.split() if x not in stopwords.words("english")]
    rs=[x for x in rs if (x[0]!='@' and x[:5]!='http')]
    rs=[stemmer.stem(word=x) for x in rs]
    return ' '.join(rs)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Split the data and vectorize

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 10% of the tweets for evaluation. The split is shuffled, so a fixed
# seed is supplied to make it (and the metrics computed from it) reproducible
# on a fresh kernel.
SPLIT_SEED = 42
xtrain, xtest, ytrain, ytest = train_test_split(
    df['text'],
    df['target'],
    test_size=0.1,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [61]:
# Turn each tweet into TF-IDF (term frequency - inverse document frequency)
# features, which score a word by how often it occurs.
tv = TfidfVectorizer(
    stop_words='english',
    max_features=10000,   # cap the feature count; n-grams can produce a lot of features
    ngram_range=(1, 2),   # single words and two-word sequences
)
tv.fit(xtrain)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

# Train the model using Naive Bayes for binary classification

In [0]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC

# Multinomial Naive Bayes with smoothing alpha=1.5 and no learned class priors
# (fit_prior=False). alpha is passed by keyword: MultinomialNB's constructor
# parameters are keyword-only in current scikit-learn, so the positional form
# MultinomialNB(1.5, ...) raises a TypeError there.
nb = MultinomialNB(alpha=1.5, fit_prior=False)

xtrain1 = tv.transform(xtrain)  # vectorize the training text into the form the classifier accepts
nb.fit(xtrain1, ytrain)

# Predictions on the held-out tweets, vectorized with the same fitted `tv`.
pred1 = nb.predict(tv.transform(xtest))

Check the accuracy score and the confusion matrix to get insight into validation performance

In [65]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Fraction of held-out tweets classified correctly.
print(accuracy_score(ytest, pred1))

# Confusion matrix, computed once and reused for both the printout and the
# unpacked counts.
cm = confusion_matrix(ytest, pred1)
print(cm)

# true negative, false positive, false negative, true positive
tn, fp, fn, tp = cm.ravel()
(tn, fp, fn, tp)

0.75811875
[[61164 18956]
 [19745 60135]]


In [0]:
#use this to test predictions
def result(x):
    sentiment = "0" # 0 is negative, 1 is positive
    x=filtr(x)
    return nb.predict(tv.transform([x])) 


Test predictions

In [71]:
# 0 is negative, 1 is positive
simple_examples = [
    'This ipad is bad',
    'This ipad is not good',
    'This ipad is perfect',
    'This ipad is not very good',
    'This ipad is not terrible',
]
for sentence in simple_examples:
    print(result(sentence))

# A longer sentence, printed with a label.
complex_example = 'This movie was actually neither that funny, nor super witty.'
print("Complex example: ", result(complex_example))

[0.]
[1.]
[1.]
[1.]
[0.]
Example:  [1.]
