Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Munem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

The Dataset

In [4]:
df = pd.read_csv('sentiment_analysis.csv', encoding = "ISO-8859-1")

In [5]:
df.shape

(1599999, 6)

In [6]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [7]:
df.columns = ['label','ID','Date','Query','Username','Tweet']

In [8]:
df.head()

Unnamed: 0,label,ID,Date,Query,Username,Tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [9]:
df.columns

Index(['label', 'ID', 'Date', 'Query', 'Username', 'Tweet'], dtype='object')

In [10]:
#n = len(pd.unique(df['label']))
#df["label"].replace({4: 1}, inplace=True)
n = pd.unique(df['label'])
print(n)

[0 4]


In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [12]:
df.isnull().sum()

label       0
ID          0
Date        0
Query       0
Username    0
Tweet       0
dtype: int64

In [13]:
#replace null values with empty string
df = df.fillna('')

In [14]:
from sklearn.utils import shuffle
df = shuffle(df,random_state=42)
df = df[1:800000]

In [15]:
df['label'].value_counts()

0    400210
1    399789
Name: label, dtype: int64

In [16]:
df['content'] = df['Username']+ ' ' + df['Tweet']

In [17]:
print(df['content'])

750       Young_J I'm off too bed. I gotta wake up hella...
766711    dougnawoschik I havent been able to listen to ...
285055    thireven now remembers why solving a relativel...
705995              taracollins086 Ate too much, feel sick 
379611    Portablemonkey Tried to purchase a parked doma...
                                ...                        
953018       HelloBayley Headed to bed....Good night all!  
993770    Gingerchaouat @nerekaulitz haha in november ! ...
937940    soumen08 @sanjay_ankur have libcurl, its mandr...
208691               mcraddictal @MCRsavedMilife not by me 
977400    louiseydeesy @ItsJoooosh haha probably, but i'...
Name: content, Length: 799999, dtype: object


In [18]:
X = df['content']
Y = df['label']

In [19]:
print(X)

750       Young_J I'm off too bed. I gotta wake up hella...
766711    dougnawoschik I havent been able to listen to ...
285055    thireven now remembers why solving a relativel...
705995              taracollins086 Ate too much, feel sick 
379611    Portablemonkey Tried to purchase a parked doma...
                                ...                        
953018       HelloBayley Headed to bed....Good night all!  
993770    Gingerchaouat @nerekaulitz haha in november ! ...
937940    soumen08 @sanjay_ankur have libcurl, its mandr...
208691               mcraddictal @MCRsavedMilife not by me 
977400    louiseydeesy @ItsJoooosh haha probably, but i'...
Name: content, Length: 799999, dtype: object


In [20]:
print(Y)

750       0
766711    0
285055    0
705995    0
379611    0
         ..
953018    1
993770    1
937940    1
208691    0
977400    1
Name: label, Length: 799999, dtype: int64


Stemming

In [21]:
port_stem = PorterStemmer()

In [22]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [23]:
df['content'] = df['content'].apply(stemming)

In [24]:
print(df['content'])

750        young j bed gotta wake hella earli tomorrow morn
766711     dougnawoschik havent abl listen yet speaker bust
285055    thireven rememb solv rel big equat two unknown...
705995                        taracollin ate much feel sick
379611    portablemonkey tri purchas park domain godaddi...
                                ...                        
953018                      hellobayley head bed good night
993770           gingerchaouat nerekaulitz haha novemb hope
937940    soumen sanjay ankur libcurl mandriva packag do...
208691                              mcraddict mcrsavedmilif
977400    louiseydeesi itsjoooosh haha probabl much addi...
Name: content, Length: 799999, dtype: object


In [25]:
#converting textual data to numerical data
vectorizer = TfidfVectorizer(max_features = 600)
vectorizer.fit(X)

TfidfVectorizer(max_features=600)

In [26]:
X = vectorizer.transform(X)

print(X)

  (0, 551)	0.47191504735974643
  (0, 520)	0.3512316970739986
  (0, 337)	0.3498439676370239
  (0, 206)	0.44705742621225847
  (0, 139)	0.430331741675855
  (0, 44)	0.3827478688753846
  (1, 598)	0.5425851809546581
  (1, 294)	0.5567735378512322
  (1, 0)	0.6289710239407088
  (2, 536)	0.4281870883282029
  (2, 524)	0.44699013564604173
  (2, 416)	0.4648716783653315
  (2, 373)	0.4716931807806927
  (2, 48)	0.4222031523839459
  (3, 447)	0.5009653329589469
  (3, 342)	0.42571430087214296
  (3, 168)	0.398105817024456
  (3, 30)	0.639775607269896
  (4, 528)	0.43376319793497703
  (4, 375)	0.6032394639237997
  (4, 289)	0.3421852504577767
  (4, 271)	0.5752050866006813
  (5, 310)	0.6474868020235318
  (5, 140)	0.5886874144387112
  (5, 95)	0.483950378950988
  :	:
  (799991, 318)	0.5564667663856332
  (799991, 131)	0.4244252219073529
  (799991, 4)	0.5284355794989158
  (799992, 520)	0.7252578632229483
  (799992, 356)	0.6884773284816889
  (799993, 576)	0.2777771903683744
  (799993, 409)	0.40440153882155916
  (79

In [27]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=1)

In [28]:
print(X.shape, x_train.shape, x_test.shape)

(799999, 600) (599999, 600) (200000, 600)


**Decision Tree**

In [29]:
from sklearn.tree import DecisionTreeClassifier

In [30]:
model1 = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [31]:
model1.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [32]:
x_train_predict = model1.predict(x_train)
training_data_accuracy = accuracy_score(x_train_predict, y_train)
print(training_data_accuracy)

0.9368382280637134


In [33]:
x_test_predict = model1.predict(x_test)
test_data_accuracy = accuracy_score(x_test_predict, y_test)
print(test_data_accuracy)

0.67809


In [34]:
training_precision = precision_score(y_train, x_train_predict)
print('Training data precision score = ', training_precision)
training_recall = recall_score(y_train, x_train_predict)
print('Training data recall score = ', training_recall)
training_matrix = confusion_matrix(y_train, x_train_predict)
print('Training data confusion matrix = ', training_matrix)

Training data precision score =  0.9224306149107387
Training data recall score =  0.9538190113459756
Training data confusion matrix =  [[276107  24050]
 [ 13847 285995]]


In [35]:
testing_precision = precision_score(y_test, x_test_predict)
print('Testing data precision score = ', testing_precision)
testing_recall = recall_score(y_test, x_test_predict)
print('Testing data recall score = ', testing_recall)
testing_matrix = confusion_matrix(y_test, x_test_predict)
print('Testing data confusion matrix = ', testing_matrix)

Testing data precision score =  0.6711056799484253
Testing data recall score =  0.6978298498204049
Testing data confusion matrix =  [[65872 34181]
 [30201 69746]]


**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier(criterion='entropy', random_state=0)
model2.fit(x_train, y_train)

x_test_predict = model2.predict(x_test)
test_data_accuracy = accuracy_score(x_test_predict, y_test)
print(test_data_accuracy)

In [None]:
x_train_predict = model2.predict(x_train)
training_data_accuracy = accuracy_score(x_train_predict, y_train)
print(training_data_accuracy)

In [None]:
training_precision = precision_score(y_train, x_train_predict)
print('Training data precision score = ', training_precision)
training_recall = recall_score(y_train, x_train_predict)
print('Training data recall score = ', training_recall)
training_matrix = confusion_matrix(y_train, x_train_predict)
print('Training data confusion matrix = ', training_matrix)

In [None]:
testing_precision = precision_score(y_test, x_test_predict)
print('Testing data precision score = ', testing_precision)
testing_recall = recall_score(y_test, x_test_predict)
print('Testing data recall score = ', testing_recall)
testing_matrix = confusion_matrix(y_test, x_test_predict)
print('Testing data confusion matrix = ', testing_matrix)

**Gaussian Naive Bayes**


In [None]:
from sklearn.naive_bayes import GaussianNB


In [None]:
model3 = GaussianNB()

In [None]:
x_train_nb = x_train.toarray()
model3.fit(x_train_nb,y_train)

In [None]:
x_train_predict = model3.predict(x_train_nb)
training_data_accuracy = accuracy_score(x_train_predict, y_train)
print(training_data_accuracy)

In [None]:
x_test_nb = x_test.toarray()
x_test_predict = model3.predict(x_test_nb)
test_data_accuracy = accuracy_score(x_test_predict, y_test)
print(test_data_accuracy)

In [None]:
training_precision = precision_score(y_train, x_train_predict)
print('Training data precision score = ', training_precision)
training_recall = recall_score(y_train, x_train_predict)
print('Training data recall score = ', training_recall)
training_matrix = confusion_matrix(y_train, x_train_predict)
print('Training data confusion matrix = ', training_matrix)

In [None]:
testing_precision = precision_score(y_test, x_test_predict)
print('Testing data precision score = ', testing_precision)
testing_recall = recall_score(y_test, x_test_predict)
print('Testing data recall score = ', testing_recall)
testing_matrix = confusion_matrix(y_test, x_test_predict)
print('Testing data confusion matrix = ', testing_matrix)

**Multinomial Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
model4 = MultinomialNB()

In [None]:
x_train_nb = x_train.toarray()
model4.fit(x_train_nb,y_train)

In [None]:
x_train_predict = model4.predict(x_train_nb)
training_data_accuracy = accuracy_score(x_train_predict, y_train)
print(training_data_accuracy)

In [None]:
x_test_nb = x_test.toarray()
x_test_predict = model4.predict(x_test_nb)
test_data_accuracy = accuracy_score(x_test_predict, y_test)
print(test_data_accuracy)

In [None]:
training_precision = precision_score(y_train, x_train_predict)
print('Training data precision score = ', training_precision)
training_recall = recall_score(y_train, x_train_predict)
print('Training data recall score = ', training_recall)
training_matrix = confusion_matrix(y_train, x_train_predict)
print('Training data confusion matrix = ', training_matrix)

In [None]:
testing_precision = precision_score(y_test, x_test_predict)
print('Testing data precision score = ', testing_precision)
testing_recall = recall_score(y_test, x_test_predict)
print('Testing data recall score = ', testing_recall)
testing_matrix = confusion_matrix(y_test, x_test_predict)
print('Testing data confusion matrix = ', testing_matrix)

**Support Vector Machine**

In [37]:
from sklearn.svm import LinearSVC
svm_model=LinearSVC(random_state= 0 ,max_iter=15000)
svm_model.fit(x_train, y_train)

LinearSVC(max_iter=15000, random_state=0)

In [38]:
predictionsTest = svm_model.predict(x_test)
predictionsTrain = svm_model.predict(x_train)

In [39]:
print(accuracy_score(y_test, predictionsTest))
print(accuracy_score(y_train, predictionsTrain))

0.72954
0.7298145496909162


In [40]:
training_precision = precision_score(y_train, predictionsTrain)
print('Training data precision score = ', training_precision)
training_recall = recall_score(y_train, predictionsTrain)
print('Training data recall score = ', training_recall)
training_matrix = confusion_matrix(y_train, predictionsTrain)
print('Training data confusion matrix = ', training_matrix)

Training data precision score =  0.7127822940709362
Training data recall score =  0.7693618639149953
Training data confusion matrix =  [[207201  92956]
 [ 69155 230687]]


In [41]:
testing_precision = precision_score(y_test, predictionsTest)
print('Testing data precision score = ', testing_precision)
testing_recall = recall_score(y_test, predictionsTest)
print('Testing data recall score = ', testing_recall)
testing_matrix = confusion_matrix(y_test, predictionsTest)
print('Testing data confusion matrix = ', testing_matrix)

Testing data precision score =  0.7121639753851848
Testing data recall score =  0.7700081042952766
Testing data confusion matrix =  [[68948 31105]
 [22987 76960]]


**Neural Network**

In [42]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(30,), random_state=1)
clf.fit(x_train, y_train)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(30,), random_state=1)

In [43]:
predictionsTest = clf.predict(x_test)
predictionsTrain = clf.predict(x_train)

In [44]:
print(accuracy_score(y_test, predictionsTest))
print(accuracy_score(y_train, predictionsTrain))

0.732135
0.7631262718771198


In [45]:
training_precision = precision_score(y_train, predictionsTrain)
print('Training data precision score = ', training_precision)
training_recall = recall_score(y_train, predictionsTrain)
print('Training data recall score = ', training_recall)
training_matrix = confusion_matrix(y_train, predictionsTrain)
print('Training data confusion matrix = ', training_matrix)

Training data precision score =  0.7421483492188268
Training data recall score =  0.8060611922279067
Training data confusion matrix =  [[216184  83973]
 [ 58151 241691]]


In [46]:
testing_precision = precision_score(y_test, predictionsTest)
print('Testing data precision score = ', testing_precision)
testing_recall = recall_score(y_test, predictionsTest)
print('Testing data recall score = ', testing_recall)
testing_matrix = confusion_matrix(y_test, predictionsTest)
print('Testing data confusion matrix = ', testing_matrix)

Testing data precision score =  0.7131510727877773
Testing data recall score =  0.776191381432159
Testing data confusion matrix =  [[68849 31204]
 [22369 77578]]
