In [1]:
import nltk
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from matplotlib import cm
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khan_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khan_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khan_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv('data/cleaned_data.csv')
df.head()

Unnamed: 0,tweet,product,emotion
0,wesley i have a g iphone after hrs tweeting a...,iPhone,Negative emotion
1,jessedee know about fludapp awesome ipadiphon...,iPad or iPhone App,Positive emotion
2,swonderlin can not wait for ipad also they sh...,iPad,Positive emotion
3,sxsw i hope this years festival isnt as crashy...,iPad or iPhone App,Negative emotion
4,sxtxstate great stuff on fri sxsw marissa maye...,Google,Positive emotion


# Prepare Data

In [3]:
df = df.drop(columns = ['product'])

In [4]:
# Dropping all columns with the 'I can't tell' emotion
to_drop = df[df['emotion'] == "I can't tell"]
index_to_drop = list(to_drop.index)
df.drop(df.index[index_to_drop], inplace=True)

In [5]:
# Making sure all the there are only three emotions present
df['emotion'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
Name: emotion, dtype: int64

In [6]:
# Setting data and target values
data = df['tweet']
target = df['emotion']

In [7]:
import string, re
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
## setting stopwords and punctuations
sw_list = stopwords.words('english')
sw_list += list(string.punctuation)
sw_list += ["''", '""', '...', '``', '’', '“', '’', '”', '‘', '‘', '©',
            'said', 'one', 'com', 'satirewire', '-', '–', '—', 'satirewire.com']
sw_set = set(sw_list)

In [9]:
# we can define a function that removes stopwords 
def process_article(article):
    tokens = nltk.word_tokenize(article)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in sw_set]
    return stopwords_removed

In [10]:
# applying the above function to our data/features 
processed_data = list(map(process_article, data))

In [11]:
total_vocab = set()
for comment in processed_data:
    total_vocab.update(comment)
len(total_vocab)

10185

In [12]:
# Lemmatizing words

from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

# creating a list with all lemmatized outputs
lemmatized_output = []

for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)

In [13]:
X_lem = lemmatized_output
# [' '.join(d) for d in processed_data]

y_lem = target
# X[3]

# CountVectorize

In [14]:
# Initiating train test split
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X_lem, y_lem, test_size=0.20, random_state=1)

In [15]:
# Define Random Forest Classifier
rf_classifier_lem = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1, verbose=2)

In [16]:
# Convert a collection of text documents to a matrix of token counts
vec = CountVectorizer(stop_words = stopwords.words('english'))

X_train_data = vec.fit_transform(X_train_lem)

X_train_data = pd.DataFrame(X_train_data.toarray(), columns = vec.get_feature_names())

X_train_data

Unnamed: 0,aapl,aaron,aarpbulletin,ab,abacus,abandoned,abc,ability,able,abnormal,...,zms,zomb,zombie,zomg,zone,zoom,zuckerberg,zuckerberglink,zynga,zzzs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7145,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
X_test_data = vec.transform(X_test_lem)
X_test_data = pd.DataFrame(X_test_data.toarray(), columns = vec.get_feature_names())
X_test_data

Unnamed: 0,aapl,aaron,aarpbulletin,ab,abacus,abandoned,abc,ability,able,abnormal,...,zms,zomb,zombie,zomg,zone,zoom,zuckerberg,zuckerberglink,zynga,zzzs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1785,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Predict test and train data
rf_classifier_lem.fit(X_train_data, y_train_lem)

rf_test_preds_lem = rf_classifier_lem.predict(X_test_data)
rf_train_preds_lem = rf_classifier_lem.predict(X_train_data)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   22.3s


building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.0s finished


In [19]:
# Print accuracy and f1 scores
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rf_acc_score_lem = accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = f1_score(y_test_lem, rf_test_preds_lem, average='micro')
print('Random Forest with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem))
print()
print("F1 Score: {:.4}".format(rf_f1_score_lem))

Random Forest with Lemmatization Features
Testing Accuracy: 0.6924

F1 Score: 0.6924


In [20]:
# Printing a confusion matrix of Random Forest model
print('Evaluations for test:\n', confusion_matrix(y_test_lem, rf_test_preds_lem))
print(classification_report(y_test_lem, rf_test_preds_lem))
print('\n')
print('Evaluations for train:\n',confusion_matrix(y_train_lem, rf_train_preds_lem))
print(classification_report(y_train_lem, rf_train_preds_lem))

Evaluations for test:
 [[ 25  78  10]
 [  8 974 102]
 [  3 349 239]]
                                    precision    recall  f1-score   support

                  Negative emotion       0.69      0.22      0.34       113
No emotion toward brand or product       0.70      0.90      0.78      1084
                  Positive emotion       0.68      0.40      0.51       591

                          accuracy                           0.69      1788
                         macro avg       0.69      0.51      0.54      1788
                      weighted avg       0.69      0.69      0.66      1788



Evaluations for train:
 [[ 456    1    0]
 [   2 4299    3]
 [   0   33 2354]]
                                    precision    recall  f1-score   support

                  Negative emotion       1.00      1.00      1.00       457
No emotion toward brand or product       0.99      1.00      1.00      4304
                  Positive emotion       1.00      0.99      0.99      2387

         

# TF-IDF

In [21]:
# Fitting the tf-idf model
tfidf = TfidfVectorizer()

tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

tfidf_data_train_lem

<7148x8529 sparse matrix of type '<class 'numpy.float64'>'
	with 81001 stored elements in Compressed Sparse Row format>

In [22]:
non_zero_cols = tfidf_data_train_lem.nnz / float(tfidf_data_train_lem.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tfidf_data_train_lem.shape[1]))
print('Percentage of columns containing ZERO: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 11.331980973698936
Percentage of columns containing ZERO: 0.9986713587790246


# Random Forest

In [23]:
# Fitting to a Random Forest Classifer
rf_classifier_lem.fit(tfidf_data_train_lem, y_train_lem)

rf_test_preds_lem = rf_classifier_lem.predict(tfidf_data_test_lem)
rf_train_preds_lem = rf_classifier_lem.predict(tfidf_data_train_lem)

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    1.0s


building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


In [24]:
# Printing accuracy and f1 scores
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rf_acc_score_lem = accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = f1_score(y_test_lem, rf_test_preds_lem, average='micro')
print('Random Forest with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem))
print()
print("F1 Score: {:.4}".format(rf_f1_score_lem))

Random Forest with Lemmatization Features
Testing Accuracy: 0.6879

F1 Score: 0.6879


In [25]:
# Printing a confusion matrix of Random Forest model
print('Evaluations for test:\n', confusion_matrix(y_test_lem, rf_test_preds_lem))
print(classification_report(y_test_lem, rf_test_preds_lem))
print('\n')
print('Evaluations for train:\n',confusion_matrix(y_train_lem, rf_train_preds_lem))
print(classification_report(y_train_lem, rf_train_preds_lem))

Evaluations for test:
 [[ 19  85   9]
 [  6 965 113]
 [  4 341 246]]
                                    precision    recall  f1-score   support

                  Negative emotion       0.66      0.17      0.27       113
No emotion toward brand or product       0.69      0.89      0.78      1084
                  Positive emotion       0.67      0.42      0.51       591

                          accuracy                           0.69      1788
                         macro avg       0.67      0.49      0.52      1788
                      weighted avg       0.68      0.69      0.66      1788



Evaluations for train:
 [[ 456    1    0]
 [   2 4298    4]
 [   0   32 2355]]
                                    precision    recall  f1-score   support

                  Negative emotion       1.00      1.00      1.00       457
No emotion toward brand or product       0.99      1.00      1.00      4304
                  Positive emotion       1.00      0.99      0.99      2387

         