In [None]:
import nltk
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from matplotlib import cm
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

# Fetch every NLTK resource the notebook uses:
#   - 'stopwords' for the stopword list,
#   - 'punkt' / 'punkt_tab' for nltk.word_tokenize (newer NLTK releases load
#     'punkt_tab' and raise LookupError if only 'punkt' is present),
#   - 'wordnet' / 'omw-1.4' for WordNetLemmatizer (some NLTK releases also
#     require 'omw-1.4' — harmless to fetch where it is not needed).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# Load the cleaned dataset from disk (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv('data/cleaned_data.csv')
df.head()

# Prepare Data

In [None]:
# Remove the 'product' column; only 'tweet' and 'emotion' are used below.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns=['product'], errors='ignore')

In [None]:
# Dropping all rows labelled with the "I can't tell" emotion.
# The rows are removed by index *label*. Routing the labels through
# df.index[...] would reinterpret them as positions, which only selects the
# right rows while the frame still has an untouched 0..n-1 index.
to_drop = df[df['emotion'] == "I can't tell"]
index_to_drop = list(to_drop.index)
df = df.drop(index=index_to_drop)

In [None]:
# Making sure only the three expected emotions are still present
df['emotion'].value_counts()

In [None]:
# Model inputs are the tweet texts; labels are the emotion column
data, target = df['tweet'], df['emotion']

In [None]:
import re
import string

# Show the ASCII punctuation characters (they are added to the stopword list)
string.punctuation

In [None]:
## Stopwords: NLTK's English list, every ASCII punctuation character, and a
## handful of extra quote/dash tokens and words.
# NOTE(review): the 'satirewire' entries look carried over from another
# corpus — confirm they are still wanted for tweets.
sw_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``', '’', '“', '’', '”', '‘', '‘', '©',
    'said', 'one', 'com', 'satirewire', '-', '–', '—', 'satirewire.com',
]
# A set gives constant-time membership checks during token filtering
sw_set = set(sw_list)

In [None]:
# Tokenizer helper: lowercases tokens and strips stopwords/punctuation
def process_article(article):
    """Tokenize ``article`` and return its lowercased tokens without stopwords.

    Parameters
    ----------
    article : str
        Raw text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        Lowercased tokens, in original order, excluding any token whose
        lowercase form is in the module-level ``sw_set``.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(article):
        lowered = raw_token.lower()
        if lowered not in sw_set:
            kept_tokens.append(lowered)
    return kept_tokens

In [None]:
# Run process_article over every tweet to get one token list per tweet
processed_data = [process_article(tweet) for tweet in data]

In [None]:
# Union of all token lists = the distinct vocabulary after filtering;
# the cell displays its size.
total_vocab = set().union(*processed_data)
len(total_vocab)

In [None]:
# Lemmatizing words

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# One space-joined string of lemmas per tweet, in the same order as
# processed_data. lemmatize() is called without a part-of-speech argument,
# so the lemmatizer's default applies to every token.
lemmatized_output = [
    ' '.join(map(lemmatizer.lemmatize, token_list))
    for token_list in processed_data
]

In [None]:
# Features: the lemmatized, space-joined tweet strings
X_lem = lemmatized_output
# Non-lemmatized alternative: [' '.join(d) for d in processed_data]

# Labels: the emotion column selected earlier
y_lem = target

# CountVectorize

In [None]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle
# so the split is reproducible.
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(
    X_lem,
    y_lem,
    test_size=0.20,
    random_state=1,
)

In [None]:
# Random forest used for both feature representations below:
# 100 trees, fixed seed, all CPU cores, per-tree progress logging.
rf_classifier_lem = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Convert a collection of text documents to a matrix of token counts.
# The vocabulary is learned from the training texts only.
vec = CountVectorizer(stop_words = stopwords.words('english'))

X_train_data = vec.fit_transform(X_train_lem)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older installs.
if hasattr(vec, 'get_feature_names_out'):
    train_feature_names = vec.get_feature_names_out()
else:
    train_feature_names = vec.get_feature_names()

# Dense frame with one column per vocabulary term (memory grows with
# n_samples * vocabulary size).
X_train_data = pd.DataFrame(X_train_data.toarray(), columns=train_feature_names)

X_train_data

In [None]:
# Encode the test texts with the vocabulary already fitted on the training set
X_test_data = vec.transform(X_test_lem)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older installs.
if hasattr(vec, 'get_feature_names_out'):
    test_feature_names = vec.get_feature_names_out()
else:
    test_feature_names = vec.get_feature_names()

X_test_data = pd.DataFrame(X_test_data.toarray(), columns=test_feature_names)
X_test_data

In [None]:
# Fit the forest on the count features, then predict both splits.
# Train-set predictions are kept so train and test reports can be compared.
rf_classifier_lem.fit(X_train_data, y_train_lem)

rf_test_preds_lem = rf_classifier_lem.predict(X_test_data)
rf_train_preds_lem = rf_classifier_lem.predict(X_train_data)

In [None]:
# Report test-set accuracy and micro-averaged F1 for the current predictions.
# NOTE(review): with one label per sample, micro-averaged F1 equals accuracy,
# so these two figures should match — consider average='macro' or 'weighted'
# if a distinct F1 number is wanted.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rf_acc_score_lem = accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = f1_score(y_test_lem, rf_test_preds_lem, average='micro')

print('Random Forest with Lemmatization Features')
# end='\n\n' leaves one blank line between the two scores
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem), end='\n\n')
print("F1 Score: {:.4}".format(rf_f1_score_lem))

In [None]:
# Confusion matrix and per-class report for the test split, followed by the
# same pair for the train split so the two can be compared side by side.
print('Evaluations for test:\n', confusion_matrix(y_test_lem, rf_test_preds_lem))
print(classification_report(y_test_lem, rf_test_preds_lem))
print('\n')
print('Evaluations for train:\n',confusion_matrix(y_train_lem, rf_train_preds_lem))
print(classification_report(y_train_lem, rf_train_preds_lem))

# TF-IDF

In [None]:
# Fitting the tf-idf model: vocabulary and weights are learned from the
# training texts only, then the fitted vectorizer is applied to the test texts.
tfidf = TfidfVectorizer()

tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

# Display the training matrix
tfidf_data_train_lem

In [None]:
# Sparsity summary of the training TF-IDF matrix.
n_rows, n_cols = tfidf_data_train_lem.shape

# Average number of stored entries per row (i.e. per vectorized text)
non_zero_cols = tfidf_data_train_lem.nnz / float(n_rows)
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

# Average share of each row that is zero; this is a 0-1 fraction even though
# the printed label says "Percentage".
percent_sparse = 1 - (non_zero_cols / float(n_cols))
print('Percentage of columns containing ZERO: {}'.format(percent_sparse))

In [None]:
# Fitting the Random Forest Classifier on the TF-IDF features.
# This re-fits the same rf_classifier_lem object, so the earlier count-vector
# fit is replaced, and rf_test_preds_lem / rf_train_preds_lem are overwritten.
rf_classifier_lem.fit(tfidf_data_train_lem, y_train_lem)

rf_test_preds_lem = rf_classifier_lem.predict(tfidf_data_test_lem)
rf_train_preds_lem = rf_classifier_lem.predict(tfidf_data_train_lem)

In [None]:
# Report test-set accuracy and micro-averaged F1 for the current predictions.
# NOTE(review): with one label per sample, micro-averaged F1 equals accuracy,
# so these two figures should match — consider average='macro' or 'weighted'
# if a distinct F1 number is wanted.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rf_acc_score_lem = accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = f1_score(y_test_lem, rf_test_preds_lem, average='micro')

print('Random Forest with Lemmatization Features')
# end='\n\n' leaves one blank line between the two scores
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem), end='\n\n')
print("F1 Score: {:.4}".format(rf_f1_score_lem))

In [None]:
# Confusion matrix and per-class report for the test split, followed by the
# same pair for the train split so the two can be compared side by side.
print('Evaluations for test:\n', confusion_matrix(y_test_lem, rf_test_preds_lem))
print(classification_report(y_test_lem, rf_test_preds_lem))
print('\n')
print('Evaluations for train:\n',confusion_matrix(y_train_lem, rf_train_preds_lem))
print(classification_report(y_train_lem, rf_train_preds_lem))