In [1]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import pandas as pd

In [2]:
# This will be used for lemmatizationn
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Accessing Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv('/content/drive/Shareddrives/CIS 5190 Final Project!/new_emotions_df.csv')

In [None]:
df

Unnamed: 0.1,Unnamed: 0,text,label
0,0,im sick with allergies and feeling horrible,0
1,1,i feel the music hit me in a vain attempt to k...,0
2,2,i feel terribly helpless and thus i am putting...,0
3,3,im feeling like ive missed you all this time s...,0
4,4,im finding it harder and harder every day to c...,0
...,...,...,...
8995,995,I thought I was the only one! I’m currently go...,8
8996,996,"Sometimes it's a survival skill, health wise I...",8
8997,997,Quite dudes i hang shit on..usually it be shut...,8
8998,998,I could say I’ve been in similar situations wh...,8


# Cleaning Data


In [5]:
#Checking for duplicate values and removing them
len(df) - len(df.drop_duplicates())

0

# Tokenization and Lemmatization


In [6]:
#Tokenize text
tokenized_texts = [] #Store the tokenized version of the text
for text in df['text']: #For each tweet/ reddit comment in the data
  tokenized_text = word_tokenize(text) #tokenize the text
  tokenized_texts.append(tokenized_text) #add the tokenized text to the list

#Assign the list of tokenized text to the 'text' column, replacing the text with its tokenizations
df['text'] = tokenized_texts

In [9]:
#Lemmatization: to reduce words to their base form. This will normalize words and make it easier to perform classification tasks
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ #tag the adjectives, 'J'
tag_map['V'] = wn.VERB #tag the verbs, 'V'
tag_map['R'] = wn.ADV #tag the adverbs, 'R'

stop_words = set(stopwords.words('english')) #create a list of common English stop words
wordnet_lemmatizer = WordNetLemmatizer()
for indx, text in enumerate(df['text']):
  finalWords = []
  for word, tag in pos_tag(text):
    if word not in stop_words and word.isalpha(): #identify the non-stop word alphabetical words
      wordnet_tag = tag_map[tag[0]] #grab the tag of the word: what kind of word is it?
      finalWords.append(wordnet_lemmatizer.lemmatize(word, wordnet_tag)) #add the lemmatized version of the word to the final words array
  df.loc[indx, 'finalText'] = str(finalWords)


In [10]:
# Sample the data for all vectorization techniques to reduce run time burden on transformers
# There is no perfromance difference between a sample of 7k and the full data set (9k), so start there
data_sample = df.sample(n = 7000, random_state = 19104, ignore_index = True)
print(data_sample.shape[0])

7000


In [11]:
#Create the train/test split (70:30)
x_train, x_test, y_train, y_test = model_selection.train_test_split(data_sample['finalText'], data_sample['label'], test_size = 0.3, shuffle = True)

In [12]:
#Vectorize train and test with TF-IDF to extract text features
tfidf_vec = TfidfVectorizer(max_features = 5000) #Vectorizer will only consider top 5k terms ordered by term frequency to inform classification
x_train_tfidf = tfidf_vec.fit_transform(x_train)
x_test_tfidf = tfidf_vec.transform(x_test)

In [13]:
#Most frequent words in the vector vocabulary:
tfidf_vec.vocabulary_

{'think': 4368,
 'go': 1707,
 'even': 1354,
 'visit': 4684,
 'hour': 1937,
 'help': 1858,
 'feel': 1492,
 'passionate': 2881,
 'remind': 3442,
 'awful': 322,
 'lot': 2382,
 'people': 2902,
 'like': 2324,
 'if': 1991,
 'believe': 395,
 'base': 352,
 'whole': 4803,
 'attitude': 300,
 'worldview': 4898,
 'belief': 394,
 'around': 253,
 'idea': 1980,
 'reality': 3342,
 'create': 915,
 'sorry': 3855,
 'harsh': 1818,
 'truth': 4509,
 'the': 4333,
 'good': 1716,
 'news': 2663,
 'soon': 3846,
 'realize': 3344,
 'place': 2965,
 'mentally': 2494,
 'change': 633,
 'you': 4969,
 'victim': 4668,
 'understand': 4551,
 'struggle': 4050,
 'lonely': 2364,
 'possible': 3021,
 'whatever': 4771,
 'happen': 1803,
 'thank': 4322,
 'connect': 815,
 'others': 2796,
 'little': 2345,
 'avoid': 315,
 'life': 2317,
 'time': 4415,
 'and': 182,
 'having': 1829,
 'building': 538,
 'connection': 817,
 'require': 3464,
 'get': 1688,
 'uncomfortable': 4546,
 'doing': 1176,
 'thing': 4366,
 'use': 4625,
 'comfort': 745,

# Run Logistic Regression



In [14]:
from sklearn.linear_model import LogisticRegression
logistic_reg = LogisticRegression()
logistic_reg.fit(x_train_tfidf, y_train) #this is what the model was fitted on
predictions_LR = logistic_reg.predict(x_test_tfidf)

In [18]:
#Accuracy and F1 score for Logistic Regression
LR_accuracy = accuracy_score(predictions_LR, y_test) * 100
print("Logistic Regression accuracy score: ", LR_accuracy)

LR_F1 = f1_score(predictions_LR, y_test, average = 'weighted') * 100
print("Logistic Regression F1 Score: ",  LR_F1)

Logistic Regression accuracy score:  78.80952380952381
Logistic Regression F1 Score:  78.8334961408658


# Run SVM


In [19]:
from sklearn.svm import SVC
svm_model = SVC()
svm_model.fit(x_train_tfidf, y_train)
predictions_SVM = svm_model.predict(x_test_tfidf)

In [21]:
#Accuracy and F1 Score for SVM
SVM_accuracy = accuracy_score(predictions_SVM, y_test) * 100
SVM_f1 = f1_score(predictions_SVM, y_test, average = 'weighted') * 100
print("SVM accuracy score: ", SVM_accuracy)
print("SVM F1 scor: " , SVM_f1)

SVM accuracy score:  78.57142857142857
SVM F1 scor:  78.43955232036825
