In [62]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, f1_score

# Accessing Data

In [63]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# on the drive can be opened with ordinary filesystem paths in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [64]:
import pandas as pd

# Read the dataset CSV from the mounted shared drive into a DataFrame.
# The preview shows an 'Unnamed: 0' column next to 'text' and 'label'
# (presumably an index that was written out when the CSV was saved).
csv_path = '/content/drive/Shareddrives/CIS 5190 Final Project!/new_emotions_df.csv'
df = pd.read_csv(csv_path)
print(df.head()) # preview the first rows to confirm the load worked

   Unnamed: 0                                               text  label
0           0        im sick with allergies and feeling horrible      0
1           1  i feel the music hit me in a vain attempt to k...      0
2           2  i feel terribly helpless and thus i am putting...      0
3           3  im feeling like ive missed you all this time s...      0
4           4  im finding it harder and harder every day to c...      0


# Cleaning Data


In [65]:
# Report the row count, discard rows that contain missing values, and report
# the count again so the number of dropped rows is visible in the output.
rows_before = len(df)
print(rows_before) #9000
df.dropna(inplace = True)
rows_after = len(df)
print(rows_after) #9000 -> no rows were dropped

9000
9000


In [66]:
# Work on a fixed-size random subset for all vectorization techniques to reduce the run time burden on transformers.
# There is no performance difference between a sample of 7k and the full data set (9k), so start there.
sample_size = 7000
sample_seed = 19104 # fixed seed so the same rows are drawn on every run
data_sample = df.sample(n = sample_size, random_state = sample_seed, ignore_index = True)
print(len(data_sample))

7000


In [67]:
#Separate the model inputs (the 'text' column) from the targets (the 'label' column)
texts, labels = data_sample['text'], data_sample['label']

# Import pretrained word vectors


In [68]:
import numpy as np
import gensim.downloader as api
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [69]:
#Load pre-trained Word2Vec embeddings via gensim's downloader API.
#Later cells use the returned object for membership tests (`word in embedding_model`)
#and for vector lookup (`embedding_model[word]`).
embedding_model = api.load("word2vec-google-news-300")

# Prepare text for Word2Vec embedding with SVM and Logistic Regression


In [70]:
#Inspect how long the texts are (in whitespace-separated words) to decide on a max_len.
#Note: lengths are measured on `texts` as it exists when this cell runs, i.e. before
#the stop-word removal and lemmatization applied in a later cell.
text_lengths = [len(words) for words in map(str.split, texts)]

mean_length = np.mean(text_lengths)
longest_length = np.max(text_lengths)
p95_length = np.percentile(text_lengths, 95)

print("Average length: " , mean_length)
print("Max length: ", longest_length)
print("95% percentile: " , p95_length)

#Use the 95th percentile as max_len so that most texts fit without truncation
max_len = int(p95_length)

Average length:  46.898
Max length:  1400
95% percentile:  169.0


# Prepare the text by tokenizing, embedding, and vectorizing

In [72]:
#Initialize WordNet lemmatizer
#Created once at notebook level; preprocess_text reuses this single instance for every text.
lemmatizer = WordNetLemmatizer()

In [73]:
#Preprocess texts
# Stop-word sets keyed by language, filled lazily so the corpus is read only once
_STOP_WORDS_BY_LANGUAGE = {}

def _get_stop_words(language = 'english'):
  """Return the NLTK stop-word set for `language`, loading it on first use only."""
  if language not in _STOP_WORDS_BY_LANGUAGE:
    _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
  return _STOP_WORDS_BY_LANGUAGE[language]

def preprocess_text(text):
  """Normalize one document for vectorization.

  The text is tokenized with NLTK's word_tokenize; tokens that are not purely
  alphanumeric or that are English stop words are dropped; the remaining
  tokens are lower-cased and lemmatized with the notebook-level `lemmatizer`.

  Args:
    text: the raw document as a string.

  Returns:
    A single string with the surviving lemmas joined by one space
    (an empty string when no token survives).
  """
  # Previously the stop-word set was rebuilt on every call, i.e. once per row
  # when used with Series.apply; the cached set gives the same result.
  stop_words = _get_stop_words()
  words = []
  for token in word_tokenize(text):
    lowered = token.lower() #lower-case once and reuse for both the filter and the output
    if token.isalnum() and lowered not in stop_words: #only store non-stop word text
      words.append(lowered)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words] #lemmatize words: improved accuracy by ~5%
  return ' '.join(lemmatized_words)

# Replace every raw text with its cleaned version (see preprocess_text).
# NOTE: this rebinds `texts`, so re-running the cell feeds already-cleaned text through the cleaner again.
texts = texts.apply(preprocess_text)

In [74]:
# Tokenization and sequence padding
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(texts) # learn the vocabulary from the cleaned texts
word_index = tokenizer.word_index # mapping from each word to its integer index

# Fixed sequence length, chosen from the text length distribution explored earlier.
# NOTE(review): this hard-coded 170 overrides the max_len derived from the
# 95th percentile in the length-distribution cell (169 in the recorded output).
max_len = 170

sequences = tokenizer.texts_to_sequences(texts) # each text becomes a list of word indices
data = pad_sequences(sequences, maxlen=max_len) # pad/truncate so every sequence has the same length

In [75]:
#Create embedding matrix
#Row i holds the pre-trained vector of the word whose tokenizer index is i;
#rows for words missing from the embedding model (and row 0) stay all-zero.
embedding_dim = 300
vocab_rows = len(word_index) + 1 #one row per word plus row 0, which no word maps to
embedding_matrix = np.zeros((vocab_rows, embedding_dim)) #shape: (number of words + 1, embedding dimension)
for token, row in word_index.items():
  if token not in embedding_model: #no pre-trained vector for this word: leave its row as zeros
    continue
  vector = embedding_model[token]
  if vector is not None:
    embedding_matrix[row] = vector

In [76]:
#Create feature vectors by max-pooling word vectors over each sequence
def create_feature_vectors(data, embeddig_matrix):
  """Turn padded index sequences into one fixed-size feature vector each.

  For every sequence, the embedding rows of all non-padding indices
  (index > 0) are looked up and reduced with an element-wise maximum.
  Indices whose embedding row is all-zero (e.g. words without a pre-trained
  vector) still take part in the maximum.

  Args:
    data: 2-D integer array of shape (number of sequences, sequence length),
      where 0 marks padding.
    embeddig_matrix: 2-D array of shape (vocabulary rows, embedding dimension),
      indexed by the integers in `data`. The misspelt name is kept so that
      existing keyword callers keep working.

  Returns:
    Float array of shape (number of sequences, embedding dimension). A
    sequence that contains only padding yields a zero vector.
  """
  # Use the matrix that was passed in; the previous body ignored the argument
  # and silently read the notebook-level `embedding_matrix` instead.
  matrix = np.asarray(embeddig_matrix)
  data = np.asarray(data)
  features = np.zeros((data.shape[0], matrix.shape[1])) #features shape: (number of sequences, embedding dimension)
  for i, sequence in enumerate(data): #for each sequence in the data
    token_ids = sequence[sequence > 0] #drop padding positions
    if token_ids.size:
      features[i] = matrix[token_ids].max(axis = 0) #element-wise max over the sequence's word vectors
    #otherwise the row keeps its initial zeros
  return features

# One feature row per padded sequence in `data`, with as many columns as `embedding_matrix`.
features = create_feature_vectors(data, embedding_matrix)

In [77]:
# Sanity check: the embedding matrix should have one row per vocabulary entry plus one extra row
expected_rows = len(word_index) + 1
actual_rows = len(embedding_matrix)
print(expected_rows)
print(actual_rows)

11902
11902


# Run Logistic Regression and SVM Classifiers

In [78]:
# Split data into training and testing sets
# 70% of the rows go to training and 30% to testing; random_state is fixed so the split is reproducible.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ between the two sets.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

In [79]:
from sklearn.utils.class_weight import compute_class_weight
#Automatically adjusts weights (inversely proportional to class frequencies) to balance classes
classes = np.unique(y_train) #sorted class labels actually present in the training set
weights = compute_class_weight('balanced', classes = classes, y = y_train)
#Key each weight by its class label rather than by its position in `weights`:
#positions only coincide with labels when the labels are exactly 0..k-1.
class_weights = dict(zip(classes.tolist(), weights))
#NOTE(review): the classifiers in the following cells are built without a
#class_weight argument, so this dictionary is currently not used by them.

In [80]:
#Run Logistic Regression Model
#Fit on the training features, predict the held-out set, and report accuracy and weighted F1 as percentages
log_reg = LogisticRegression(max_iter = 1000) #cap the solver at 1000 iterations
log_reg_pred = log_reg.fit(x_train, y_train).predict(x_test)

log_reg_accuracy = 100 * accuracy_score(y_test, log_reg_pred) #accuracy score
log_reg_f1 = 100 * f1_score(y_test, log_reg_pred, average = 'weighted') #f1 score, weighted by class support

print("Logistic Regression Accuracy: ", log_reg_accuracy)
print("Logistic Regression F1: ", log_reg_f1)

Logistic Regression Accuracy:  67.57142857142857
Logistic Regression F1:  67.60245254288176


In [81]:
#Run SVM Model
#Fit a default-configured SVC on the training features, predict the held-out set,
#and report accuracy and weighted F1 as percentages
svm_model = SVC()
svm_pred = svm_model.fit(x_train, y_train).predict(x_test)

svm_accuracy = 100 * accuracy_score(y_test, svm_pred) #accuracy score
svm_f1 = 100 * f1_score(y_test, svm_pred, average = 'weighted') #f1 score, weighted by class support

print("SVM Accuracy: ", svm_accuracy)
print("SVM F1: ", svm_f1)

SVM Accuracy:  70.33333333333334
SVM F1:  70.37264658963836
