In [1]:
# e.g. if using google colab import drive, uncomment lines below
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import os
import re
import math
import string
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LinearRegression as sk_OLS
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, DBSCAN
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import MultinomialNB

from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, accuracy_score, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from collections import Counter

import warnings
warnings.filterwarnings('ignore')

# Loading Datasets

In [3]:
TRAIN_ORIGINAL = pd.read_csv("./data/train.csv")
VALIDATION_ORIGINAL= pd.read_csv("/Users/jeongwoochoi/Desktop/Cornell/Fall/5750_AML/midterm_project/data/val.csv")
TEST_ORIGINAL = pd.read_csv("/Users/jeongwoochoi/Desktop/Cornell/Fall/5750_AML/midterm_project/data/test.csv")

In [4]:
TRAIN_ORIGINAL['Sentiment'].value_counts()

Sentiment
-100    65545
 2      22418
 3       9238
 1       7544
 4       2587
 0       1910
Name: count, dtype: int64

# Preprocessing steps

In [5]:
def clean_data(text: str):
    """
    Take in a text review and return the cleaned version
    """
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)

    #***********added substitutions***********
    # Converting to Lowercase
    texter = texter.lower()    
    # remove all the special characters
    texter = re.sub(r'\W', ' ', texter)
    # remove all single characters
    texter = re.sub(r'\s+[a-zA-Z]\s+', ' ', texter)
    # Remove single characters from the start
    texter = re.sub(r'\^[a-zA-Z]\s+', ' ', texter)
    # Remove numbers
    texter = re.sub(r'\d+', ' ', texter)
    # Remove punctuation
    texter = re.sub(r'[^\w\s]', ' ', texter)
    # Remove parentheses
    texter = re.sub(r'\([^)]*\)', ' ', texter)
    # Remove single quotes
    texter = re.sub(r'\'', ' ', texter)
    # Substituting multiple spaces with single space
    texter = re.sub(r'\s+', ' ', texter, flags=re.I)

    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
        
    return texter

## Bag of words

In [6]:
# Get the cleaned_phrases
cleaned_phrases_bow = TRAIN_ORIGINAL['Phrase'].apply(clean_data)
# cleaned_phrases = pre_process(cleaned_phrases)
train_data_cleaned_bow = pd.DataFrame({'Phrase': cleaned_phrases_bow, 'Sentiment': TRAIN_ORIGINAL['Sentiment']})
train_data_cleaned_bow

bag_of_words_vectorizer = CountVectorizer(max_features= 10000)
bag_of_words_vectorizer.fit(train_data_cleaned_bow['Phrase'])

## N-grams

In [7]:
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_for_ngrams(text: str):
    # Clean data 
    text = clean_data(text)
    
    # Tokenization
    tokens = word_tokenize(text)
        
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]    
    
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    
    return ' '.join(filtered_tokens)

# Get the cleaned_phrases
cleaned_phrases = TRAIN_ORIGINAL['Phrase'].apply(preprocess_for_ngrams)
train_data_ngrams = pd.DataFrame({'Phrase': cleaned_phrases, 'Sentiment': TRAIN_ORIGINAL['Sentiment']})
train_data_ngrams

Unnamed: 0,Phrase,Sentiment
0,every taste often funny collegiate gross comed...,-100
1,bunch good actor flailing around caper neither...,-100
2,vietnam picture,-100
3,fincher,2
4,pitiful directing,1
...,...,...
109237,trademark villain,-100
109238,earn share holiday box office pie although mov...,-100
109239,moving tale love destruction unexpected place,-100
109240,love reading poetry mean check,3


In [8]:
ngrams_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=100, max_features=10000)
ngrams_vectorizer.fit(train_data_ngrams['Phrase'])

## GloVe Embeddings

In [9]:
glove = {}
with open("./data/glove.6B.300d.txt", 'r', encoding='utf-8') as f: # if 'r' fails with unicode error, please use 'rb'
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        glove[word] = vector

In [10]:
def get_glove_embedding(text):
    glove_embedding = np.zeros(300)
    
    num_valid_words = 0
    words = text.split()
    for word in words:
        if word in glove:
            glove_embedding += glove[word]
            num_valid_words += 1
    
    if num_valid_words > 0:
        glove_embedding /= num_valid_words
    return glove_embedding

def preprocess_for_glove_embeddings(text: str):
    text = clean_data(text)
    return get_glove_embedding(text)

# Part 1: Labeling data with Unsupervised Learning Algorithms

## 1.1: Training K-means with Bag of Words, N-grams, GloVe

In [11]:
# K-Means + BoWs
X_BoWs = bag_of_words_vectorizer.transform(train_data_cleaned_bow['Phrase'])
kmeans_bag_of_words = KMeans(n_clusters=5)
kmeans_bag_of_words.fit(X_BoWs)

In [20]:
# K-Means + ngrams
X_ngram = ngrams_vectorizer.transform(train_data_ngrams['Phrase'])

# Initialize the centroids based on the labeled data
kmeans_ngram = KMeans(n_clusters=5)
kmeans_ngram.fit(X_ngram)

Counter(kmeans_ngram.labels_)

Counter({1: 100943, 0: 4997, 2: 1881, 4: 1397, 3: 24})

In [13]:
# K-means + GloVe
train_data_glove = [preprocess_for_glove_embeddings(phrase) for phrase in TRAIN_ORIGINAL['Phrase']] 

kmeans_glove = KMeans(n_clusters=5)
kmeans_glove.fit(train_data_glove)
Counter(kmeans_glove.labels_)

Counter({2: 37790, 3: 26239, 1: 25481, 4: 10352, 0: 9380})

### Fill in the missing labels for K-Means + BoWs

In [14]:
# Find the most frequent sentiment in each cluster
# cluster_map_for_kmeans_bows is a dict where the key is the cluster id and the value is the most frequent label in this cluster id
cluster_map_for_kmeans_bows = {}

for cluster_i in range(5):
    indices = np.where(kmeans_bag_of_words.labels_ == cluster_i)[0]
    data_in_cluster_i = train_data_cleaned_bow.iloc[indices]

    labels_in_cluster_i = data_in_cluster_i[data_in_cluster_i['Sentiment'] != -100]
    cluster_map_for_kmeans_bows[cluster_i] = labels_in_cluster_i['Sentiment'].mode().iloc[0]

print(cluster_map_for_kmeans_bows)

# Filling in the missing labels
train_data_kmeans_bows = train_data_cleaned_bow.copy()

predictions = [cluster_map_for_kmeans_bows[cluster_id] for cluster_id in kmeans_bag_of_words.labels_[train_data_kmeans_bows['Sentiment'] == -100]]
train_data_kmeans_bows.loc[train_data_kmeans_bows['Sentiment'] == -100, 'Sentiment'] = predictions

train_data_kmeans_bows['Sentiment'].value_counts()

{0: 2, 1: 2, 2: 2, 3: 2, 4: 2}


Sentiment
2    87963
3     9238
1     7544
4     2587
0     1910
Name: count, dtype: int64

### Fill in the missing labels for K-Means + N-Gram

In [15]:
# Find the most frequent sentiment in each cluster
# cluster_map_for_kmeans_ngram is a dict where the key is the cluster id and the value is the most frequent label in this cluster id
cluster_map_for_kmeans_ngram = {}
train_data_kmeans_ngrams = train_data_ngrams.copy()

for cluster_i in range(5):
    indices = np.where(kmeans_ngram.labels_ == cluster_i)[0]
    data_in_cluster_i = train_data_kmeans_ngrams.iloc[indices]
    
    labels_in_cluster_i = data_in_cluster_i[data_in_cluster_i['Sentiment'] != -100]
    cluster_map_for_kmeans_ngram[cluster_i] = int(labels_in_cluster_i['Sentiment'].mode().iloc[0])

print(cluster_map_for_kmeans_ngram)

predictions = [cluster_map_for_kmeans_ngram[cluster_id] for cluster_id in \
               kmeans_ngram.labels_[train_data_kmeans_ngrams['Sentiment'] == -100]]
train_data_kmeans_ngrams.loc[train_data_kmeans_ngrams['Sentiment'] == -100, 'Sentiment'] = predictions

train_data_kmeans_ngrams['Sentiment'].value_counts()

{0: 3, 1: 2, 2: 2, 3: 2, 4: 2}


Sentiment
2    87037
3    10164
1     7544
4     2587
0     1910
Name: count, dtype: int64

### Fill in the missing labels for K-Means + GloVe

In [16]:
# Find the most frequent sentiment in each cluster
# cluster_map_for_kmeans_ngram is a dict where the key is the cluster id and the value is the most frequent label in this cluster id
cluster_map_for_kmeans_glove = {}
train_data_kmeans_glove = TRAIN_ORIGINAL.copy()

for cluster_i in range(5):
    indices = np.where(kmeans_glove.labels_ == cluster_i)[0]
    data_in_cluster_i = train_data_kmeans_glove.iloc[indices]
    
    labels_in_cluster_i = data_in_cluster_i[data_in_cluster_i['Sentiment'] != -100]
    cluster_map_for_kmeans_glove[cluster_i] = int(labels_in_cluster_i['Sentiment'].mode().iloc[0])

print(cluster_map_for_kmeans_glove)

predictions = [cluster_map_for_kmeans_glove[cluster_id] for cluster_id in \
               kmeans_glove.labels_[train_data_kmeans_glove['Sentiment'] == -100]]
train_data_kmeans_glove.loc[train_data_kmeans_glove['Sentiment'] == -100, 'Sentiment'] = predictions

train_data_kmeans_glove['Sentiment'].value_counts()

{0: 2, 1: 2, 2: 2, 3: 2, 4: 2}


Sentiment
2    87963
3     9238
1     7544
4     2587
0     1910
Name: count, dtype: int64

## 1.2: Training GMM with Bag of words, N-Grams, and GloVe

In [17]:
# GMM + BoWs
# GMM_bag_of_words = GaussianMixture(n_components = 5, random_state=42, max_iter = 1, init_params='random')
# GMM_bag_of_words.fit(X_BoWs.toarray())
# clusters = GMM_bag_of_words.fit_predict(GMM_bag_of_words)

In [18]:
# GMM + N_grams
X_ngrams = ngrams_vectorizer.transform(train_data_ngrams['Phrase']).toarray()
gmm_ngram = GaussianMixture(n_components=5, random_state=42, max_iter=50)
clusters = gmm_ngram.fit_predict(X_ngrams)
Counter(clusters)

Counter({2: 33228, 3: 32718, 0: 16463, 4: 14765, 1: 12068})

In [19]:
# GMM + GloVe
train_data_glove = [preprocess_for_glove_embeddings(phrase) for phrase in TRAIN_ORIGINAL['Phrase']] 

gmm_glove = GaussianMixture(n_components=5, random_state=66, max_iter=66)
clusters = gmm_glove.fit_predict(train_data_glove)

Counter(clusters)

Counter({3: 32912, 1: 24240, 4: 22904, 0: 15647, 2: 13539})

### Fill in the missing labels for GMM + N-grams

In [21]:
train_data_gmm_ngrams = train_data_ngrams.copy()

for cluster_id in np.unique(clusters):
    
    # Find indices of points in this cluster
    cluster_indices = np.where(clusters == cluster_id)[0]
    
    # Find original labels for these points
    data_in_cluster_id = train_data_gmm_ngrams.iloc[cluster_indices]
    
    # Filter out the unlabeled (-100) labels
    labeled_data_in_cluster_id = data_in_cluster_id[data_in_cluster_id['Sentiment'] != -100]
    
    # Find the most common label
    most_common_label = labeled_data_in_cluster_id['Sentiment'].mode().iloc[0]
    
    unlabeld_data_in_cluster_id = data_in_cluster_id[data_in_cluster_id['Sentiment'] == -100].index
    train_data_gmm_ngrams.loc[unlabeld_data_in_cluster_id, 'Sentiment'] = most_common_label
    
train_data_gmm_ngrams['Sentiment'].value_counts()

Sentiment
2    87963
3     9238
1     7544
4     2587
0     1910
Name: count, dtype: int64

### Fill in the missing labels for GMM + GloVe

In [22]:
train_data_gmm_glove = TRAIN_ORIGINAL.copy()

for cluster_id in np.unique(clusters):
    
    # Find indices of points in this cluster
    cluster_indices = np.where(clusters == cluster_id)[0]
    
    # Find original labels for these points
    data_in_cluster_id = train_data_gmm_glove.iloc[cluster_indices]
    
    # Filter out the unlabeled (-100) labels
    labeled_data_in_cluster_id = data_in_cluster_id[data_in_cluster_id['Sentiment'] != -100]
    
    # Find the most common label
    most_common_label = labeled_data_in_cluster_id['Sentiment'].mode().iloc[0]
    
    unlabeld_data_in_cluster_id = data_in_cluster_id[data_in_cluster_id['Sentiment'] == -100].index
    train_data_gmm_glove.loc[unlabeld_data_in_cluster_id, 'Sentiment'] = most_common_label
    
train_data_gmm_glove['Sentiment'].value_counts()

Sentiment
2    87963
3     9238
1     7544
4     2587
0     1910
Name: count, dtype: int64

# Part 2: Evaluating the predicted labels with Supervised Learning Algorithms

## 2.1 Evaluating with Multinomial NB: 

### Multinomial NB + KMeans + N-Grams

In [23]:
# Model training
X_kmeans_ngrams = X_ngrams
y_kmeans_ngrams = train_data_kmeans_ngrams['Sentiment']

naive_bayes = MultinomialNB()
naive_bayes.fit(X_kmeans_ngrams, y_kmeans_ngrams)

# Validation
X_validation_ngrams = ngrams_vectorizer.transform(VALIDATION_ORIGINAL['Phrase'])
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_validation_predictions = naive_bayes.predict(X_validation_ngrams)
print("Accuracy and report on validation data:", accuracy_score(y_validation, y_validation_predictions))
print(classification_report(y_validation, y_validation_predictions))

# Testing predictions
X_testing_ngrams = ngrams_vectorizer.transform(TEST_ORIGINAL['Phrase'])
y_testing_predictions = naive_bayes.predict(X_testing_ngrams)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_testing_predictions})
# test_predictions.to_csv('./dataset/naive_bayes-kmeans-ngrams.csv', index=False)

Accuracy and report on validation data: 0.5126660686060917
              precision    recall  f1-score   support

           0       0.76      0.02      0.04      1058
           1       0.64      0.00      0.01      4089
           2       0.51      0.99      0.68     11847
           3       0.49      0.03      0.06      4992
           4       0.73      0.01      0.03      1423

    accuracy                           0.51     23409
   macro avg       0.63      0.21      0.16     23409
weighted avg       0.55      0.51      0.36     23409



### Multinomial NB + KMeans + GloVe

In [47]:
# Find the minimum value in the training GloVe embeddings
min_value = np.min(train_data_glove)

# Make sure all values are non-negative
train_data_glove_positive = train_data_glove + abs(min_value)

# Train a Multinomial Naive Bayes model
naive_bayes_glove = MultinomialNB()
naive_bayes_glove.fit(train_data_glove_positive, train_data_kmeans_glove['Sentiment'])

# Transform validation and test data similarly
X_validation_glove = [preprocess_for_glove_embeddings(text) for text in VALIDATION_ORIGINAL['Phrase']]
X_validation_glove_positive = np.array(X_validation_glove) + abs(min_value)

y_validation = VALIDATION_ORIGINAL['Sentiment']
y_pred = naive_bayes_glove.predict(X_validation_glove_positive)

print("Accuracy and report:", accuracy_score(y_pred, y_validation))
print(classification_report(y_pred, y_validation))

# Test predictions
X_test_glove = [preprocess_for_glove_embeddings(text) for text in TEST_ORIGINAL['Phrase']]
X_test_glove_positive = np.array(X_test_glove) + abs(min_value)
y_test_glove = naive_bayes_glove.predict(X_test_glove_positive)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_test_glove})
# test_predictions.to_csv('./dataset/naive_bayes-kmeans-glove-positive.csv', index=False)

Accuracy and report: 0.5060874022811739
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      0.51      0.67     23409
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

    accuracy                           0.51     23409
   macro avg       0.20      0.10      0.13     23409
weighted avg       1.00      0.51      0.67     23409



### Multinomial NB + GMM + N-Grams

In [24]:
X_gmm_ngrams = X_ngrams
y_gmm_ngrams = train_data_gmm_ngrams['Sentiment']

naive_bayes = MultinomialNB()
naive_bayes.fit(X_gmm_ngrams, y_gmm_ngrams)

# Validation
X_validation_ngrams = ngrams_vectorizer.transform(VALIDATION_ORIGINAL['Phrase'])
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_validation_predictions = naive_bayes.predict(X_validation_ngrams)
print("Accuracy and report for validation:", accuracy_score(y_validation, y_validation_predictions))
print(classification_report(y_validation, y_validation_predictions))


# Testing prediction
X_testing_ngrams = ngrams_vectorizer.transform(TEST_ORIGINAL['Phrase'])
y_testing_predictions = naive_bayes.predict(X_testing_ngrams)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_testing_predictions})
# test_predictions.to_csv('./dataset/naive_bayes-gmm-ngrams.csv', index=False)

Accuracy and report for validation: 0.5094621726686317
              precision    recall  f1-score   support

           0       0.79      0.02      0.04      1058
           1       0.57      0.00      0.01      4089
           2       0.51      1.00      0.67     11847
           3       0.64      0.01      0.01      4992
           4       0.80      0.01      0.03      1423

    accuracy                           0.51     23409
   macro avg       0.66      0.21      0.15     23409
weighted avg       0.58      0.51      0.35     23409



### Multinomial NB + GMM + Glove

In [48]:
# Find the minimum value in the training GloVe embeddings
min_value = np.min(train_data_glove)

# Make sure all values are non-negative
train_data_glove_positive = train_data_glove + abs(min_value)

# Train a Multinomial Naive Bayes model
naive_bayes_glove_2 = MultinomialNB()
naive_bayes_glove_2.fit(train_data_glove_positive, train_data_gmm_glove['Sentiment'])

# Transform validation and test data similarly
X_validation_glove = [preprocess_for_glove_embeddings(text) for text in VALIDATION_ORIGINAL['Phrase']]
X_validation_glove_positive = np.array(X_validation_glove) + abs(min_value)

y_validation = VALIDATION_ORIGINAL['Sentiment']
y_pred = naive_bayes_glove_2.predict(X_validation_glove_positive)

print("Accuracy and report:", accuracy_score(y_pred, y_validation))
print(classification_report(y_pred, y_validation))

# Test predictions
X_test_glove = [preprocess_for_glove_embeddings(text) for text in TEST_ORIGINAL['Phrase']]
X_test_glove_positive = np.array(X_test_glove) + abs(min_value)
y_test_glove = naive_bayes_glove_2.predict(X_test_glove_positive)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_test_glove})
# test_predictions.to_csv('./dataset/naive_bayes-kmeans-glove-positive.csv', index=False)

Accuracy and report: 0.5060874022811739
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      0.51      0.67     23409
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

    accuracy                           0.51     23409
   macro avg       0.20      0.10      0.13     23409
weighted avg       1.00      0.51      0.67     23409



## 2.2 Evaluating with KNN: 

### KNN + K-Means + N-grams

In [25]:
X_kmeans_ngrams = X_ngrams
y_kmeans_ngrams = train_data_kmeans_ngrams['Sentiment']

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_kmeans_ngrams, y_kmeans_ngrams)


# Validation
X_validation_ngrams = ngrams_vectorizer.transform(VALIDATION_ORIGINAL['Phrase'])
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_validation_predictions = knn.predict(X_validation_ngrams)
print("Accuracy and report for validation:", accuracy_score(y_validation, y_validation_predictions))
print(classification_report(y_validation, y_validation_predictions))


# Testing predictions
X_testing_ngrams = ngrams_vectorizer.transform(TEST_ORIGINAL['Phrase'])
y_testing_predictions = knn.predict(X_testing_ngrams)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_testing_predictions})
# test_predictions.to_csv('./dataset/knn-kmeans-ngrams.csv', index=False)

Accuracy and report for validation: 0.5174505532060318
              precision    recall  f1-score   support

           0       0.43      0.03      0.05      1058
           1       0.41      0.03      0.06      4089
           2       0.52      0.98      0.68     11847
           3       0.53      0.06      0.11      4992
           4       0.66      0.03      0.05      1423

    accuracy                           0.52     23409
   macro avg       0.51      0.23      0.19     23409
weighted avg       0.51      0.52      0.38     23409



### KNN + K-Means + GloVe

In [26]:
knn = KNeighborsClassifier()
knn.fit(train_data_glove, train_data_kmeans_glove['Sentiment'])

# Validation
X_validation_glove = [preprocess_for_glove_embeddings(text) for text in VALIDATION_ORIGINAL['Phrase']]
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_pred = knn.predict(X_validation_glove)

print("Accuracy and report:", accuracy_score(y_pred, y_validation))
print(classification_report(y_pred, y_validation))

# Test Predictions
X_test_glove = [preprocess_for_glove_embeddings(text) for text in TEST_ORIGINAL['Phrase']]
y_test_glove = knn.predict(X_test_glove)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_test_glove})
# test_predictions.to_csv('./dataset/knn-kmeans-glove.csv', index=False)

Accuracy and report: 0.5241146567559486
              precision    recall  f1-score   support

           0       0.10      0.40      0.15       255
           1       0.09      0.48      0.15       764
           2       0.96      0.53      0.68     21681
           3       0.06      0.53      0.11       572
           4       0.06      0.61      0.11       137

    accuracy                           0.52     23409
   macro avg       0.25      0.51      0.24     23409
weighted avg       0.90      0.52      0.64     23409



### KNN + GMM + N-grams

In [27]:
X_gmm_ngrams = X_ngrams
y_gmm_ngrams = train_data_gmm_ngrams['Sentiment']

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_gmm_ngrams, y_gmm_ngrams)


# Validation
X_validation_ngrams = ngrams_vectorizer.transform(VALIDATION_ORIGINAL['Phrase'])
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_validation_predictions = knn.predict(X_validation_ngrams)
print("Accuracy:", accuracy_score(y_validation, y_validation_predictions))
print(classification_report(y_validation, y_validation_predictions))


# Testing predictions
X_testing_ngrams = ngrams_vectorizer.transform(TEST_ORIGINAL['Phrase'])
y_testing_predictions = knn.predict(X_testing_ngrams)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_testing_predictions})
# test_predictions.to_csv('./dataset/knn-gmm-ngrams.csv', index=False)

Accuracy: 0.5144602503310692
              precision    recall  f1-score   support

           0       0.43      0.03      0.05      1058
           1       0.41      0.03      0.06      4089
           2       0.52      0.99      0.68     11847
           3       0.56      0.04      0.07      4992
           4       0.67      0.03      0.05      1423

    accuracy                           0.51     23409
   macro avg       0.52      0.22      0.18     23409
weighted avg       0.51      0.51      0.37     23409



### KNN + GMM + GloVe

In [28]:
knn = KNeighborsClassifier()
knn.fit(train_data_glove, train_data_gmm_glove['Sentiment'])

# Validation
X_validation_glove = [preprocess_for_glove_embeddings(text) for text in VALIDATION_ORIGINAL['Phrase']]
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_pred = knn.predict(X_validation_glove)

print("Accuracy and report:", accuracy_score(y_pred, y_validation))
print(classification_report(y_pred, y_validation))

# Test Predictions
X_test_glove = [preprocess_for_glove_embeddings(text) for text in TEST_ORIGINAL['Phrase']]
y_test_glove = knn.predict(X_test_glove)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_test_glove})
# test_predictions.to_csv('./dataset/knn-gmm-glove.csv', index=False)

Accuracy and report: 0.5241146567559486
              precision    recall  f1-score   support

           0       0.10      0.40      0.15       255
           1       0.09      0.48      0.15       764
           2       0.96      0.53      0.68     21681
           3       0.06      0.53      0.11       572
           4       0.06      0.61      0.11       137

    accuracy                           0.52     23409
   macro avg       0.25      0.51      0.24     23409
weighted avg       0.90      0.52      0.64     23409



## 2.3 Evaluating with Softmax regression: 

### Softmax + K-means + N-grams

In [29]:
X_kmeans_ngrams = X_ngrams
y_kmeans_ngrams = train_data_kmeans_ngrams['Sentiment']

softmax_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)
softmax_reg.fit(X_kmeans_ngrams, y_kmeans_ngrams)


# Validation
X_validation_ngrams = ngrams_vectorizer.transform(VALIDATION_ORIGINAL['Phrase'])
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_validation_predictions = softmax_reg.predict(X_validation_ngrams)
print("Accuracy and report on validation data:", accuracy_score(y_validation, y_validation_predictions))
print(classification_report(y_validation, y_validation_predictions))


# Testing predictions
X_testing_ngrams = ngrams_vectorizer.transform(TEST_ORIGINAL['Phrase'])
y_testing_predictions = softmax_reg.predict(X_testing_ngrams)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_testing_predictions})
# test_predictions.to_csv('./dataset/softmax_regression-kmeans-ngrams.csv', index=False)

Accuracy and report on validation data: 0.5138194711435773
              precision    recall  f1-score   support

           0       0.75      0.02      0.04      1058
           1       0.58      0.00      0.01      4089
           2       0.51      0.99      0.68     11847
           3       0.48      0.04      0.07      4992
           4       0.65      0.03      0.06      1423

    accuracy                           0.51     23409
   macro avg       0.60      0.22      0.17     23409
weighted avg       0.54      0.51      0.36     23409



### Softmax + K-means + GloVe

In [30]:
softmax_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)
softmax_reg.fit(train_data_glove, train_data_kmeans_glove['Sentiment'])

# Validation
X_validation_glove = [preprocess_for_glove_embeddings(text) for text in VALIDATION_ORIGINAL['Phrase']]
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_pred = softmax_reg.predict(X_validation_glove)

print("Accuracy and report:", accuracy_score(y_pred, y_validation))
print(classification_report(y_pred, y_validation))


# Test predictions
X_test_glove = [preprocess_for_glove_embeddings(text) for text in TEST_ORIGINAL['Phrase']]
y_test_glove = softmax_reg.predict(X_test_glove)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_test_glove})
# test_predictions.to_csv('./dataset/softmax_regression-kmeans-glove.csv', index=False)

Accuracy and report: 0.5096330471186296
              precision    recall  f1-score   support

           0       0.01      0.53      0.02        17
           1       0.01      0.56      0.02        57
           2       1.00      0.51      0.67     23211
           3       0.01      0.48      0.01        69
           4       0.03      0.67      0.05        55

    accuracy                           0.51     23409
   macro avg       0.21      0.55      0.15     23409
weighted avg       0.99      0.51      0.67     23409



### Softmax + GMM + N-grams

In [31]:
X_gmm_ngrams = X_ngrams
y_gmm_ngrams = train_data_gmm_ngrams['Sentiment']

softmax_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)
softmax_reg.fit(X_gmm_ngrams, y_gmm_ngrams)


# Validation
X_validation_ngrams = ngrams_vectorizer.transform(VALIDATION_ORIGINAL['Phrase'])
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_validation_predictions = softmax_reg.predict(X_validation_ngrams)
print("Accuracy and report for validation:", accuracy_score(y_validation, y_validation_predictions))
print(classification_report(y_validation, y_validation_predictions))


# Testing predictions
X_testing_ngrams = ngrams_vectorizer.transform(TEST_ORIGINAL['Phrase'])
y_testing_predictions = softmax_reg.predict(X_testing_ngrams)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_testing_predictions})
# test_predictions.to_csv('./dataset/softmax_regression-gmm-ngrams.csv', index=False)

Accuracy and report for validation: 0.509761202956128
              precision    recall  f1-score   support

           0       0.75      0.02      0.04      1058
           1       0.57      0.00      0.01      4089
           2       0.51      1.00      0.67     11847
           3       0.51      0.01      0.01      4992
           4       0.64      0.02      0.05      1423

    accuracy                           0.51     23409
   macro avg       0.60      0.21      0.16     23409
weighted avg       0.54      0.51      0.35     23409



### Softmax + GMM + GloVe

In [32]:
softmax_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)
softmax_reg.fit(train_data_glove, train_data_gmm_glove['Sentiment'])

# Validation
X_validation_glove = [preprocess_for_glove_embeddings(text) for text in VALIDATION_ORIGINAL['Phrase']]
y_validation = VALIDATION_ORIGINAL['Sentiment']

y_pred = softmax_reg.predict(X_validation_glove)

print("Accuracy and report:", accuracy_score(y_pred, y_validation))
print(classification_report(y_pred, y_validation))


# Test predictions
X_test_glove = [preprocess_for_glove_embeddings(text) for text in TEST_ORIGINAL['Phrase']]
y_test_glove = softmax_reg.predict(X_test_glove)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_test_glove})
# test_predictions.to_csv('./dataset/softmax_regression-gmm-glove.csv', index=False)

Accuracy and report: 0.5096330471186296
              precision    recall  f1-score   support

           0       0.01      0.53      0.02        17
           1       0.01      0.56      0.02        57
           2       1.00      0.51      0.67     23211
           3       0.01      0.48      0.01        69
           4       0.03      0.67      0.05        55

    accuracy                           0.51     23409
   macro avg       0.21      0.55      0.15     23409
weighted avg       0.99      0.51      0.67     23409



# Final model selection:

## Use Naive Bayes + KMeans + BoWs to predict data in test

In [51]:
knn = KNeighborsClassifier()
knn.fit(train_data_glove, train_data_gmm_glove['Sentiment'])

# Test Predictions
X_test_glove = [preprocess_for_glove_embeddings(text) for text in TEST_ORIGINAL['Phrase']]
y_test_glove = knn.predict(X_test_glove)

test_predictions = pd.DataFrame({'PhraseId': TEST_ORIGINAL['PhraseId'], 'Sentiment': y_test_glove})
test_predictions.to_csv('./data/knn-gmm-glove.csv', index=False)