In [1]:
import os
import sklearn
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression as sk_OLS
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
import math

from sklearn.metrics import r2_score
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

In [2]:
train_data = pd.read_csv('data/train.csv')
val_data = pd.read_csv('data/val.csv')
test_data = pd.read_csv('data/test.csv')

In [3]:
# get all train data (labelled and unlabelled)
X_train    = train_data['Phrase']
y_train    = train_data['Sentiment']
# get only labelled train data
mask = (y_train != -100)
train_data_clean    = train_data[mask]
X_train_clean    = X_train[mask]
y_train_clean    = y_train[mask]

# get val data
X_val    = val_data['Phrase']
y_val    = val_data['Sentiment']

# get test data
X_test     = test_data['Phrase']


In [4]:

def clean(text):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    #added substitutions

    #***********added substitutions***********
    # remove all the special characters
    texter = re.sub(r'\W', ' ', texter)
    # remove all single characters
    texter = re.sub(r'\s+[a-zA-Z]\s+', ' ', texter)
    # Remove single characters from the start
    texter = re.sub(r'\^[a-zA-Z]\s+', ' ', texter)
    # Remove numbers
    texter = re.sub(r'\d+', ' ', texter)
    # Converting to Lowercase
    texter = texter.lower()
    # Remove punctuation
    texter = re.sub(r'[^\w\s]', ' ', texter)
    # Remove parentheses
    texter = re.sub(r'\([^)]*\)', ' ', texter)
    # Remove single quotes
    texter = re.sub(r'\'', ' ', texter)
    # Substituting multiple spaces with single space
    texter = re.sub(r'\s+', ' ', texter, flags=re.I)

    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    if texter == "":
        texter = ""
    return texter

def clean_dataset(dataset):
    for row in range(dataset.shape[0]):
        dataset[row,0] = clean(dataset[row,0])
    return dataset

def tokenize_lexicon(texts):
    return_texts = []
    for i in range(len(texts)):
        return_texts.append(nltk.word_tokenize(texts[i]))
        return_texts[i] = nltk.pos_tag(return_texts[i])
    return return_texts

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wn.ADJ
    elif pos_tag.startswith('V'):
        return wn.VERB
    elif pos_tag.startswith('N'):
        return wn.NOUN
    elif pos_tag.startswith('R'):
        return wn.ADV
    else:
        return wn.NOUN

def lemmatize_texts(texts):
    return_texts = []
    lemmer = nltk.stem.WordNetLemmatizer()
    for i in range(len(texts)):
        return_texts.append([])
        for j in range(len(texts[i])):
                return_texts[i].append(lemmer.lemmatize(texts[i][j][0], pos=get_wordnet_pos(texts[i][j][1])))
    return return_texts

def stem_texts(texts):
    return_texts = []
    ps = PorterStemmer()
    for i in range(len(texts)):
        return_texts.append([])
        for j in range(len(texts[i])):
                return_texts[i].append(ps.stem(texts[i][j][0]))
    return return_texts


def backtostring(texts):
    return_texts = []
    for i in range(len(texts)):
        return_texts.append(" ".join(texts[i]))
    return return_texts

In [5]:
def pre_process(data):
    preproc_data = data.copy()
    preproc_data = preproc_data.str.lower()
    punctuation = string.punctuation
    mapping = str.maketrans("", "", punctuation)
    preproc_data = preproc_data.str.translate(mapping)
    #nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    preproc_data = preproc_data.apply(lambda text: ' '.join([word for word in str(text).split() if word.lower() not in stop_words]))
    #nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    preproc_data = preproc_data.apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()]))
    preproc_data = preproc_data.apply(lambda text: re.sub(r'@\w+', '', re.sub(r'http\S+|www\S+', '', text)))
    return preproc_data

# get the preprocessed data
X_train_preproc   = pre_process(X_train)
X_train_clean_preproc   = pre_process(X_train_clean)
X_val_preproc = pre_process(X_val)
X_test_preproc = pre_process(X_test)

In [6]:
combined_data = pd.concat([X_train_preproc, X_val_preproc, X_test_preproc])

In [7]:
def bag_of_word(data,  threshold_M):
    vectorizer = CountVectorizer(binary=True, max_features= threshold_M)
    vectorizer.fit(combined_data)
    X = vectorizer.transform(data)
    featurized_data = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names_out())
    return featurized_data

# get the featurized data
X_train   = bag_of_word(X_train_preproc, 125)
X_train_clean = bag_of_word(X_train_clean_preproc, 125)
X_val = bag_of_word(X_val_preproc, 125)
X_test = bag_of_word(X_test_preproc, 125)

In [8]:
#BAG OF WORDS NAMES 125 threshold for pca
X_train_bow = bag_of_word(X_train_preproc, 125)
X_train_clean_bow = bag_of_word(X_train_clean_preproc, 125)
X_val_bow = bag_of_word(X_val_preproc, 125)
X_test_bow = bag_of_word(X_test_preproc, 125)

# do kmeans only on unlabeled data denoted by -100, this one is not the optimal but had to explore kmeans on only unlabeled
unlabeled_mask = (y_train == -100)
X_train_unlabeled_bow = X_train_bow[unlabeled_mask]
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(X_train_unlabeled_bow)
unlabeled_labels = kmeans.labels_
#ASSIGNING pseudo-labels to the unlabeled data in y_train
y_train.loc[unlabeled_mask] = unlabeled_labels

# visualizatio
"""pca = PCA(n_components=2)
X_train_2d = pca.fit_transform(X_train_bow)  # Apply PCA to the entire BoW data for visualization
cluster_centers_2d = pca.transform(kmeans.cluster_centers_)

plt.figure(figsize=(10, 6))
plt.scatter(X_train_2d[:, 0], X_train_2d[:, 1], c=y_train, cmap='viridis', alpha=0.5, label='Data points')
plt.scatter(cluster_centers_2d[:, 0], cluster_centers_2d[:, 1], marker='D', c='red', s=100, label='Cluster centers')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('K-Means Clustering with PCA-reduced Data')
plt.legend()
plt.grid(True)
plt.show()

# Elbow Method for selecting optimal k
objective_scores = []
k_values = range(1, 21)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X_train_2d)
    objective_scores.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_values, objective_scores, marker='o', linestyle='-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('K-Means Objective (Inertia)')
plt.title('Elbow Method for Optimal k')
plt.xticks(k_values)
plt.grid(True)
plt.show()"""


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train.loc[unlabeled_mask] = unlabeled_labels


"pca = PCA(n_components=2)\nX_train_2d = pca.fit_transform(X_train_bow)  # Apply PCA to the entire BoW data for visualization\ncluster_centers_2d = pca.transform(kmeans.cluster_centers_)\n\nplt.figure(figsize=(10, 6))\nplt.scatter(X_train_2d[:, 0], X_train_2d[:, 1], c=y_train, cmap='viridis', alpha=0.5, label='Data points')\nplt.scatter(cluster_centers_2d[:, 0], cluster_centers_2d[:, 1], marker='D', c='red', s=100, label='Cluster centers')\nplt.xlabel('Principal Component 1')\nplt.ylabel('Principal Component 2')\nplt.title('K-Means Clustering with PCA-reduced Data')\nplt.legend()\nplt.grid(True)\nplt.show()\n\n# Elbow Method for selecting optimal k\nobjective_scores = []\nk_values = range(1, 21)\nfor k in k_values:\n    kmeans = KMeans(n_clusters=k, random_state=0)\n    kmeans.fit(X_train_2d)\n    objective_scores.append(kmeans.inertia_)\n\nplt.figure(figsize=(10, 6))\nplt.plot(k_values, objective_scores, marker='o', linestyle='-')\nplt.xlabel('Number of Clusters (k)')\nplt.ylabel('K-M

In [9]:
pseudo_labeled_class_counts = pd.Series(unlabeled_labels).value_counts()
combined_y_train = pd.concat([y_train_clean, pd.Series(unlabeled_labels)], ignore_index=True)
combined_X_train = pd.concat([X_train_clean_bow, X_train_unlabeled_bow], ignore_index=True)  # Assuming bow vectors for both


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
#GRID SEARCH TEST
rf_param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'class_weight': ['balanced', 'balanced_subsample']
}

rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=rf_param_grid,
    cv=5,  
    scoring='accuracy',
    n_jobs=-1
)


rf_grid_search.fit(combined_X_train, combined_y_train)

# best parameters end up being max depth 10, n_estimators 100, meaning to less depth, 

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'class_weight': ['balanced', 'balanced_subsample'],
                         'max_depth': [None, 10, 20, 30],
                         'n_estimators': [10, 50, 100]},
             scoring='accuracy')

In [11]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

X_train_bow = bag_of_word(X_train_preproc, 125)
X_train_clean_bow = bag_of_word(X_train_clean_preproc, 125)
X_val_bow = bag_of_word(X_val_preproc, 125)
X_test_bow = bag_of_word(X_test_preproc, 125)

pca = PCA(n_components=125, random_state=0)
X_train_pca = pca.fit_transform(X_train_bow)
X_val_pca = pca.transform(X_val_bow)
X_test_pca = pca.transform(X_test_bow)
#kmeans
kmeans = KMeans(n_clusters=5, random_state=0)  
kmeans.fit(X_train_pca)
all_labels = kmeans.labels_

#assigning our pesudo labels for unlabeled data in y train using KMeans amd update y train for unlabeled
y_train.loc[unlabeled_mask] = all_labels[unlabeled_mask]
combined_X_train = X_train_pca  
combined_y_train = y_train 

# RANDOM FOREST 
rf_classifier = RandomForestClassifier(class_weight="balanced", random_state=0, n_estimators=10)
rf_classifier.fit(combined_X_train, combined_y_train)

# Accuracy is .39
y_val_pred = rf_classifier.predict(X_val_pca)
print("Accuracy for RF ,pca bow ,+ kmeans:", accuracy_score(y_val, y_val_pred))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train.loc[unlabeled_mask] = all_labels[unlabeled_mask]


Accuracy for RF ,pca bow ,+ kmeans: 0.39090987272101824


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore", message=".*keepdims.*")


knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(combined_X_train, combined_y_train)


y_val_pred = knn_classifier.predict(X_val_pca)


accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on Validation Set with k=3, pca, bow, kmeans, rf: {accuracy}")
#.42 is best with these methods

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

#optinized using grid search 
optimized_rf_classifier = RandomForestClassifier(class_weight="balanced", random_state=0, 
                                                 n_estimators=100, max_depth=10)
optimized_rf_classifier.fit(combined_X_train, combined_y_train)

# Random Forest on validation set
y_val_pred_rf = optimized_rf_classifier.predict(X_val_pca)
print(" RF Accuracy on Val class 1 and 4 always worst:", accuracy_score(y_val, y_val_pred_rf))
