## Setup

In [1]:
# RUN SETUP.SH BEFORE RUNNING THIS IPYNB

import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression Classifier
from sklearn.neural_network import MLPClassifier # Multi Layer Perceptron, simple Neural Network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import ADASYN, RandomOverSampler
from scipy.sparse import hstack, csr_matrix
import spacy

In [2]:
SEED = 42 # seed random state for comparison, testing
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = test on PARTITION_SIZE training and PARTITION_SIZE testing samples)

## Pre-processing

In [None]:
### Jian Hui start

In [3]:
df = pd.read_csv('raw_data/fulltrain.csv', index_col = False)
df.head()

Unnamed: 0,Label,Sentence
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [4]:
df = df if enable_all_data else df.sample(n=PARTITION_SIZE, random_state=SEED)
X_train = df.iloc[:, 1] 
y_train = df.iloc[:, 0]

print(X_train.head())
print(y_train.head())

print(len(X_train))
print(len(y_train))

y_train.value_counts()

0    A little less than a decade ago, hockey fans w...
1    The writers of the HBO series The Sopranos too...
2    Despite claims from the TV news outlet to offe...
3    After receiving 'subpar' service and experienc...
4    After watching his beloved Seattle Mariners pr...
Name: Sentence, dtype: object
0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64
48854
48854


3    17870
1    14047
4     9995
2     6942
Name: Label, dtype: int64

In [5]:
# Processing data: tokenize the text for NLP Machine Learning
# Eric
spacy_preprocess_model = spacy.load("en_core_web_sm")

def preprocess(sentence):
    '''
    Preprocessing strategies:
    1) Tokenization
    2) Punctuation removal
    3) Stopword removal
    4) Lemmatization
    5) Lowercase
    '''
    tokens = spacy_preprocess_model(sentence)
    ls_sentence = [token.lemma_ for token in tokens if not token.is_punct and not token.is_stop]
    return ls_sentence

In [None]:
# To be used by features for feature extraction:
X_train_ls = X_train.apply(preprocess)
X_train_sentence = X_train_ls.apply(lambda sentence: ' '.join(sentence))

# X_train_ls = X_train
# X_train_sentence = X_train_ls

In [6]:
# quickload pre-processed data
X_train = pd.read_csv('raw_data/lemma_strip_punct_stop.csv', index_col=False).iloc[:, 0]
X_train.head()

0    little decade ago hockey fan bless slate game ...
1    writer HBO series Sopranos take daring storyte...
2    despite claim tv news outlet offer nonstop new...
3    receive subpar service experience unusually lo...
4    watch beloved Seattle Mariners prevail San Die...
Name: Sentence, dtype: object

In [None]:
# Save pre-processed data
# compression_opts = dict(method='zip', archive_name='lemma_strip_punct_stop.csv')
# X_train_sentence.to_csv('lemma_strip_punct_stop.zip', index=False, compression=compression_opts)

### Train-Validation Split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

### Feature Engineering

In [8]:
# Feature set:
# 1) TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# 2) NER? 

In [9]:
# Consolidation of feature sets into single vector:
# Eric
X_train = hstack([X_train_tfidf])

### Oversampling

In [None]:
# RandomOverSampler
ros = RandomOverSampler(random_state=SEED)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
# ADASYN
ada = ADASYN(random_state=SEED)
X_train, y_train = ada.fit_resample(X_train, y_train)

In [None]:
# SMOTEENN

## Models

### Naive Bayes Model [MultinomialNB]

In [10]:
model = MultinomialNB().fit(X_train, y_train) # fit model

### Logistic Regression Model [LogisticRegression]

In [20]:
model = LogisticRegression(solver = 'saga').fit(X_train, y_train) # train the model

## Validation

In [11]:
# Apply feature engineering on X_val
X_val_tfidf = tfidf_vectorizer.transform(X_val)

In [12]:
# Consolidation of feature sets:
X_val = hstack([X_val_tfidf])

In [21]:
# obtain predictions on validation data
y_val_predicted = model.predict(X_val)

# evaluate model training metrics with macro f1 score
f1_score(y_val, y_val_predicted, average='macro')

0.9487405898448626

## Test Data

In [14]:
# TEST DATA 
test_df = pd.read_csv('raw_data/balancedtest.csv', index_col = False)
test_df = test_df if enable_all_data else test_df.sample(PARTITION_SIZE)

In [15]:
X_test = test_df.iloc[:, 1]
y_test = test_df.iloc[:, 0]

# print(X_test.head())
# print(y_test.head())

In [16]:
# Preprocess test data to match steps on training data
X_test_ls = X_test.apply(preprocess)
X_test_sentence = X_test_ls.apply(lambda sentence: ' '.join(sentence))

# X_test_ls = X_test
# X_test_sentence = X_test_ls

### Feature Engineering (Test Data)

In [17]:
# 1) TF-IDF
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [18]:
# Consolidation of feature transformations into single vector
# Eric
X_test = hstack([X_test_tfidf])

In [22]:
# obtain predictions on test data
y_pred = model.predict(X_test)

# evaluate model training metrics with macro f1 score
f1_score(y_test, y_pred, average='macro')

0.4195495337848855

In [None]:
# do prediction on training data
y_train_predicted = lr_clf.predict(X_train)

# obtain training f1 score
f1_score(y_train, y_train_predicted, average='macro') # TODO this tests the model on its already trained set...

In [None]:
# obtain predictions on test data
y_pred = lr_clf.predict(X_test)

# obtain test f1 score
f1_score(y_test, y_pred, average= 'macro')

In [None]:
### Hyper Parameter tuning with GridSearchCV()

In [None]:
### Jian Hui end

In [None]:
### <Group Member's name> start

In [None]:
# Group member's code here

In [None]:
### <Group Member's name> end