In [2]:
# RUN SETUP.SH BEFORE RUNNING THIS IPYNB

import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression Classifier
from sklearn.neural_network import MLPClassifier # Multi Layer Perceptron, simple Neural Network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import ADASYN
from scipy.sparse import hstack, csr_matrix
import spacy

In [46]:
### Jian Hui start

In [3]:
df = pd.read_csv('raw_data/fulltrain.csv', index_col = False)
# df.head()

Unnamed: 0,Label,Sentence
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [4]:
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = test on PARTITION_SIZE training and PARTITION_SIZE testing samples)

df = df if enable_all_data else df.sample(PARTITION_SIZE)
X_train = df.iloc[:, 1] 
y_train = df.iloc[:, 0]

print(X_train.head())
print(y_train.head())

print(len(X_train))
print(len(y_train))

0    A little less than a decade ago, hockey fans w...
1    The writers of the HBO series The Sopranos too...
2    Despite claims from the TV news outlet to offe...
3    After receiving 'subpar' service and experienc...
4    After watching his beloved Seattle Mariners pr...
Name: Sentence, dtype: object
0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64
48854
48854


In [None]:
SEED = 42

In [6]:
# Processing data: tokenize the text for NLP Machine Learning
# Eric
spacy_preprocess_model = spacy.load("en_core_web_sm")

def preprocess(sentence):
    '''
    Preprocessing strategies:
    1) Tokenization
    2) Punctuation removal
    3) Stopword removal
    4) Lemmatization
    5) Lowercase
    '''
    tokens = spacy_preprocess_model(sentence)
    ls_sentence = [token.lemma_ for token in tokens if not token.is_punct and not token.is_stop]
    return ls_sentence

# To be used by features for feature extraction:
X_train_ls = X_train.apply(preprocess)
X_train_sentence = X_train_ls.apply(lambda sentence: ' '.join(sentence))

# X_train_ls = X_train
# X_train_sentence = X_train_ls

In [7]:
print(X_train.head())
print(X_train_ls.head())
print(X_train_sentence.head())

0    A little less than a decade ago, hockey fans w...
1    The writers of the HBO series The Sopranos too...
2    Despite claims from the TV news outlet to offe...
3    After receiving 'subpar' service and experienc...
4    After watching his beloved Seattle Mariners pr...
Name: Sentence, dtype: object
0    [little, decade, ago, hockey, fan, bless, slat...
1    [writer, HBO, series, Sopranos, take, daring, ...
2    [despite, claim, tv, news, outlet, offer, nons...
3    [receive, subpar, service, experience, unusual...
4    [watch, beloved, Seattle, Mariners, prevail, S...
Name: Sentence, dtype: object
0    little decade ago hockey fan bless slate game ...
1    writer HBO series Sopranos take daring storyte...
2    despite claim tv news outlet offer nonstop new...
3    receive subpar service experience unusually lo...
4    watch beloved Seattle Mariners prevail San Die...
Name: Sentence, dtype: object


In [10]:
X_train_sentence.to_csv('lemma_strip_punct_stop_tokens.csv')

In [50]:
# Feature set:
# 1) TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_sentence)

# 2) NER? 


# Consolidation of feature sets into single vector:
# Eric
X_train = hstack([X_train_tfidf])


In [51]:
# Naive Bayes Model
nb_clf = MultinomialNB().fit(X_train, y_train) # fit model

# obtain predictions on training data
y_train_predicted = nb_clf.predict(X_train)

# evaluate model training metrics with macro f1 score
f1_score(y_train, y_train_predicted, average='macro') # TODO this tests the model on its already trained set...

0.5916500053383984

In [52]:
# TEST DATA 
test_df = pd.read_csv('raw_data/balancedtest.csv', index_col = False)
test_df = test_df if enable_all_data else test_df.sample(PARTITION_SIZE)

In [53]:
X_test = test_df.iloc[:, 1]
y_test = test_df.iloc[:, 0]

# print(X_test.head())
# print(y_test.head())

# Preprocess test data to match steps on training data
X_test_ls = X_test.apply(preprocess)
X_test_sentence = X_test_ls.apply(lambda sentence: ' '.join(sentence))

# X_test_ls = X_test
# X_test_sentence = X_test_ls

In [54]:
# Feature transformation for test data

# 1) TF-IDF
X_test_tfidf = tfidf_vectorizer.transform(X_test_sentence)

# Consolidation of feature transformations into single vector
# Eric
X_test = hstack([X_test_tfidf])

y_pred = nb_clf.predict(X_test)

In [55]:
# Test f1 Score
# evaluate model training metrics with macro f1 score
f1_score(y_test, y_pred, average='macro')

0.10567823343848581

In [56]:
lr_clf = LogisticRegression(solver = 'saga')
lr_clf.fit(X_train, y_train) # train the model



In [57]:
# do prediction on training data
y_train_predicted = lr_clf.predict(X_train)

# obtain training f1 score
f1_score(y_train, y_train_predicted, average='macro') # TODO this tests the model on its already trained set...

0.9885542733839947

In [58]:
# obtain predictions on test data
y_pred = lr_clf.predict(X_test)

# obtain test f1 score
f1_score(y_test, y_pred, average= 'macro')

0.3386648304806873

In [59]:
### Hyper Parameter tuning with GridSearchCV()

In [60]:
### Jian Hui end

In [61]:
### <Group Member's name> start

In [62]:
# Group member's code here

In [63]:
### <Group Member's name> end