## Setup

In [36]:
# RUN SETUP.SH BEFORE RUNNING THIS IPYNB

import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression Classifier
from sklearn.neural_network import MLPClassifier # Multi Layer Perceptron, simple Neural Network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import ADASYN, RandomOverSampler
from scipy.sparse import hstack, csr_matrix
import spacy
import re
from spacytextblob.spacytextblob import SpacyTextBlob

In [27]:
SEED = 42 # seed random state for comparison, testing
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = test on PARTITION_SIZE training and PARTITION_SIZE testing samples)

## Pre-processing

In [28]:
### Jian Hui start

In [29]:
df = pd.read_csv('raw_data/fulltrain.csv', index_col = False)
df.head()

Unnamed: 0,1,"A little less than a decade ago, hockey fans were blessed with a slate of games every night, but on Thursday sources confirmed that for the ninth consecutive year NHL players have been locked out, with very slim hopes of an agreement in sight. It seems like just yesterday Martin St. Louis and his Lightning teammates were raising the Stanley Cup, high school hockey coach and onetime ESPN analyst Barry Melrose said. Obviously, Im still hoping the two sides can come together and reach an agreement, but Im starting to think nobody really misses hockey anymore. Nope. Nobody but old Barry. Id still love to catch an Atlanta Thrashers game. Observers have noted that when arena doors do reopen, the NHL will face the perhaps greater challenge of convincing fans to return to hockey instead of watching more popular sports like football, basketball, baseball, and SlamBall."
0,1,The writers of the HBO series The Sopranos too...
1,1,Despite claims from the TV news outlet to offe...
2,1,After receiving 'subpar' service and experienc...
3,1,After watching his beloved Seattle Mariners pr...
4,1,"At a cafeteria-table press conference Monday, ..."


In [30]:
df = df if enable_all_data else df.sample(n=PARTITION_SIZE, random_state=SEED)
X_train = df.iloc[:, 1] 
y_train = df.iloc[:, 0]

print(X_train.head())
print(y_train.head())

print(len(X_train))
print(len(y_train))

y_train.value_counts()

0    The writers of the HBO series The Sopranos too...
1    Despite claims from the TV news outlet to offe...
2    After receiving 'subpar' service and experienc...
3    After watching his beloved Seattle Mariners pr...
4    At a cafeteria-table press conference Monday, ...
Name: A little less than a decade ago, hockey fans were blessed with a slate of games every night, but on Thursday sources confirmed that for the ninth consecutive year NHL players have been locked out, with very slim hopes of an agreement in sight. It seems like just yesterday Martin St. Louis and his Lightning teammates were raising the Stanley Cup, high school hockey coach and onetime ESPN analyst Barry Melrose said. Obviously, Im still hoping the two sides can come together and reach an agreement, but Im starting to think nobody really misses hockey anymore. Nope. Nobody but old Barry. Id still love to catch an Atlanta Thrashers game. Observers have noted that when arena doors do reopen, the NHL will face the pe

1
3    17870
1    14046
4     9995
2     6942
Name: count, dtype: int64

In [31]:
# Processing data: tokenize the text for NLP Machine Learning
# Lemmatization, Case-folding (lowercase), Stopword removal, Punctuation removal
# Eric
personal_pronouns = ["i", "me", "mine", "my", "myself", "our", "ours", "we", \
                     "their", "you", "your", "he", "she", "it", "its", "we", "they", "me", \
                     "him", "her", "us", "them", "his", "hers", "herself", \
                        "himself", "itself", "themselves", "ourselves", "yourself", "yourselves"]
spacy_preprocess_model = spacy.load("en_core_web_sm")
spacy_preprocess_model.Defaults.stop_words -= set(personal_pronouns)

# def preprocess(sentence):
#     '''
#     Preprocessing strategies:
#     1) Tokenization
#     2) Punctuation removal
#     3) Stopword removal
#     4) Lemmatization
#     5) Lowercase
#     '''
#     tokens = spacy_preprocess_model(sentence)
#     ls_sentence = [token.lemma_.lower() for token in tokens if not (token.is_punct and token not in ["!", "?"]) and not token.is_stop]
#     return ls_sentence

In [32]:
# Processing data: tokenize the text for NLP Machine Learning
# Case-folding (lowercase), Stopword removal, Punctuation removal

def preprocess(sentence):
    '''
    Preprocessing strategies:
    1) Tokenization
    2) Punctuation removal
    3) Stopword removal
    4) Lowercase
    '''
    tokens = spacy_preprocess_model(sentence)
    ls_sentence = [token.text.lower() for token in tokens if not (token.is_punct and token not in ["!", "?"]) and not token.is_stop]
    return ls_sentence

In [33]:
# Processing data: tokenize the text for NLP Machine Learning
# Case-folding (lowercase), Punctuation removal

# def preprocess(sentence):
#     '''
#     Preprocessing strategies:
#     1) Tokenization
#     2) Punctuation removal
#     3) Lowercase
#     '''
#     tokens = spacy_preprocess_model(sentence)
#     ls_sentence = [token.text.lower() for token in tokens if not (token.is_punct and token not in ["!", "?"])]
#     return ls_sentence

In [34]:
# Processing data: tokenize the text for NLP Machine Learning
# pos (TAG), Punctuation removal

# def preprocess(sentence):
#     '''
#     Preprocessing strategies:
#     1) Tokenization
#     2) Punctuation removal
#     3) POS tag
#     '''
#     tokens = spacy_preprocess_model(sentence)
#     ls_sentence = [token.tag_ for token in tokens if not (token.is_punct and token not in ["!", "?"])]
#     return ls_sentence

In [35]:
# To be used by features for feature extraction:
X_train_ls = X_train.apply(preprocess)
X_train_sentence = X_train_ls.apply(lambda sentence: ' '.join(sentence))

# X_train_ls = X_train
# X_train_sentence = X_train_ls

KeyboardInterrupt: 

### Save and load preprocessed data

In [None]:
X_train_sentence.head()

0    little decade ago hockey fan bless slate game ...
1    writer hbo series sopranos take daring storyte...
2    despite claim tv news outlet offer nonstop new...
3    receive subpar service experience unusually lo...
4    watch beloved seattle mariners prevail san die...
Name: Sentence, dtype: object

In [None]:
# Save pre-processed data
compression_opts = dict(method='zip', archive_name='strip_punct_stop_lower.csv')
X_train_sentence.to_csv('strip_punct_stop_lower.zip', index=False, compression=compression_opts)

In [None]:
# quickload pre-processed data
# replace 'raw_data/*.csv' with .csv file containing preprocessed data
X_train = pd.read_csv('raw_data/strip_punct_stop_lower.csv', index_col=False).iloc[:, 0]
X_train.head()

# Reload constants if preprocessing cells are not executed
personal_pronouns = ["i", "me", "mine", "my", "myself", "our", "ours", "we", \
                     "their", "you", "your", "he", "she", "it", "its", "we", "they", "me", \
                     "him", "her", "us", "them", "his", "hers", "herself", \
                        "himself", "itself", "themselves", "ourselves", "yourself", "yourselves"]
spacy_model = spacy.load("en_core_web_sm")
spacy_model.Defaults.stop_words -= set(personal_pronouns)
spacy_model.add_pipe('spacytextblob')

### Train-Validation Split

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

### Feature Engineering

In [None]:
# Feature set:
# 1) TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# 2) Sentiment analysis
def sentiment_analysis(sentence):
    doc = spacy_model(sentence)
    return doc._.blob.polarity ** 2 # square to flag extreme polarity text

X_train_sent_ana = X_train.apply(sentiment_analysis)

# 3) Subjectivity analysis 
def subjectivity_analysis(sentence):
    doc = spacy_model(sentence)
    return doc._.blob.subjectivity

X_train_subj_ana = X_train.apply(subjectivity_analysis)

# 4) Number of exclamation and question marks (pre-analyze first)
def count_exclamation_question_marks(sentence):
    count_exclamation = len(re.findall(r'!', sentence))
    count_question = len(re.findall(r'?', sentence))

    return count_exclamation, count_question

X_train_count_ex, X_train_count_qn = X_train.apply(count_exclamation_question_marks)

# 5) Word2vec on top k words tf-idf per sentence? and then sum them up
# k = 5
# def word2vec_select_k_best_sum(matrix):
#     scores = matrix.toarray()[0]
#     top_score_words = scores.argsort()[-k:][::-1]
#     for word in top_score_words:
#         spacy_model(u'{word}')

# 6) Number of personal pronouns
def count_personal_pronouns(sentence):
    counter = 0
    sentence_ls = sentence.split()
    for token in sentence_ls:
        if token in personal_pronouns:
            counter += 1
    return counter

X_train_count_pp = X_train.apply(count_personal_pronouns)


# 7) 

# Model: LSTM? Random forest, SVM

In [None]:
# Consolidation of feature sets into single vector:
# Eric
X_train = hstack([X_train_tfidf, X_train_sent_ana, X_train_subj_ana, X_train_count_ex, X_train_count_qn, X_train_count_pp])

### Oversampling

In [None]:
# RandomOverSampler
ros = RandomOverSampler(random_state=SEED)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
# ADASYN
ada = ADASYN(random_state=SEED)
X_train, y_train = ada.fit_resample(X_train, y_train)

In [None]:
# SMOTEENN

## Models

### Naive Bayes Model [MultinomialNB]

In [None]:
model = MultinomialNB().fit(X_train, y_train) # fit model

### Logistic Regression Model [LogisticRegression]

In [None]:
model = LogisticRegression(solver = 'saga').fit(X_train, y_train) # train the model

## Validation

In [None]:
# Apply feature engineering on X_val
X_val_tfidf = tfidf_vectorizer.transform(X_val)

In [None]:
# Consolidation of feature sets:
X_val = hstack([X_val_tfidf])

In [None]:
# obtain predictions on validation data
y_val_predicted = model.predict(X_val)

# evaluate model training metrics with macro f1 score
f1_score(y_val, y_val_predicted, average='macro')

## Test Data

In [None]:
# TEST DATA 
test_df = pd.read_csv('raw_data/balancedtest.csv', index_col = False)
test_df = test_df if enable_all_data else test_df.sample(PARTITION_SIZE)

In [None]:
X_test = test_df.iloc[:, 1]
y_test = test_df.iloc[:, 0]

# print(X_test.head())
# print(y_test.head())

In [None]:
# Preprocess test data to match steps on training data
X_test_ls = X_test.apply(preprocess)
X_test_sentence = X_test_ls.apply(lambda sentence: ' '.join(sentence))

X_test = X_test_sentence

### Feature Engineering (Test Data)

In [None]:
# 1) TF-IDF
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Consolidation of feature transformations into single vector
# Eric
X_test = hstack([X_test_tfidf])

In [None]:
# obtain predictions on test data
y_pred = model.predict(X_test)

# evaluate model training metrics with macro f1 score
f1_score(y_test, y_pred, average='macro')

In [None]:
# do prediction on training data
y_train_predicted = model.predict(X_train)

# obtain training f1 score
f1_score(y_train, y_train_predicted, average='macro') # TODO this tests the model on its already trained set...

In [None]:
# obtain predictions on test data
y_pred = model.predict(X_test)

# obtain test f1 score
f1_score(y_test, y_pred, average= 'macro')

In [None]:
### Hyper Parameter tuning with GridSearchCV()

In [None]:
### Jian Hui end

In [None]:
### <Group Member's name> start

In [None]:
# Group member's code here

In [None]:
### <Group Member's name> end