In [83]:
# Helpful libraries
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression Classifier
from sklearn.neural_network import MLPClassifier # Multi Layer Perceptron, simple Neural Network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import spacy

In [84]:
### Jian Hui start

In [85]:
df = pd.read_csv('raw_data/fulltrain.csv', index_col = False)

In [86]:
# df.head()

In [87]:
X_train = df.iloc[:, 1] 
y_train = df.iloc[:, 0]

# print(X_train.head())
# print(y_train.head())

# print(len(X_train))
# print(len(y_train))

In [105]:
# Processing data: tokenize the text for NLP Machine Learning
# Eric
spacy_preprocess_model = spacy.load("en_core_web_sm")

def preprocess(ls_sentence):
    '''
    Preprocessing strategies:
    1) Tokenization
    2) Punctuation removal
    3) Stopword removal
    4) Lemmatization
    5) Lowercase
    '''
    tokens = spacy_preprocess_model(' '.join(ls_sentence))
    ls_sentence = [token.lemma_ for token in tokens if not token.is_punct and not token.is_stop]
    return ' '.join(ls_sentence)

# X_train = X_train.head(1000) # Comment out to test on partial set
X_train = X_train.apply(preprocess)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
print(X_train_counts.shape)

(100, 5353)


In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(48853, 229596)

In [None]:
# Naive Bayes Model
nb_clf = MultinomialNB().fit(X_train_tfidf, y_train) # fit model

# obtain predictions on training data
y_train_predicted = nb_clf.predict(X_train_tfidf)

In [None]:
# evaluate model training metrics with macro f1 score
f1_score(y_train, y_train_predicted, average='macro')

0.5921668554228574

In [None]:
# load test data
test = pd.read_csv('raw_data/balancedtest.csv', index_col = False)

In [None]:
X_test = df.iloc[:, 1] 
y_test = df.iloc[:, 0]

print(X_test.head())
print(y_test.head())

0    The writers of the HBO series The Sopranos too...
1    Despite claims from the TV news outlet to offe...
2    After receiving 'subpar' service and experienc...
3    After watching his beloved Seattle Mariners pr...
4    At a cafeteria-table press conference Monday, ...
Name: A little less than a decade ago, hockey fans were blessed with a slate of games every night, but on Thursday sources confirmed that for the ninth consecutive year NHL players have been locked out, with very slim hopes of an agreement in sight. It seems like just yesterday Martin St. Louis and his Lightning teammates were raising the Stanley Cup, high school hockey coach and onetime ESPN analyst Barry Melrose said. Obviously, Im still hoping the two sides can come together and reach an agreement, but Im starting to think nobody really misses hockey anymore. Nope. Nobody but old Barry. Id still love to catch an Atlanta Thrashers game. Observers have noted that when arena doors do reopen, the NHL will face the pe

In [None]:
# tokenize to occurences
X_test_counts = count_vect.transform(X_test)
print(X_test_counts.shape)

# change occurences to frequencies
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_tfidf.shape)

y_pred = nb_clf.predict(X_test_tfidf)

(48853, 229596)
(48853, 229596)


In [None]:
# Test f1 Score
# evaluate model training metrics with macro f1 score
f1_score(y_test, y_pred, average='macro')

0.5921668554228574

In [None]:
lr_clf = LogisticRegression(solver = 'saga')
lr_clf.fit(X_train_tfidf, y_train) # train the model

In [None]:
# do prediction on training data
y_train_predicted = lr_clf.predict(X_train_tfidf)

# obtain training f1 score
f1_score(y_train, y_train_predicted, average='macro')

0.9861389886323899

In [None]:
# obtain predictions on test data
y_pred = lr_clf.predict(X_test_tfidf)

# obtain test f1 score
f1_score(y_test, y_pred, average= 'macro')

0.9861389886323899

In [None]:
### Hyper Parameter tuning with GridSearchCV()

In [None]:
### Jian Hui end

In [None]:
### <Group Member's name> start

In [None]:
# Group member's code here

In [None]:
### <Group Member's name> end