In [11]:
# Helpful libraries
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression Classifier
from sklearn.neural_network import MLPClassifier # Multi Layer Perceptron, simple Neural Network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
### Jian Hui start

In [2]:
# Load the training set. The CSV carries a serialized row-index column that
# pandas would otherwise surface as a spurious 'Unnamed: 0' data column, so read
# only the two columns the notebook actually uses. Rows keep the default
# 0..n-1 index.
train = pd.read_csv('fulltrain.csv', usecols=['Label', 'Sentence'], index_col=False)

In [3]:
# Preview the first five training rows to check that the file parsed as expected.
train.head(n=5)

Unnamed: 0,Label,Sentence
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [5]:
# Split the training frame into model inputs (raw text) and targets (labels).
X_train, y_train = train['Sentence'], train['Label']

# Show the first rows of each series: text first, labels second.
print(X_train.head(), y_train.head(), sep='\n')

0    A little less than a decade ago, hockey fans w...
1    The writers of the HBO series The Sopranos too...
2    Despite claims from the TV news outlet to offe...
3    After receiving 'subpar' service and experienc...
4    After watching his beloved Seattle Mariners pr...
Name: Sentence, dtype: object
0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64


In [7]:
# Sanity check: inputs and targets should contain the same number of rows.
for split in (X_train, y_train):
    print(len(split))

48854
48854


In [10]:
# Bag-of-words step: learn the vocabulary from the training sentences and
# encode every sentence as a row of raw token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Report (number of documents, vocabulary size).
n_train_docs, n_vocab_terms = X_train_counts.shape
print((n_train_docs, n_vocab_terms))

(48854, 229597)


In [12]:
# Re-weight the raw counts with TF-IDF: first learn the inverse document
# frequencies from the training counts, then apply them to the same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# Cell output: shape of the weighted feature matrix.
X_train_tfidf.shape

(48854, 229597)

In [14]:
# Multinomial Naive Bayes baseline trained on the TF-IDF features.
nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train)

# Predict on the very data the model was fitted on; this measures how well the
# model fits the training set, not how well it generalizes.
y_train_predicted = nb_clf.predict(X_train_tfidf)

In [15]:
# Macro-averaged F1 on the training set: per-class F1 scores are averaged with
# equal weight, regardless of how many samples each class has.
f1_score(y_true=y_train, y_pred=y_train_predicted, average='macro')

0.5921665663623545

In [31]:
# Read the test set from disk.
test = pd.read_csv(filepath_or_buffer='balancedtest.csv', index_col=False)

In [32]:
# List the column labels of the test frame.
test_columns = test.columns
print(test_columns)

Index(['Label', 'Sentence'], dtype='object')


In [34]:
# Same input/target split as for training, applied to the test frame.
X_test, y_test = test['Sentence'], test['Label']

# Preview the first rows of the text and then of the labels.
for column in (X_test, y_test):
    print(column.head())

0    When so many actors seem content to churn out ...
1     In what football insiders are calling an unex...
2    In a freak accident following Game 3 of the N....
3    North Koreas official news agency announced to...
4    The former Alaska Governor Sarah Palin would b...
Name: Sentence, dtype: object
0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64


In [35]:
# Encode the test sentences with the vectorizer and TF-IDF transformer that were
# fitted on the training data (transform only, no refitting).
X_test_counts = count_vect.transform(X_test)                 # token occurrences
X_test_tfidf = tfidf_transformer.transform(X_test_counts)    # occurrences -> TF-IDF weights

# Shapes of the count matrix and of the TF-IDF matrix, in that order.
for features in (X_test_counts, X_test_tfidf):
    print(features.shape)

# Class predictions from the fitted Naive Bayes model.
y_pred = nb_clf.predict(X_test_tfidf)

(3000, 229597)
(3000, 229597)


In [36]:
# Macro-averaged F1 on the test set (every class weighted equally).
f1_score(y_true=y_test, y_pred=y_pred, average='macro')

0.3190775650700206

In [None]:
### more of Jian Hui's code here

In [None]:
### Jian Hui end

In [None]:
### <Group Member's name> start

In [None]:
# Group member's code here

In [None]:
### <Group Member's name> end