In [1]:
from sklearn import model_selection, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings(action = 'ignore') 

In [2]:
# Import pre-cleaned dataset
input_prefix = 'yelp_cleaned'

df_train = pd.read_csv(input_prefix + '_train.csv')
df_test = pd.read_csv(input_prefix + '_test.csv')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 2 columns):
target    10261 non-null int64
text      10261 non-null object
dtypes: int64(1), object(1)
memory usage: 160.4+ KB


In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2054 entries, 0 to 2053
Data columns (total 2 columns):
target    2054 non-null int64
text      2054 non-null object
dtypes: int64(1), object(1)
memory usage: 32.2+ KB


In [5]:
# Split text back into tokens
X_train = [text.split(' ') for text in df_train['text'].astype(str)]
X_test = [text.split(' ') for text in df_test['text'].astype(str)]

In [6]:
# Split of targets
y_train = df_train['target']
y_test = df_test['target']

In [7]:
# Set up TFIDF vectorizing model
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,6))
tfidf_vect_ngram.fit(df_train['text'])

# Vectorize text tokens
X_train_tfidf =  tfidf_vect_ngram.transform(df_train['text'])
X_test_tfidf =  tfidf_vect_ngram.transform(df_test['text'])

# Get array of feature names (words) from TFIDF Vectorizer
tfidf_feat_names = np.array(tfidf_vect_ngram.get_feature_names())

In [8]:
# Undersample positive judgements to balance classes
ros = RandomUnderSampler(random_state=0)
X_train_tfidf_us, y_train_tfidf_us = ros.fit_resample(X_train_tfidf, y_train)

In [9]:
# Undersample positive judgements to balance classes
ros = RandomUnderSampler(random_state=0)
X_test_tfidf_us, y_test_tfidf_us = ros.fit_resample(X_test_tfidf, y_test)

In [10]:
# Instantiate Linear Logistic Regression Classifier
lr_clf_1 = linear_model.LogisticRegression()

# Linear Logistic Regression on TFIDF Vectors
lr_model_1 = lr_clf_1.fit(X_train_tfidf_us, y_train_tfidf_us)
lr_preds_1 = lr_model_1.predict(X_test_tfidf)

lr_accuracy_1 = metrics.accuracy_score(lr_preds_1, y_test)

print('LR Mean Vectors: ', lr_accuracy_1)
print('F1 Score: ', metrics.f1_score(y_test, lr_preds_1, average=None))
metrics.confusion_matrix(y_test, lr_preds_1)

LR Mean Vectors:  0.873904576436222
F1 Score:  [0.93117194 0.24927536]


array([[1752,  243],
       [  16,   43]], dtype=int64)

In [11]:
# Instantiate Linear Logistic Regression Classifier
lr_clf_1 = linear_model.LogisticRegression()

# Linear Logistic Regression on TFIDF Vectors
lr_model_1 = lr_clf_1.fit(X_train_tfidf_us, y_train_tfidf_us)
lr_preds_1 = lr_model_1.predict(X_test_tfidf)

lr_accuracy_1 = metrics.accuracy_score(lr_preds_1, y_test)

print('LR Mean Vectors: ', lr_accuracy_1)
print('F1 Score: ', metrics.f1_score(y_test, lr_preds_1, average=None))
metrics.confusion_matrix(y_test, lr_preds_1)

LR Mean Vectors:  0.873904576436222
F1 Score:  [0.93117194 0.24927536]


array([[1752,  243],
       [  16,   43]], dtype=int64)

In [12]:
# Instantiate SVC Classifier
svc_clf_1 = LinearSVC()

# SVC on TFIDF Vectors
svc_model_1 = svc_clf_1.fit(X_train_tfidf_us, y_train_tfidf_us)
svc_preds_1 = svc_model_1.predict(X_test_tfidf)

svc_accuracy_1 = metrics.accuracy_score(svc_preds_1, y_test)

print('SVC Mean Vectors: ', svc_accuracy_1)
print('F1 Score: ', metrics.f1_score(y_test, svc_preds_1, average=None))
metrics.confusion_matrix(y_test, svc_preds_1)

SVC Mean Vectors:  0.870983446932814
F1 Score:  [0.92942743 0.24929178]


array([[1745,  250],
       [  15,   44]], dtype=int64)

In [13]:
# Instantiate Naive Bayes Classifier
nb_clf_1 = naive_bayes.MultinomialNB()

# Naive Bayes  on TFIDF Vectors
nb_model_1 = nb_clf_1.fit(X_train_tfidf_us, y_train_tfidf_us)
nb_preds_1 = nb_model_1.predict(X_test_tfidf)

nb_accuracy_1 = metrics.accuracy_score(nb_preds_1, y_test)

print('NB, Count Vectors: ', nb_accuracy_1)
print('F1 Score: ', metrics.f1_score(y_test, nb_preds_1, average=None))
metrics.confusion_matrix(y_test, nb_preds_1)

NB, Count Vectors:  0.8485881207400194
F1 Score:  [0.91596866 0.23587224]


array([[1695,  300],
       [  11,   48]], dtype=int64)

In [14]:
ens_preds = np.round(np.mean([nb_preds_1, lr_preds_1, svc_preds_1], axis=0))

In [15]:
print('F1 Score: ', metrics.f1_score(y_test, ens_preds, average=None))
metrics.confusion_matrix(y_test, ens_preds)

F1 Score:  [0.92828579 0.2464986 ]


array([[1741,  254],
       [  15,   44]], dtype=int64)

In [16]:
# Get a list of the most important words for each category using TFIDF and 
# the LR model
print('Most important words for category 0')
print(tfidf_feat_names[np.argsort(-nb_model_1.coef_[0,:])[0:10]], '\n')



Most important words for category 0
['pron' 'great' 'good' 'the' 'be' 'place' 'food' 's' 'delicious' 'love'] 



In [17]:
# Oversample positive judgements to balance classes
ros = RandomOverSampler(random_state=0)
X_train_tfidf_os, y_train_tfidf_os = ros.fit_resample(X_train_tfidf, y_train)

In [18]:
# Instantiate Linear Logistic Regression Classifier
lr_clf_2 = linear_model.LogisticRegression()

# Linear Logistic Regression on TFIDF Vectors
lr_model_2 = lr_clf_2.fit(X_train_tfidf_os, y_train_tfidf_os)
lr_preds_2 = lr_model_2.predict(X_test_tfidf)

lr_accuracy_2 = metrics.accuracy_score(lr_preds_2, y_test)

print('LR Mean Vectors: ', lr_accuracy_2)
print('F1 Score: ', metrics.f1_score(y_test, lr_preds_2, average=None))
metrics.confusion_matrix(y_test, lr_preds_2)

LR Mean Vectors:  0.9698149951314509
F1 Score:  [0.98466106 0.06060606]


array([[1990,    5],
       [  57,    2]], dtype=int64)

In [19]:
# Instantiate SVC Classifier
svc_clf_2 = LinearSVC()

# SVC on TFIDF Vectors
svc_model_2 = svc_clf_2.fit(X_train_tfidf_os, y_train_tfidf_os)
svc_preds_2 = svc_model_2.predict(X_test_tfidf)

svc_accuracy_2 = metrics.accuracy_score(svc_preds_2, y_test)

print('SVC Mean Vectors: ', svc_accuracy_2)
print('F1 Score: ', metrics.f1_score(y_test, svc_preds_2, average=None))
metrics.confusion_matrix(y_test, svc_preds_2)

SVC Mean Vectors:  0.9712755598831548
F1 Score:  [0.98541409 0.06349206]


array([[1993,    2],
       [  57,    2]], dtype=int64)

In [20]:
# Instantiate Naive Bayes Classifier
nb_clf_2 = naive_bayes.MultinomialNB()

# Naive Bayes  on TFIDF Vectors
nb_model_2 = nb_clf_2.fit(X_train_tfidf_os, y_train_tfidf_os)
nb_preds_2 = nb_model_2.predict(X_test_tfidf)

nb_accuracy_2 = metrics.accuracy_score(nb_preds_2, y_test)

print('NB, Count Vectors: ', nb_accuracy_2)
print('F1 Score: ', metrics.f1_score(y_test, nb_preds_2, average=None))
metrics.confusion_matrix(y_test, nb_preds_2)

NB, Count Vectors:  0.9581304771178188
F1 Score:  [0.97823887 0.44871795]


array([[1933,   62],
       [  24,   35]], dtype=int64)

In [21]:
# Oversample positive judgements using SMOTE to balance classes
sm = SMOTE(random_state=13)
X_train_tfidf_sm, y_train_tfidf_sm = sm.fit_resample(X_train_tfidf, y_train)

In [22]:
# Instantiate Linear Logistic Regression Classifier
lr_clf_5 = linear_model.LogisticRegression()

# Linear Logistic Regression on TFIDF Vectors
lr_model_5 = lr_clf_5.fit(X_train_tfidf_sm, y_train_tfidf_sm)
lr_preds_5 = lr_model_5.predict(X_test_tfidf)

lr_accuracy_5 = metrics.accuracy_score(lr_preds_5, y_test)

print('LR Mean Vectors: ', lr_accuracy_5)
print('F1 Score: ', metrics.f1_score(y_test, lr_preds_5, average=None))
metrics.confusion_matrix(y_test, lr_preds_5)

LR Mean Vectors:  0.9698149951314509
F1 Score:  [0.98466106 0.06060606]


array([[1990,    5],
       [  57,    2]], dtype=int64)

In [23]:
# Instantiate SVC Classifier
svc_clf_5 = LinearSVC()

# SVC on TFIDF Vectors
svc_model_5 = svc_clf_5.fit(X_train_tfidf_sm, y_train_tfidf_sm)
svc_preds_5 = svc_model_5.predict(X_test_tfidf)

svc_accuracy_5 = metrics.accuracy_score(svc_preds_5, y_test)

print('SVC Mean Vectors: ', svc_accuracy_5)
print('F1 Score: ', metrics.f1_score(y_test, svc_preds_5, average=None))
metrics.confusion_matrix(y_test, svc_preds_5)

SVC Mean Vectors:  0.9712755598831548
F1 Score:  [0.98541409 0.06349206]


array([[1993,    2],
       [  57,    2]], dtype=int64)

In [24]:
# Instantiate Naive Bayes Classifier
nb_clf_5 = naive_bayes.MultinomialNB()

# Naive Bayes  on TFIDF Vectors
nb_model_5 = nb_clf_5.fit(X_train_tfidf_sm, y_train_tfidf_sm)
nb_preds_5 = nb_model_5.predict(X_test_tfidf)

nb_accuracy_5 = metrics.accuracy_score(nb_preds_5, y_test)

print('NB, Count Vectors: ', nb_accuracy_5)
print('F1 Score: ', metrics.f1_score(y_test, nb_preds_5, average=None))
metrics.confusion_matrix(y_test, nb_preds_5)

NB, Count Vectors:  0.939143135345667
F1 Score:  [0.96798976 0.38423645]


array([[1890,  105],
       [  20,   39]], dtype=int64)

In [25]:
# Create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,6))
count_vect.fit(df_train['text'])

# Transform the training and test data using count vectorizer object
X_train_count =  count_vect.transform(df_train['text'])
X_test_count =  count_vect.transform(df_test['text'])

# Get list of feature names (words) from Count Vectorizer
count_feat_names = count_vect.get_feature_names()

In [26]:
# Undersample positive judgements to balance classes
ros = RandomUnderSampler(random_state=0)
X_train_count_us, y_train_count_us = ros.fit_resample(X_train_count, y_train)

In [27]:
# Instantiate Linear Logistic Regression Classifier
lr_clf_3 = linear_model.LogisticRegression()

# Linear Logistic Regression on Count Vectors
lr_model_3 = lr_clf_3.fit(X_train_count_us, y_train_count_us)
lr_preds_3 = lr_model_3.predict(X_test_count)

lr_accuracy_3 = metrics.accuracy_score(lr_preds_3, y_test)

print('LR Mean Vectors: ', lr_accuracy_3)
print('F1 Score: ', metrics.f1_score(y_test, lr_preds_3, average=None))
metrics.confusion_matrix(y_test, lr_preds_3)

LR Mean Vectors:  0.8354430379746836
F1 Score:  [0.90815217 0.21028037]


array([[1671,  324],
       [  14,   45]], dtype=int64)

In [28]:
# Instantiate SVC Classifier
svc_clf_3 = LinearSVC()

# SVC on Count Vectors
svc_model_3 = svc_clf_3.fit(X_train_count_us, y_train_count_us)
svc_preds_3 = svc_model_3.predict(X_test_count)

svc_accuracy_3 = metrics.accuracy_score(svc_preds_3, y_test)

print('SVC Mean Vectors: ', svc_accuracy_3)
print('F1 Score: ', metrics.f1_score(y_test, svc_preds_3, average=None))
metrics.confusion_matrix(y_test, svc_preds_3)

SVC Mean Vectors:  0.8222979552093476
F1 Score:  [0.90019141 0.19068736]


array([[1646,  349],
       [  16,   43]], dtype=int64)

In [29]:
# Instantiate Naive Bayes Classifier
nb_clf_3 = naive_bayes.MultinomialNB()

# Naive Bayes on Count Vectors
nb_model_3 = nb_clf_3.fit(X_train_count_us, y_train_count_us)
nb_preds_3 = nb_model_3.predict(X_test_count)

nb_accuracy_3 = metrics.accuracy_score(nb_preds_3, y_test)

print('NB, Count Vectors: ', nb_accuracy_3)
print('F1 Score: ', metrics.f1_score(y_test, nb_preds_3, average=None))
metrics.confusion_matrix(y_test, nb_preds_3)

NB, Count Vectors:  0.939143135345667
F1 Score:  [0.96810411 0.33862434]


array([[1897,   98],
       [  27,   32]], dtype=int64)

In [30]:
# Get a list of the most important words for each category using BOW and 
# the Naive Bayes model
print('Most important words for category 0 (guns)')
print(tfidf_feat_names[np.argsort(-nb_model_2.coef_[0,:])[0:10]], '\n')


Most important words for category 0 (guns)
['pron' 'great' 'good' 'the' 'be' 'place' 'food' 's' 'not' 'love'] 



In [31]:
# Oversample positive judgements to balance classes
ros = RandomOverSampler(random_state=0)
X_train_count_os, y_train_count_os = ros.fit_resample(X_train_count, y_train)

In [32]:
# Instantiate Linear Logistic Regression Classifier
lr_clf_4 = linear_model.LogisticRegression()

# Linear Logistic Regression on Count Vectors
lr_model_4 = lr_clf_4.fit(X_train_count_os, y_train_count_os)
lr_preds_4 = lr_model_4.predict(X_test_count)

lr_accuracy_4 = metrics.accuracy_score(lr_preds_4, y_test)

print('LR Mean Vectors: ', lr_accuracy_4)
print('F1 Score: ', metrics.f1_score(y_test, lr_preds_4, average=None))
metrics.confusion_matrix(y_test, lr_preds_4)

LR Mean Vectors:  0.9707887049659202
F1 Score:  [0.98514851 0.11764706]


array([[1990,    5],
       [  55,    4]], dtype=int64)

In [33]:
# Instantiate SVC Classifier
svc_clf_4 = LinearSVC()

# SVC on Count Vectors
svc_model_4 = svc_clf_4.fit(X_train_count_os, y_train_count_os)
svc_preds_4 = svc_model_4.predict(X_test_count)

svc_accuracy_4 = metrics.accuracy_score(svc_preds_4, y_test)

print('SVC Mean Vectors: ', svc_accuracy_4)
print('F1 Score: ', metrics.f1_score(y_test, svc_preds_4, average=None))
metrics.confusion_matrix(y_test, svc_preds_4)

SVC Mean Vectors:  0.9688412852969815
F1 Score:  [0.98415057 0.08571429]


array([[1987,    8],
       [  56,    3]], dtype=int64)

In [34]:
# Instantiate Naive Bayes Classifier
nb_clf_4 = naive_bayes.MultinomialNB()

# Naive Bayes on Count Vectors
nb_model_4 = nb_clf_4.fit(X_train_count_os, y_train_count_os)
nb_preds_4 = nb_model_4.predict(X_test_count)

nb_accuracy_4 = metrics.accuracy_score(nb_preds_4, y_test)

print('NB, Count Vectors: ', nb_accuracy_4)
print('F1 Score: ', metrics.f1_score(y_test, nb_preds_4, average=None))
metrics.confusion_matrix(y_test, nb_preds_4)

NB, Count Vectors:  0.9712755598831548
F1 Score:  [0.98536343 0.23376623]


array([[1986,    9],
       [  50,    9]], dtype=int64)

In [35]:
# Oversample positive judgements using SMOTE to balance classes
sm = SMOTE(random_state=13)
X_train_count_sm, y_train_count_sm = sm.fit_resample(X_train_count, y_train)

In [36]:
# Instantiate Linear Logistic Regression Classifier
lr_clf_6 = linear_model.LogisticRegression()

# Linear Logistic Regression on Count Vectors
lr_model_6 = lr_clf_6.fit(X_train_count_sm, y_train_count_sm)
lr_preds_6 = lr_model_6.predict(X_test_count)

lr_accuracy_6 = metrics.accuracy_score(lr_preds_6, y_test)

print('LR Mean Vectors: ', lr_accuracy_6)
print('F1 Score: ', metrics.f1_score(y_test, lr_preds_6, average=None))
metrics.confusion_matrix(y_test, lr_preds_6)

LR Mean Vectors:  0.9284323271665044
F1 Score:  [0.96256684 0.1878453 ]


array([[1890,  105],
       [  42,   17]], dtype=int64)

In [37]:
# Instantiate SVC Classifier
svc_clf_6 = LinearSVC()

# SVC on Count Vectors
svc_model_6 = svc_clf_6.fit(X_train_count_sm, y_train_count_sm)
svc_preds_6 = svc_model_6.predict(X_test_count)

svc_accuracy_6 = metrics.accuracy_score(svc_preds_6, y_test)

print('SVC Mean Vectors: ', svc_accuracy_6)
print('F1 Score: ', metrics.f1_score(y_test, svc_preds_6, average=None))
metrics.confusion_matrix(y_test, svc_preds_6)

SVC Mean Vectors:  0.9259980525803311
F1 Score:  [0.9612047 0.2      ]


array([[1883,  112],
       [  40,   19]], dtype=int64)

In [38]:
# Instantiate Naive Bayes Classifier
nb_clf_6 = naive_bayes.MultinomialNB()

# Naive Bayes on Count Vectors
nb_model_6 = nb_clf_6.fit(X_train_count_sm, y_train_count_sm)
nb_preds_6 = nb_model_6.predict(X_test_count)

nb_accuracy_6 = metrics.accuracy_score(nb_preds_6, y_test)

print('NB, Count Vectors: ', nb_accuracy_6)
print('F1 Score: ', metrics.f1_score(y_test, nb_preds_6, average=None))
metrics.confusion_matrix(y_test, nb_preds_6)

NB, Count Vectors:  0.9712755598831548
F1 Score:  [0.9854285 0.       ]


array([[1995,    0],
       [  59,    0]], dtype=int64)

In [39]:
prob_preds = np.argmax(np.mean([nb_clf_1.predict_proba(X_test_tfidf),
                                nb_clf_2.predict_proba(X_test_tfidf),
                                nb_clf_5.predict_proba(X_test_tfidf)], axis=0), axis=1) + 1

print('F1 Score: ', metrics.f1_score(y_test, prob_preds, average=None))
metrics.confusion_matrix(y_test, prob_preds)

F1 Score:  [0.97247706 0.41304348]


array([[1908,   87],
       [  21,   38]], dtype=int64)