In [1]:
#Using the data that was scraped from the previous notebook,
#we will explore, analyse the data via spaCy
#and run our model via NaiveBayes and Logistic Regression to compare the models

In [4]:
import pandas as pd
import numpy as np

In [5]:
import spacy
en_nlp = spacy.load('en')

In [6]:
final = pd.read_csv('../Datasets/final.csv')

In [7]:
#run the text columns thru spacy
parsed_quotes = []
for i, parsed in enumerate(en_nlp.pipe(final.text.values, batch_size=50, n_threads=4)):
    assert parsed.is_parsed
    if (i % 1000) == 0:
        print(i)
    parsed_quotes.append(parsed)        

0
1000


In [8]:
#Find all the unique part of speech categories in the reviews.
unique_pos = []
for parsed in parsed_quotes:
    unique_pos.extend([t.pos_ for t in parsed])
unique_pos = np.unique(unique_pos)
print(unique_pos)

['ADJ' 'ADP' 'ADV' 'AUX' 'CCONJ' 'DET' 'INTJ' 'NOUN' 'NUM' 'PART' 'PRON'
 'PROPN' 'PUNCT' 'SYM' 'VERB' 'X']


In [7]:
#Create the proportion columns for each part of speech.
for pos in unique_pos:
    final[pos+'_prop'] = 0.

In [8]:
#Iterate through the titles and calculate the proportions of each part of speech tag.
final = final.reset_index(drop=True)
for i, parsed in enumerate(parsed_quotes):
    if (i % 100) == 0:
        print(i, end=' ')
    parsed_len = len(parsed)
    for pos in unique_pos:
        count = len([x for x in parsed if x.pos_ == pos])
        final.loc[i, pos+'_prop'] = float(count)/parsed_len

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 

In [9]:
final.head()

Unnamed: 0,text,target,ADJ_prop,ADP_prop,ADV_prop,AUX_prop,CCONJ_prop,DET_prop,INTJ_prop,NOUN_prop,NUM_prop,PART_prop,PRON_prop,PROPN_prop,PUNCT_prop,SYM_prop,VERB_prop,X_prop
0,Welcome to Kroger Join our community Discord s...,Kroger,0.0,0.125,0.0,0.0,0.0,0.125,0.0,0.25,0.0,0.0,0.0,0.375,0.0,0.0,0.125,0.0
1,/r/kroger Renovation,Kroger,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0
2,Who thought this was a good idea?,Kroger,0.125,0.0,0.0,0.0,0.0,0.25,0.0,0.125,0.0,0.0,0.125,0.0,0.125,0.0,0.25,0.0
3,Our customers can’t even take free posters off...,Kroger,0.0625,0.125,0.125,0.0625,0.0,0.125,0.0,0.1875,0.0,0.0625,0.0625,0.0,0.0625,0.0,0.125,0.0
4,Waste Integration and Best Practices,Kroger,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0


In [10]:
#1 = Kroger, 0 = Publix
final['target'] = final['target'].map(lambda x: 1 if x == 'Kroger' else 0)

In [11]:
from sklearn.preprocessing import StandardScaler

X = final.iloc[:,2:]
y = final.target

ss = StandardScaler()
Xs = ss.fit_transform(X)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [13]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

In [14]:
model = nb.fit(X_train,y_train)

In [15]:
predictions = model.predict(X_test)

In [16]:
# Score our model on the training set.
model.score(X_train, y_train)

0.5181208053691275

In [17]:
model.score(X_test, y_test)

0.4949698189134809

In [18]:
# Import the confusion matrix function.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

array([[ 52, 197],
       [ 54, 194]])

In [19]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [20]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("")

#Most predictions leaning towards Kroger, regardless of accuracy
print("Accuracy =","{0:.2f}".format((tp + tn)/(tn+fp+fn+tp)))
#Only half of the predictions are correct

print("Recall =","{0:.2f}".format(tp/(fn+tp)))
#correctly predicted Kroger 91% of the time, but that is also because all predictions lean towards Kroger anyways
print("Specificity =","{0:.2f}".format(tn/(tn+fp)))
#correctly predicted Publix only 13% of the time, due to the same reason above

True Negatives: 52
False Positives: 197
False Negatives: 54
True Positives: 194

Accuracy = 0.49
Recall = 0.78
Specificity = 0.21


In [21]:
#building a secondary model to see if there is a difference in scores

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [22]:
lr_scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=10)
print(np.mean(lr_scores), final.target.mean())

0.5208198198198197 0.49974836436839454




In [26]:
lr_scores2 = cross_val_score(LogisticRegression(), X_test, y_test, cv=10)
print(np.mean(lr_scores2), final.target.mean())

#Using Logistic Regression results in a similar result

0.5226904761904761 0.49974836436839454




In [30]:
lr = LogisticRegression()
model2 = lr.fit(X_train,y_train)
predictions2 = model2.predict(X_test)
confusion_matrix(y_test,predictions2)



array([[131, 118],
       [111, 137]])

In [36]:
tn2, fp2, fn2, tp2 = confusion_matrix(y_test, predictions2).ravel()

In [37]:
print("True Negatives: %s" % tn2)
print("False Positives: %s" % fp2)
print("False Negatives: %s" % fn2)
print("True Positives: %s" % tp2)
print("")

#Most predictions leaning towards Kroger, regardless of accuracy
print("Accuracy =","{0:.2f}".format((tp2 + tn2)/(tn2+fp2+fn2+tp2)))
#Only half of the predictions are correct

print("Recall =","{0:.2f}".format(tp2/(fn2+tp2)))
#correctly predicted Kroger 55% of the time
print("Specificity =","{0:.2f}".format(tn/(tn2+fp2)))
#correctly predicted Publix only 21% of the time

True Negatives: 131
False Positives: 118
False Negatives: 111
True Positives: 137

Accuracy = 0.54
Recall = 0.55
Specificity = 0.21
