In [1]:
#Using the data that was scraped from the previous notebook,
#instead of using spaCy, we will analyse the data using TFIDF Vectorizer
#and run our model via NaiveBayes and Logistic Regression to compare the models

In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
final = pd.read_csv('../Datasets/final.csv')

In [5]:
final.shape

(1987, 2)

In [6]:
#1 = Kroger, 0 = Publix
final['target'] = final['target'].map(lambda x: 1 if x == 'Kroger' else 0)

In [7]:
tvec = TfidfVectorizer(stop_words='english')
tvec.fit(final.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
from sklearn.preprocessing import StandardScaler
X = pd.DataFrame(tvec.transform(final.text).todense(),
                   columns=tvec.get_feature_names())
y = final.target

ss = StandardScaler()
Xs = ss.fit_transform(X)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [10]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

In [11]:
model = nb.fit(X_train,y_train)

In [12]:
predictions = model.predict(X_test)

In [13]:
# Score our model on the training set.
model.score(X_train, y_train)
#could it be overfitting?

0.9563758389261745

In [14]:
model.score(X_test, y_test)

0.6277665995975855

In [15]:
# Import the confusion matrix function.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

array([[179,  70],
       [115, 133]])

In [16]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [17]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("")

#Much more balanced in terms of prediction
print("Accuracy =","{0:.2f}".format((tp + tn)/(tn+fp+fn+tp)))
#Slightly better prediction than using spaCy

print("Recall =","{0:.2f}".format(tp/(fn+tp)))
#correctly predicted Kroger 54% of the time
print("Specificity =","{0:.2f}".format(tn/(tn+fp)))
#correctly predicted Publix 72% of the time

True Negatives: 179
False Positives: 70
False Negatives: 115
True Positives: 133

Accuracy = 0.63
Recall = 0.54
Specificity = 0.72


In [18]:
#building a secondary model to see if there is a difference in scores

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [19]:
lr_scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=10)
print(np.mean(lr_scores), final.target.mean())



0.624945945945946 0.49974836436839454


In [20]:
lr_scores = cross_val_score(LogisticRegression(), X_test, y_test, cv=10)
print(np.mean(lr_scores), final.target.mean())

#Using Logistic Regression results in a similar result compared to tfidf



0.6021394557823129 0.49974836436839454


In [21]:
lr = LogisticRegression()
model2 = lr.fit(X_train,y_train)
predictions2 = model2.predict(X_test)
confusion_matrix(y_test,predictions2)



array([[167,  82],
       [101, 147]])

In [22]:
tn2, fp2, fn2, tp2 = confusion_matrix(y_test, predictions2).ravel()

In [23]:
print("True Negatives: %s" % tn2)
print("False Positives: %s" % fp2)
print("False Negatives: %s" % fn2)
print("True Positives: %s" % tp2)
print("")

#Most predictions leaning towards Kroger, regardless of accuracy
print("Accuracy =","{0:.2f}".format((tp2 + tn2)/(tn2+fp2+fn2+tp2)))
#6 out of 10 predictions are correct

print("Recall =","{0:.2f}".format(tp2/(fn2+tp2)))
#correctly predicted Kroger about 60% of the time
print("Specificity =","{0:.2f}".format(tn/(tn2+fp2)))
#correctly predicted Publix only 72% of the time, higher than any other model

True Negatives: 167
False Positives: 82
False Negatives: 101
True Positives: 147

Accuracy = 0.63
Recall = 0.59
Specificity = 0.72
