In [1]:
#Using the data that was scraped from the previous notebook,
#instead of using spaCy, we will analyse the data using TFIDF Vectorizer
#and run our model via NaiveBayes and Adaboost to compare the models

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
final = pd.read_csv('../Datasets/final.csv')

In [5]:
final.shape

(1988, 2)

In [6]:
final.head(5)

Unnamed: 0,text,target
0,Welcome to Kroger Join our community Discord s...,Kroger
1,/r/kroger Renovation,Kroger
2,This is fine.,Kroger
3,Thats just a fact,Kroger
4,When you just get to work and your boss immedi...,Kroger


## First we change our target to binary

In [7]:
#1 = Kroger, 0 = Publix
final['target'] = final['target'].map(lambda x: 1 if x == 'Kroger' else 0)

## Then we will start by starting some features

In [8]:
#Adding a few new features: 
# 1. word count
# 2. number of characters
# 3. average word count
# 4. stop word count
# 5. number of hastags
# 6. number of numerics
# 7. number of upper cased words


final['word_count'] = final['text'].apply(lambda x: len(str(x).split(" ")))
final['char_count'] = final['text'].str.len()

def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))
final['avg_word'] = final['text'].apply(lambda x: avg_word(x))

from nltk.corpus import stopwords
stop = stopwords.words('english')
final['stopwords'] = final['text'].apply(lambda x: len([x for x in x.split() if x in stop]))

final['hastags'] = final['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
final['numerics'] = final['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
final['upper'] = final['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))

### Next we will preprocess the words accordingly

In [9]:
#Part two: Preprocessing the words
#1. change all to lower case
#2. remove punctuation 
#3. remove stop words

final['text'] = final['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
final['text'] = final['text'].str.replace('[^\w\s/r/!.#$@%*)(]','')
final['text'] = final['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [10]:
#Now to remove frequently appeared words since they won't help us in our analysis

freq = pd.Series(' '.join(final['text']).split()).value_counts()[:10]
freq

freq = list(freq.index)
final['text'] = final['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [11]:
#And also to remove rare words since they are just noise
freq_low = pd.Series(' '.join(final['text']).split()).value_counts()[-10:]
freq_low

freq_low = list(freq_low.index)
final['text'] = final['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq_low))

In [12]:
#Next it is to stem our words

from nltk.stem import PorterStemmer
st = PorterStemmer()
final['text'] = final['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

## We will then proceed to extract features using NLP techniques with TFIDF vectorizer

In [13]:
#, max_features=1000
tvec = TfidfVectorizer(stop_words='english')
tvec_ft = tvec.fit_transform(final.text)

In [14]:
from sklearn.preprocessing import StandardScaler
X1 = pd.DataFrame(tvec_ft.todense(),
                   columns=tvec.get_feature_names())
X2 = final.iloc[:,2:]

X = pd.concat([X1,X2] ,axis=1)
y = final.target

ss = StandardScaler()
Xs = ss.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=40,
                                                    stratify=y)

## We will start with our first model: Gaussian NB

In [16]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

In [17]:
model_nb = nb.fit(X_train,y_train)

In [18]:
# Score our model on the training set.
model_nb.score(X_train, y_train)
#could it be overfitting?

0.9336016096579477

In [19]:
from sklearn.model_selection import cross_val_score
model_nb_score = cross_val_score(model_nb, X_train, y_train, cv=10).mean()
model_nb_score

0.5893936150916017

In [20]:
#prediction score
model_nb.score(X_test, y_test)

0.5875251509054326

In [21]:
predictions = model_nb.predict(X_test)

In [22]:
# Import the confusion matrix function.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

array([[135, 114],
       [ 91, 157]])

In [23]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [24]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("")

#Much more balanced in terms of overall prediction
print("Accuracy =","{0:.2f}".format((tp + tn)/(tn+fp+fn+tp)))
#59% overall accuracy

print("Recall =","{0:.2f}".format(tp/(fn+tp)))
#correctly predicted Kroger 63% of the time
print("Specificity =","{0:.2f}".format(tn/(tn+fp)))
#correctly predicted Publix 54% of the time

True Negatives: 135
False Positives: 114
False Negatives: 91
True Positives: 157

Accuracy = 0.59
Recall = 0.63
Specificity = 0.54


## followed by a 2nd model, AdaBoostClassifier

In [25]:
#building a secondary model - AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

abc = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)
model = abc.fit(X_train, y_train)  

cross_val_score(abc, X, y, cv=5).mean()

0.5744490715542447

## Warning: the following codes takes about an hour to run
The params code was intentionally # so that it won't run

In [42]:
from sklearn.model_selection import GridSearchCV
#params = {
    'learning_rate':[0.01,0.1,1],
    'n_estimators':[1000,1200,1800,2000,2300,2494]
}

abc = AdaBoostClassifier()
abc_opti = GridSearchCV(abc, params, cv=5, verbose=100)
model_abc_opti = abc_opti.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] learning_rate=0.01, n_estimators=1000 ...........................
[CV]  learning_rate=0.01, n_estimators=1000, score=0.5953177257525084, total=  21.1s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.8s remaining:    0.0s
[CV] learning_rate=0.01, n_estimators=1000 ...........................
[CV]  learning_rate=0.01, n_estimators=1000, score=0.5536912751677853, total=  21.3s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   44.0s remaining:    0.0s
[CV] learning_rate=0.01, n_estimators=1000 ...........................
[CV]  learning_rate=0.01, n_estimators=1000, score=0.6174496644295302, total=  22.1s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.1min remaining:    0.0s
[CV] learning_rate=0.01, n_estimators=1000 ...........................
[CV]  learning_rate=0.01, n_estimators=1000, score=0.5604026845637584, total=  

[CV]  learning_rate=0.1, n_estimators=1000, score=0.6208053691275168, total=  20.4s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed: 21.7min remaining:    0.0s
[CV] learning_rate=0.1, n_estimators=1200 ............................
[CV]  learning_rate=0.1, n_estimators=1200, score=0.6187290969899666, total=  24.5s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 22.1min remaining:    0.0s
[CV] learning_rate=0.1, n_estimators=1200 ............................
[CV]  learning_rate=0.1, n_estimators=1200, score=0.6073825503355704, total=  24.6s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed: 22.6min remaining:    0.0s
[CV] learning_rate=0.1, n_estimators=1200 ............................
[CV]  learning_rate=0.1, n_estimators=1200, score=0.6208053691275168, total=  24.5s
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed: 23.0min remaining:    0.0s
[CV] learning_rate=0.1, n_estimators=1200 ............................
[CV]  learning_rate=0.1, n_estimators=1200, score=0.59060402

[CV]  learning_rate=1, n_estimators=1200, score=0.5604026845637584, total=  24.5s
[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed: 42.7min remaining:    0.0s
[CV] learning_rate=1, n_estimators=1800 ..............................
[CV]  learning_rate=1, n_estimators=1800, score=0.5886287625418061, total=  36.7s
[Parallel(n_jobs=1)]: Done  71 out of  71 | elapsed: 43.4min remaining:    0.0s
[CV] learning_rate=1, n_estimators=1800 ..............................
[CV]  learning_rate=1, n_estimators=1800, score=0.5906040268456376, total=  36.8s
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 44.0min remaining:    0.0s
[CV] learning_rate=1, n_estimators=1800 ..............................
[CV]  learning_rate=1, n_estimators=1800, score=0.5671140939597316, total=  36.6s
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed: 44.6min remaining:    0.0s
[CV] learning_rate=1, n_estimators=1800 ..............................
[CV]  learning_rate=1, n_estimators=1800, score=0.6140939597315436, 

In [43]:
abc_opti.best_params_

{'learning_rate': 0.1, 'n_estimators': 1200}

In [44]:
abc_opti.best_score_

0.6096579476861167

In [45]:
prediction_abc = abc_opti.predict(X_test)

In [46]:
confusion_matrix(y_test,prediction_abc)

array([[169,  80],
       [109, 139]])

In [47]:
tn2, fp2, fn2, tp2 = confusion_matrix(y_test, prediction_abc).ravel()

In [48]:
print("True Negatives: %s" % tn2)
print("False Positives: %s" % fp2)
print("False Negatives: %s" % fn2)
print("True Positives: %s" % tp2)
print("")

#Most predictions leaning towards Kroger, regardless of accuracy
print("Accuracy =","{0:.2f}".format((tp2 + tn2)/(tn2+fp2+fn2+tp2)))
#61% accuracy

print("Recall =","{0:.2f}".format(tp2/(fn2+tp2)))
#correctly predicted Kroger about 54% of the time
print("Specificity =","{0:.2f}".format(tn/(tn2+fp2)))
#correctly predicted Publix only 54% of the time

True Negatives: 169
False Positives: 80
False Negatives: 109
True Positives: 139

Accuracy = 0.62
Recall = 0.56
Specificity = 0.54
