# Challenge: Iterate and evaluate your classifier

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

In [55]:
amazon_cells=pd.read_csv('amazon_cells_labelled.txt', delimiter= '\t', header=None)
amazon_cells.columns = ['message','spam']

In [57]:
key_words=['!','recommend','click', 'offer']

for i in key_words:
    amazon_cells[str(i)]=amazon_cells.message.str.contains(str(i),
        case=False)


In [42]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

target=amazon_cells['spam']

variables=amazon_cells[key_words]

bnb.fit(variables,target)

y_pred = bnb.predict(variables)

In [43]:
print(target.shape)
print(y_pred.shape)

(1000,)
(1000,)


### Model accuracy Analysis

In [44]:
from sklearn import metrics

metrics.accuracy_score(target, y_pred)

0.552

This model is predcting the spam messages at 55.2% accuracy. So not very good. 

### Confusion Matrix Analysis

In [45]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[457,  43],
       [405,  95]])

43 of the values were false potives ( identified as spam while they are not)

405 of the values were false negative (identified as ham while they are not)

457 of the values were correctly identified as negatives from the total messages that are hams

95 of the values were correctly identified as positives from the total messages that are spams

## Holdout groups validation

In [53]:
from sklearn.model_selection import train_test_split
#Use train_test_split to create the necessary training and test groups

X_train, X_test, y_train, y_test = train_test_split(variables, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))

With 20% Holdout: 0.52


## Cross validation scores

In [50]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, variables, target, cv=10)

array([0.6 , 0.57, 0.56, 0.51, 0.54, 0.59, 0.54, 0.54, 0.52, 0.54])

Since the cross validation scores are consistant we can say that this model fits the data well

# Create more features and test the model

In [46]:
key_words1=['winner','buy', 'free', 'cash']

for i in key_words1:
    amazon_cells[str(i)]=amazon_cells.message.str.contains(str(i),
        case=False)

In [47]:
new_variables = amazon_cells[key_words1]

y_pred = bnb.predict(new_variables)

In [40]:
metrics.accuracy_score(target, y_pred)

0.553

Since the accuracy of the prediction increased our model is not overfitting