In [189]:
import numpy as np
from scipy.stats import norm, t, ttest_1samp, ttest_ind, poisson
import pandas as pd
import csv
import random

In [190]:
# Load the labelled message dataset (v1 = ham/spam label, v2 = message text).
# An explicit encoding is passed because, per the original author's note, the
# file could not be read in without one.
df = pd.read_csv('spam.csv', encoding = 'latin-1') #have to encode the file to get it in

In [191]:
# Preview the first five rows: v1 holds the ham/spam label, v2 the message text.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [192]:
# Keep only the label (v1) and message text (v2) columns. Selecting them by
# name, instead of dropping every column that contains a null, means a single
# missing value in v1/v2 can never silently remove the column itself.
df = df[['v1', 'v2']]

In [193]:
#Let's imagine these are the messages in our inbox. Our (alternative) hypothesis
#is that the proportion of spam in our inbox is greater than 12.5%; the null
#hypothesis is that it is 12.5% (or less).
#The significance level (alpha) is 0.025.
#Do we reject or fail to reject the null hypothesis? What is the p-value?

In [194]:
#NOTE(review): leftover scratch line - `grade` is not defined anywhere in the visible notebook: ttest_1samp(grade, 70)

In [195]:
# Split the messages by label so each class can be inspected on its own.
spam = df.loc[df['v1'].eq('spam')]
ham = df.loc[df['v1'].eq('ham')]

In [196]:
# (rows, columns) of the spam subset; the row count is the number of spam messages.
spam.shape

(747, 2)

In [197]:
# (rows, columns) of the ham subset; the row count is the number of ham messages.
ham.shape

(4825, 2)

In [198]:
#Part 2 - set up a one-sided test of the spam proportion against 12.5%
hyp_mean, alpha = 0.125, 0.025   # hypothesised proportion and significance level
N = df.shape[0]                  # number of messages
dof = N - 1                      # degrees of freedom for the t distribution

# Observed share of spam, and the standard error of a proportion under the
# hypothesised value: sqrt(p0 * (1 - p0)) / sqrt(N), computed in two steps.
x_bar = int(df['v1'].eq('spam').sum()) / N
x_std = np.sqrt(hyp_mean * (1 - hyp_mean))
x_stderr = x_std / np.sqrt(N)



In [199]:
# Sampling distribution of the proportion under the null: a t distribution
# centred on the hypothesised value and scaled by the standard error.
T_dist = t(dof, loc=hyp_mean, scale=x_stderr)

In [200]:
# Upper-tail p-value: probability, under the null, of a spam share at least as
# large as the observed x_bar. sf() is the survival function (1 - cdf)
# evaluated directly, which avoids the rounding error of subtracting a value
# close to 1 from 1. Reject the null when pval < alpha.
pval = T_dist.sf(x_bar)
pval

0.02041989487069895

In [201]:
#Part 3 - Poisson

In [202]:
# Poisson model with a mean of 26.678 emails a day.
# NOTE(review): the origin of 26.678 is not shown in this notebook.
rv = poisson(mu=26.678)

In [203]:
rv.cdf(29) # P(X <= 29): cumulative probability up to 29, i.e. fewer than 30 emails

0.7153152645111905

In [204]:
1 - rv.cdf(29) # complement of the CDF at 29: P(X > 29) = P(X >= 30), i.e. 30 or more emails

0.28468473548880946

In [205]:
#Part 4: Human Detector

In [206]:
# Pick one message uniformly at random for the "human detector" exercise.
# randrange(len(df)) yields 0 .. len(df)-1, matching the default row labels.
# randint(1, N) is inclusive at both ends, so it could return N (not a valid
# label -> KeyError) and could never select row 0.
df.loc[random.randrange(len(df)), ['v2']]

v2    God bless.get good sleep my dear...i will pray!
Name: 1805, dtype: object

In [207]:
# Look up a single row by its index label; this shows both the label (v1) and the text (v2).
# NOTE(review): pandas may shorten long strings when displaying a Series - if the
# goal is the complete text, df.loc[3100, 'v2'] returns the raw string. Confirm.
df.loc[3100] #how to print full email

v1                             ham
v2    Pathaya enketa maraikara pa'
Name: 3100, dtype: object

In [208]:
#Part 5 - You will be creating a Naive Bayes Text Classification Model to determine if an incoming message is spam or not.
#What is the baseline accuracy? Accuracy from human model (85%)
#What is the accuracy from your trained model?
#How many Type 1 and Type 2 errors occurred?
#What performed better, the Human or NB model?

In [209]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# Text-to-feature steps: token counts first, then TF-IDF re-weighting of those counts.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [210]:
# Vectorise every message, then convert the counts to TF-IDF weights.
# NOTE(review): both steps are fitted on the full dataset before the
# train/test split, so the vocabulary and IDF weights have already seen the
# test messages - consider fitting on the training rows only.
counts = count_vect.fit_transform(df['v2']) 
tfidfs = tfidf_transformer.fit_transform(counts)

In [211]:
# Hold out 33% of the messages for evaluation (fixed seed for repeatability),
# fit a multinomial Naive Bayes model on the rest, and predict the held-out set.
X_train, X_test, y_train, y_test = train_test_split(
    tfidfs,
    df['v1'],
    test_size=0.33,
    random_state=42,
)
nb = MultinomialNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

In [212]:
# Convert the held-out labels to a plain array so they can be indexed by
# position (0, 1, 2, ...) alongside `predictions`.
y_true = np.array(y_test)

In [230]:
def NB_Model_Accuracy(predictions, y_true, positive_label='spam'):
    """Score predicted labels against the true labels.

    Parameters
    ----------
    predictions : sequence
        Labels produced by the model.
    y_true : sequence
        Actual labels, in the same order as ``predictions``.
    positive_label : optional
        Label treated as the positive class (default ``'spam'``). A wrong
        prediction equal to this label is counted as a false positive; any
        other wrong prediction is counted as a false negative.

    Returns
    -------
    list
        ``[[accuracy, false_neg, false_pos]]`` - a list holding one row.
        ``accuracy`` is ``float('nan')`` when there are no predictions.

    Raises
    ------
    ValueError
        If ``predictions`` and ``y_true`` differ in length (otherwise a
        shorter ``predictions`` would silently score only part of ``y_true``).
    """
    if len(predictions) != len(y_true):
        raise ValueError(
            "predictions and y_true must have the same length "
            "(got %d and %d)" % (len(predictions), len(y_true))
        )

    accurate = false_pos = false_neg = 0
    for predicted, actual in zip(predictions, y_true):
        if predicted == actual:
            accurate += 1
        elif predicted == positive_label:
            false_pos += 1
        else:
            false_neg += 1

    total = accurate + false_pos + false_neg
    # Accuracy is undefined for an empty input; report NaN instead of dividing by zero.
    accuracy = accurate / total if total else float('nan')
    return [[accuracy, false_neg, false_pos]]

In [233]:
# Score the held-out predictions; the single row reads [accuracy, false_neg, false_pos].
r = NB_Model_Accuracy(predictions, y_true)
r

[[0.953779227841218, 85, 0]]

In [234]:
#Part VI 

In [236]:
# Classify every message in the dataset with the trained model.
# NOTE(review): `tfidfs` includes the rows the model was trained on, so
# agreement with the true labels here is not an out-of-sample measure.
new_predictions = nb.predict(tfidfs)
new_predictions

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [239]:
# True labels for the whole dataset as a positional array, in the same row order as df.
new_y_true = np.array(df['v1'])
new_y_true

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [241]:
# Score the full-dataset predictions; the row reads [accuracy, false_neg, false_pos].
NB_Model_Accuracy(new_predictions, new_y_true)

[[0.9603374012921752, 221, 0]]

In [243]:
# Number of messages the model labelled as spam. count_nonzero counts the True
# entries of the boolean mask in one vectorised call instead of iterating the
# array element by element with the builtin sum().
np.count_nonzero(new_predictions == 'spam')

526

In [250]:
#Part VI - repeat the Part 2 set-up, this time using the model's predicted labels
hyp_mean = 0.125   # hypothesised spam proportion
alpha = 0.025      # significance level
N = len(new_predictions)

# Share of messages predicted as spam; count_nonzero counts the True entries of
# the mask in one vectorised call rather than a Python-level sum().
x_bar = np.count_nonzero(new_predictions == 'spam') / N
# Standard error of a proportion under the hypothesised value.
x_std = np.sqrt(hyp_mean * (1 - hyp_mean))
dof = N - 1
x_stderr = x_std / np.sqrt(N)

In [255]:
#T Distribution: upper-tail p-value for the predicted spam share
# The null distribution is centred on the hypothesised proportion and scaled by
# the standard error; sf() gives the upper-tail probability (1 - cdf) directly.
T_dist = t(dof, loc=hyp_mean, scale=x_stderr)
pval = T_dist.sf(x_bar)
pval

0.9999999999972398