### Naive Bayes/Logistic Model on Bag of Words

#### Reading the Data

In [1]:
import pandas as pd
df=pd.read_excel("All_Spam_data.xlsx")

In [2]:
df.shape

(10000, 10)

In [3]:
df=df[:1500]

In [4]:
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,Spam
0,B0015RB39O,"[0, 0]",5.0,This product works pretty well for me. I don't...,"08 15, 2012",AQDA17WK201Q1,Taylor M.,Work as advertise,1344989000.0,1
1,B0015RB39O,"[0, 0]",1.0,Worked for 3 days then never worked again. Was...,"04 28, 2012",A1V5XDKBWSHHX5,T. Chang,Would not recommend.,1335571000.0,1
2,B001630QZE,"[17, 34]",1.0,always been a proponent of Bluetooth headset ...,"10 14, 2011",A1P0OVX2637TYW,terry,WAY overpriced. Math simply doesn't work out f...,1318550000.0,1
3,B001630QZE,"[0, 0]",4.0,"Well they say you get what you pay for, and th...","01 2, 2013",AT6701B61F2X1,The Beast,"Super clear audio, very disceet design",1357085000.0,1
4,B001630QZE,"[17, 20]",5.0,"OK, I splurged. I usually don't spend this muc...","06 15, 2013",A24QFMD1RXLJMB,thelastpiece,"Fantastic, but pricey",1371254000.0,1


In [50]:
from sklearn.utils import shuffle

df = shuffle(df)#Shuffing the data

In [51]:
df=df.reset_index()#Resetting the index

In [52]:
df=df.drop(['index'],axis=1)#Dropping the index

In [53]:
df1=df[['reviewText','Spam']]

In [54]:
df1.head()

Unnamed: 0,reviewText,Spam
0,I bought this to fit inside a kit that creates...,0
1,I love the way it can charge 2 devices at once...,0
2,I read the first few reviews and didn't read a...,0
3,I had to replace the battery on my phone cause...,0
4,i have 6 otter box covers in my house. fits we...,1


In [55]:
df1['Spam'].value_counts()

1    785
0    715
Name: Spam, dtype: int64

In [56]:
df1.isnull().sum(), df1.shape#Checking the Missing Values

(reviewText    1
 Spam          0
 dtype: int64, (1500, 2))

In [57]:
df1 = df1.dropna()#Dropping the missing Values

### Document Term Matrix

In [58]:
import pandas as pd
import re
import nltk

In [59]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
stemmer = PorterStemmer()
def tokenize(text):
    text = stemmer.stem(text)               #stemming
    text = re.sub(r'\W+|\d+|_', ' ', text)    #removing numbers and punctuations and Underscores
    tokens = nltk.word_tokenize(text)       #tokenizing
    return tokens
countvec = CountVectorizer(min_df= 5, tokenizer=tokenize, stop_words=stopwords.words('english'))
dtm = pd.DataFrame(countvec.fit_transform(df1['reviewText']).toarray(), columns=countvec.get_feature_names(), index=None)

In [61]:
#Adding label Column
dtm['Spam'] = df['Spam']
dtm.head()

Unnamed: 0,aa,ability,able,absolutely,absorbing,abuse,ac,accent,accept,acceptable,...,x,xoom,yeah,year,years,yes,yet,yubi,zero,Spam
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [62]:
dtm.shape

(1499, 2531)

In [63]:
X=dtm.drop(['Spam'],axis=1)

In [64]:
y=dtm['Spam']

In [65]:
import numpy as np

In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.25)

In [67]:
y_test.value_counts()

1    196
0    179
Name: Spam, dtype: int64

### Naive Bayes

In [68]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

In [69]:
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [70]:
clf.score(X_test,y_test)

0.5333333333333333

### Logistic Regression Model

In [71]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [72]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = clf_log.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

In [73]:
roc_auc

0.4900524455592293

In [74]:
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

<Figure size 640x480 with 1 Axes>