In [1]:
import numpy as np
from collections import Counter
import pandas as pd
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier


In [2]:
data_path = 'data/SMSSpamCollection'
df = pd.read_table(data_path,header=None)

In [3]:
df.columns = ['spam','message']
df.head()

Unnamed: 0,spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [5]:
#removables from text
stopwords_set=set(stopwords)
punctuation_set=set(string.punctuation)


In [6]:
#Remove punctuation and stopwords
df['cleaned_msg'] = df.message.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords_set
                                                        and word not in punctuation_set]))
df.head()

Unnamed: 0,spam,message,cleaned_msg
0,ham,"Go until jurong point, crazy.. Available only ...","Go jurong point, crazy.. Available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,U dun say early hor... U c already say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","Nah I think goes usf, lives around though"


In [7]:
#Logistic Regression Model with count vectorizer


In [8]:
count_vect = CountVectorizer()

In [9]:
# Transform text into tokens
X = count_vect.fit_transform(df.cleaned_msg)

In [10]:
y = df.spam

In [11]:
print(X.shape,
      y.shape)

(5572, 8703) (5572,)


In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [13]:
lg = LogisticRegression()

In [14]:
lg.fit(X_train,y_train)
y_pred =lg.predict(X_test)

#test score
lg.score(X_test,y_test)

0.9849246231155779

In [15]:
lg_cv_mat = confusion_matrix(y_test,y_pred)
lg_cv_mat

array([[1219,    1],
       [  20,  153]], dtype=int64)

In [16]:
#Logistic regressor with tfidfvectorizer


In [17]:
tfidf = TfidfVectorizer()

In [18]:
X = tfidf.fit_transform(df.cleaned_msg)
y = df.spam
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [19]:
lg = LogisticRegression()
lg.fit(X_train,y_train)
y_pred =lg.predict(X_test)

#test score
lg.score(X_test,y_test)

0.9569274946159368

In [20]:
lg_tfidf_mat = confusion_matrix(y_test,y_pred)
lg_tfidf_mat

array([[1204,    1],
       [  59,  129]], dtype=int64)

In [21]:
# Random Forest modek with count vectorization


In [22]:
count_vect = CountVectorizer()

In [23]:
# Transform text into tokens
X = count_vect.fit_transform(df.cleaned_msg)
y = df.spam

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [25]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
rf.score(X_test,y_test)

0.9798994974874372

In [26]:
rf_cv_mat = confusion_matrix(y_test,y_pred)
rf_cv_mat

array([[1200,    0],
       [  28,  165]], dtype=int64)

In [27]:
#Random Forest Model with tfidfvectorizer


In [28]:
tfidf = TfidfVectorizer()

In [29]:
X = tfidf.fit_transform(df.cleaned_msg)
y = df.spam
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [30]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
rf.score(X_test,y_test)

0.9755922469490309

In [31]:
rf_tfidf_mat = confusion_matrix(y_test,y_pred)
rf_tfidf_mat

array([[1211,    0],
       [  34,  148]], dtype=int64)