# SMS spam filtering

### Load Data

In [1]:
import pandas as pd

In [2]:
# Tab-separated file; because names= is given, pandas reads the first line as data.
data = pd.read_csv('sms.tsv', sep='\t', names=['label', 'Messages'])

In [3]:
data.head()

Unnamed: 0,label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.shape

(5572, 2)

In [5]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell once the labels are already 0/1 is a no-op. A plain
# dict-based .map() would turn every already-encoded label into NaN.
label_codes = {'ham':0,'spam':1}
data['label']=data['label'].map(lambda value: label_codes.get(value, value))

In [6]:
data.head()

Unnamed: 0,label,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data.label.value_counts()

0    4825
1     747
Name: label, dtype: int64

### Clean and prepare data

In [8]:
import re

def clean(x):
    """Normalise a raw message into lowercase words made of ASCII letters.

    Every character that is not an ASCII letter (digits, punctuation,
    markup characters, non-ASCII text) is replaced with a space, runs of
    whitespace are collapsed into a single space, and the result is
    trimmed and lowercased.

    Markup is not parsed: for ``<b>Hi</b>`` only the angle brackets and the
    slash are dropped, so the tag names remain as words (``'b hi b'``).

    Parameters
    ----------
    x : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text; an empty string when ``x`` contains no ASCII
        letters.
    """
    # Raw string literals keep the regex escapes intact; a plain '\s+' literal
    # is an invalid string escape and triggers a warning on recent Pythons.
    s = re.sub(r'[^A-Za-z]', ' ', x)  # anything other than A-Z / a-z becomes a space
    s = re.sub(r'\s+', ' ', s)        # collapse runs of whitespace
    return s.strip().lower()

In [9]:
# Normalise every message with clean() before it is vectorised.
data['Messages'] = data['Messages'].map(clean)

In [10]:
# Cleaned message texts (features) and 0/1 labels (targets).
X, y = data['Messages'].values, data['label'].values

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 25% of the messages for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [17]:
import nltk
# Download the NLTK 'stopwords' corpus that the stop-word list below is built from.
nltk.download('stopwords')
  

[nltk_data] Downloading package stopwords to C:\Users\Abhishek
[nltk_data]     Ranjan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
from nltk.corpus import stopwords

# NOTE: this rebinds `stopwords` from the NLTK corpus reader to the list of
# English stop words; the cells below use the name as that list.
stopwords = stopwords.words('english')

In [None]:
### Remove 'not' from stopword list

In [19]:
# Drop 'not' from the stop-word list so the vectorizer keeps it as a token.
try:
    stopwords.remove('not')
except ValueError:
    # 'not' is already absent (e.g. this cell was run before) - nothing to do.
    pass

### Transform text data into numeric features

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Bag-of-words vectorizer that ignores the stop words in `stopwords`.
cv1 = CountVectorizer(stop_words=stopwords)  

In [23]:
# Learn the vocabulary on the training split only, then encode both splits
# with it. The results are kept as SciPy sparse matrices: MultinomialNB and
# LogisticRegression accept sparse input directly, and densifying a
# (documents x vocabulary) count matrix with .toarray() only allocates
# memory for cells that are almost all zero.
cv_train = cv1.fit_transform(xtrain)

cv_test = cv1.transform(xtest)

In [24]:
#cv1.get_feature_names()

In [25]:
cv_train.shape

(4179, 6489)

In [26]:
cv_test.shape

(1393, 6489)

In [27]:
#cv1.get_feature_names()

### Train model

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,recall_score,precision_score
import numpy as np

### Naive Bayes

In [29]:
# Fit a multinomial Naive Bayes classifier on the training counts and
# report its accuracy on the held-out split.
nb = MultinomialNB().fit(cv_train, ytrain)
test_score = nb.score(cv_test, ytest)
test_score

0.9741564967695621

In [30]:
pred = nb.predict(cv_test)

In [32]:
recall_score(ytest,pred)

0.8928571428571429

In [33]:
precision_score(ytest,pred)

0.9210526315789473

### Logistic Regression

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
# Logistic regression with strong regularisation (small C); the spam class
# (label 1) is weighted three times as heavily via class_weight.
log = LogisticRegression(C=0.01, class_weight={1: 3}).fit(cv_train, ytrain)
test_score = log.score(cv_test, ytest)
test_score

0.9540559942569993

In [36]:
log_pred = log.predict(cv_test)

In [37]:
confusion_matrix(ytest,log_pred)

array([[1182,   15],
       [  49,  147]], dtype=int64)

In [38]:
recall_score(ytest,log_pred)

0.75

In [39]:
precision_score(ytest,log_pred)

0.9074074074074074

In [40]:
import pickle

In [41]:
filename = 'spamcheck'

In [42]:
# Persist the fitted Naive Bayes model. The context manager closes the file
# handle even if pickling fails; a bare open() passed straight to dump() is
# never closed explicitly.
# NOTE: only the classifier is saved. Predicting on new text also needs the
# fitted CountVectorizer `cv1`, which is not written to this file.
with open(filename,'wb') as model_file:
    pickle.dump(nb,model_file)

### Try the model on a new message

In [69]:
test = ['its important pls respond']

In [70]:
# Run each raw message through clean(), the same preprocessing used on the training data.
cleaned_data = [clean(message) for message in test]

In [71]:
cleaned_data

['its important pls respond']

In [72]:
t1 = cv1.transform(cleaned_data)

In [73]:
t1.shape

(1, 6489)

In [74]:
nb.predict(t1)

array([0], dtype=int64)