# Spam Detection with Natural Language Processing using NLTK 

In [69]:
import numpy as np   
import pandas as pd 
import nltk

import chardet
with open("spam.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}


## 1) Loading Data Set 

In [70]:
dt = pd.read_csv('spam.csv' , encoding = 'Windows-1252')  
dt.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [71]:
dt = pd.DataFrame(dt)
dt.rename(columns={"v1": 'type', "v2": 'text'}, inplace = True)
dt.head(10)

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [72]:
dt.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace = True)

In [73]:
dt['spam'] = dt['type'].map( {'spam': 1, 'ham': 0} ).astype(int)
dt.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [74]:
print("COLUMS IN THE GIVEN DATA:")
for col in dt.columns: 
    print(col) 

COLUMS IN THE GIVEN DATA:
type
text
spam


In [75]:
t=len(dt['type'])
print("NO OF ROWS IN REVIEW COLUMN:",t)
t=len(dt['text'])
print("NO OF ROWS IN liked COLUMN:",t)

NO OF ROWS IN REVIEW COLUMN: 5572
NO OF ROWS IN liked COLUMN: 5572


## 2) Tokenization

In [76]:
#before
dt['text'][1]   

'Ok lar... Joking wif u oni...'

In [77]:
def tokenizer(text):
    return text.split()

In [78]:
dt['text']=dt['text'].apply(tokenizer)

In [79]:
#after
dt['text'][1] 

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [80]:
#before
dt['text'][1] 

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

## 3) Stemming

In [81]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english", ignore_stopwords=False)

In [82]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [83]:
dt['text']=dt['text'].apply(stem_it)

In [84]:
#after stemming
dt['text'][1]  

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

## 4) Lemmatization

In [85]:
#before
dt['text'][100] 

['okay',
 'name',
 'ur',
 'price',
 'as',
 'long',
 'as',
 'it',
 'legal!',
 'wen',
 'can',
 'i',
 'pick',
 'them',
 'up?',
 'y',
 'u',
 'ave',
 'x',
 'am',
 'xx']

In [86]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

In [87]:
def lemmit_it(text):
    return [lemmatizer.lemmatize(word, pos ="a") for word in text]

In [88]:
dt['text']=dt['text'].apply(lemmit_it)

In [89]:
#before
dt['text'][100] 

['okay',
 'name',
 'ur',
 'price',
 'as',
 'long',
 'as',
 'it',
 'legal!',
 'wen',
 'can',
 'i',
 'pick',
 'them',
 'up?',
 'y',
 'u',
 'ave',
 'x',
 'am',
 'xx']

## 5) Stopword Removal

In [90]:
#before 
dt['text'][100] 

['okay',
 'name',
 'ur',
 'price',
 'as',
 'long',
 'as',
 'it',
 'legal!',
 'wen',
 'can',
 'i',
 'pick',
 'them',
 'up?',
 'y',
 'u',
 'ave',
 'x',
 'am',
 'xx']

In [91]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [92]:
def stop_it(text):
    review = [word for word in text if not word in stop_words ] 
    return review

In [93]:
dt['text']=dt['text'].apply(stop_it)

In [94]:
#after 
dt['text'][100] 

['okay',
 'name',
 'ur',
 'price',
 'long',
 'legal!',
 'wen',
 'pick',
 'up?',
 'u',
 'ave',
 'x',
 'xx']

In [95]:
dt.head(100)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
...,...,...,...
95,spam,"[free, rington, wait, collected., simpli, text...",1
96,ham,"[watch, telugu, movie..wat, abt, u?]",0
97,ham,"[see., finish, load, loan, pay]",0
98,ham,"[hi., wk, ok, -, hol, now!, yes, bit, run., fo...",0


In [96]:
dt['text']=dt['text'].apply(' '.join)

In [97]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


##  6) Transform Text Data into TDF /TF-IDF Vectors   

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
y=dt.spam.values
x=tfidf.fit_transform(dt['text'])

In [99]:
from sklearn.model_selection import train_test_split
x_train,x_text,y_train,y_text=train_test_split(x,y,random_state=1,test_size=0.2,shuffle=False)

## 7) Applying Classifiers and Evaluating 

### `Logistic Regression`

In [100]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_text)
from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred, y_text)*100
print("accuracy:",acc_log )

accuracy: 96.05381165919282


### `LinearSVC Accuracy` 

In [101]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_text)
acc_linear_svc =accuracy_score(y_pred, y_text) * 100
print("accuracy:",acc_linear_svc)

accuracy: 97.66816143497758
