In [109]:
import pandas as pd
import numpy as np

# 1) Loading Data Set

In [111]:
dt = pd.read_csv("spam.csv")
dt.head(10)

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [112]:
dt['spam'] = dt['type'].map({'spam':1,'ham':0}).astype(int)
dt.head(5)

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0
1,ham,Ok lar... Joking wif u oni...,,,,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,ham,U dun say so early hor... U c already then say...,,,,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0


In [113]:
print("Columns in Given Data:")
for col in dt.columns:
    print(col)

Columns in Given Data:
type
text
Unnamed: 2
Unnamed: 3
Unnamed: 4
spam


In [114]:
t = len(dt['type'])
print("NO OF ROWS IN REVIEW COLUMN:",t)
t = len(dt['text'])
print("NO OF ROWS IN LIKED COLUMN:",t)

NO OF ROWS IN REVIEW COLUMN: 5572
NO OF ROWS IN LIKED COLUMN: 5572


In [115]:
dt.head()

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0
1,ham,Ok lar... Joking wif u oni...,,,,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,ham,U dun say so early hor... U c already then say...,,,,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0


# 2)Tokenization

In [116]:
dt['text'][1]

'Ok lar... Joking wif u oni...'

In [117]:
def tokenizer(text):
    return text.split()

In [118]:
dt['text']=dt['text'].apply(tokenizer)

In [119]:
dt['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

# 3)STEMMING

In [120]:
dt['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [121]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english",ignore_stopwords=False)

In [122]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [123]:
dt['text']=dt['text'].apply(stem_it)

In [124]:
dt['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

# 4)LEMMITIZATION

In [125]:
dt['text'][11]

['six',
 'chanc',
 'to',
 'win',
 'cash!',
 'from',
 '100',
 'to',
 '20,000',
 'pound',
 'txt>',
 'csh11',
 'and',
 'send',
 'to',
 '87575.',
 'cost',
 '150p/day,',
 '6days,',
 '16+',
 'tsandc',
 'appli',
 'repli',
 'hl',
 '4',
 'info']

In [126]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [127]:
def lemmit_it(text):
    return [lemmatizer.lemmatize(word, pos ="a") for word in text]

In [128]:
dt['text']=dt['text'].apply(lemmit_it)

# 5)STOPWORD REMOVAL

In [129]:
dt['text'][111]

['go', 'for', 'dinner.msg', 'you', 'after.']

In [130]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
def stop_it(text):
    review = [word for word in text if not word in stop_words]
    return review

In [131]:
dt['text']=dt['text'].apply(stop_it)

In [132]:
dt['text'][111]

['go', 'dinner.msg', 'after.']

In [133]:
dt.head()

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",,,,0
1,ham,"[ok, lar..., joke, wif, u, oni...]",,,,0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",,,,1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",,,,0
4,ham,"[nah, think, goe, usf,, live, around, though]",,,,0


In [134]:
dt['text'] = dt['text'].apply(' '.join)

In [135]:
dt.head()

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",,,,0
1,ham,ok lar... joke wif u oni...,,,,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,,,,1
3,ham,u dun say earli hor... u c alreadi say...,,,,0
4,ham,"nah think goe usf, live around though",,,,0


# 6)Transform Text Data into TDF/TF-IDF Vectors

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
y = dt.spam.values
x = tfidf.fit_transform(dt['text'])

In [137]:
from sklearn.model_selection import train_test_split
x_train,x_text,y_train,y_text=train_test_split(x,y,random_state=1,test_size=0.2,shuffle=False)

# 7) Classification using Logistic Regression

In [138]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train,y_train)
y_predict = clf.predict(x_text)
from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_predict,y_text)*100
print("Accuracy:",acc_log)

Accuracy: 96.05381165919282


# 8) Classification using LinearSVC Accuracy

In [139]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train,y_train)
y_pred = linear_svc.predict(x_text)
acc_linear_svc = accuracy_score(y_pred,y_text)*100
print("Accuracy:",acc_linear_svc)

Accuracy: 97.75784753363229
