In [1]:
import numpy as np
import pandas as pd
import nltk


# step-1:load the data set

In [2]:
data=pd.read_csv("SPAM-210331-134237.csv")

# step 2:analyse the data 

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    116 non-null    object
 1   text    116 non-null    object
dtypes: object(2)
memory usage: 1.9+ KB


In [4]:
data.head(5)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data=pd.DataFrame(data)

In [6]:
data

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
111,ham,What is the plural of the noun research?
112,ham,Going for dinner.msg you after.
113,ham,I'm ok wif it cos i like 2 try new things. But...
114,spam,GENT! We are trying to contact you. Last weeke...


In [7]:
data['spam']=data['type'].map({'spam':1,'ham':0}).astype(int)
data.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
data['text'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

# step-3:Tokenise the data

In [9]:
for i in data.index:
    data['text'][i]=data['text'][i].split()
    


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'][i]=data['text'][i].split()


In [10]:
data['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

# step-4:stemming

In [11]:
from nltk.stem.snowball import SnowballStemmer
porter=SnowballStemmer("english",ignore_stopwords=False)

In [12]:
def stem(text):
    return [porter.stem(word) for word in text]

In [13]:
data['text']=data['text'].apply(stem)

In [14]:
data['text'][0]

['go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'avail',
 'onli',
 'in',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet...',
 'cine',
 'there',
 'got',
 'amor',
 'wat...']

# step-5:lemmitization

In [15]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mulla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def lemmit_it(text):
    return [lemmatizer.lemmatize(word,pos="a") for word in text]

In [17]:
data['text']=data['text'].apply(lemmit_it)

In [18]:
data['text'][100]

['pleas',
 "don't",
 'text',
 'me',
 'anymore.',
 'i',
 'have',
 'noth',
 'els',
 'to',
 'say.']

In [19]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mulla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
def stop_it(text):
    review=[word for word in text if not word in stop_words]
    return review

In [21]:
data['text']=data['text'].apply(stop_it)

In [22]:
data['text'][100]

['pleas', 'text', 'anymore.', 'noth', 'els', 'say.']

In [23]:
data.head(10)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [24]:
data['text']=data['text'].apply(' '.join)

In [25]:
data.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


# step-6:vectorisiration

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
y=data.spam.values
x=tfidf.fit_transform(data['text'])

In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.2,shuffle=False)

# step-7:classification using Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
from sklearn.metrics import accuracy_score
acc_log=accuracy_score(y_pred,y_test)*100
acc_log

87.5

In [32]:
from sklearn.svm import LinearSVC
linear_svc=LinearSVC(random_state=0)
linear_svc.fit(x_train,y_train)
y_pred=linear_svc.predict(x_test)
acc_linear=accuracy_score(y_pred,y_test)*100
acc_linear

87.5