In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aishvarya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
df = pd.read_csv('spam.csv',encoding='latin1')

In [24]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [25]:
df = df.drop('Unnamed: 2',axis=1)
df = df.drop('Unnamed: 3',axis=1)
df = df.drop('Unnamed: 4',axis=1)

In [26]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
new_names = {'v1':'Class','v2':'MessageDesc'}
df.rename(columns = new_names,inplace=True)

In [28]:
df.head()

Unnamed: 0,Class,MessageDesc
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
df['Class'].unique()

array(['ham', 'spam'], dtype=object)

In [30]:
df['Class'] = df['Class'].replace(['ham', 'spam'],['0','1'])

In [31]:
df.isnull().sum()

Class          0
MessageDesc    0
dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        5572 non-null   object
 1   MessageDesc  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [33]:
df.shape

(5572, 2)

In [34]:
port_stem = PorterStemmer()

In [35]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


In [36]:
df['MessageDesc'] = df['MessageDesc'] .apply(stemming)

In [37]:
df['MessageDesc']

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    nd time tri contact u u pound prize claim easi...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: MessageDesc, Length: 5572, dtype: object

In [38]:
X = df['MessageDesc']
Y = df['Class']

In [39]:
vectorizer = TfidfVectorizer()

In [40]:
X = vectorizer.fit_transform(X)

In [42]:
print(X)

  (0, 5886)	0.19459721085856554
  (0, 188)	0.35227555532712895
  (0, 2185)	0.16514812015268623
  (0, 957)	0.2976038126814356
  (0, 730)	0.33628509567872483
  (0, 2898)	0.28504484907271926
  (0, 6060)	0.23615475543085498
  (0, 2222)	0.19459721085856554
  (0, 732)	0.2976038126814356
  (0, 377)	0.2634906267537017
  (0, 1162)	0.2728131680559813
  (0, 4046)	0.24054119706179236
  (0, 2794)	0.35227555532712895
  (0, 2148)	0.140840528429051
  (1, 3743)	0.5647537939557097
  (1, 5982)	0.4459451111953121
  (1, 2761)	0.47451057922863127
  (1, 2926)	0.4218684931830353
  (1, 3718)	0.2811632882742994
  (2, 260)	0.18734543331896464
  (2, 4313)	0.18099233980499008
  (2, 5629)	0.13714969058149892
  (2, 5072)	0.220381850740506
  (2, 4268)	0.18480736620794727
  (2, 4350)	0.18099233980499008
  :	:
  (5567, 3928)	0.2704984914783844
  (5567, 4313)	0.2859214251331077
  (5568, 1690)	0.6651601234243666
  (5568, 1979)	0.5740011643413078
  (5568, 2431)	0.37453093229027423
  (5568, 2148)	0.29632963790124456
  (556

In [43]:
model = SVC()

In [45]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=2)

In [46]:
model.fit(X_train,y_train)

In [47]:
y_pred = model.predict(X_test)

In [48]:
print("Accuracy Score = ",100*accuracy_score(y_test,y_pred))

Accuracy Score =  96.68161434977578


In [52]:
list = X_test[0]

In [53]:
y_pred1 = model.predict(list)

In [60]:
y_pred1

array(['0'], dtype=object)

In [61]:
def classify(pred):
    if (pred==0):
        print("The message is not a spam")
    
    else:
        print("The message is a spam!!")




In [62]:
classify(y_pred1)

The message is a spam!!
