In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sms-spam-collection-dataset/spam.csv


# Step 1: Reading the data

In [185]:
# Load the SMS spam collection from the Kaggle input directory.
# NOTE(review): latin1 is presumably needed because the file does not decode
# as UTF-8 -- confirm before changing the encoding.
df = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding='latin1',
)

In [186]:
# Preview the first rows to check that the CSV parsed and to see its columns.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Removing unwanted columns

In [187]:
# Drop the three trailing "Unnamed" columns so that only v1 and v2 remain.
UNWANTED_COLUMNS = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
df = df.drop(columns=UNWANTED_COLUMNS)

# Renaming the columns: v1 → Target (label), v2 → mail (message text)

In [188]:
# Give the columns descriptive names: v1 holds the label, v2 the message text.
# The result is reassigned instead of using inplace=True, so the cell is a
# plain "old frame -> new frame" step, consistent with the drop() cell before it.
df = df.rename(columns={'v1': "Target", 'v2': 'mail'})

In [189]:
# Re-inspect the frame after the column drop and rename.
df.head()

Unnamed: 0,Target,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [190]:
# (rows, columns) of the frame at this point.
df.shape

(5572, 2)

# Importing the necessary libraries and the stop-word list

In [191]:
import string

import nltk
from nltk.stem import PorterStemmer

# Fetch the NLTK stop-word corpus (reported as up-to-date when already present).
nltk.download("stopwords")

# Characters that data_cleaning strips out of every token.
tokill = string.punctuation

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [192]:
from nltk.corpus import stopwords

# English stop words, stored as a set: in the visible notebook `sw` is only used
# for the `word not in sw` membership test inside data_cleaning, which runs once
# per token, so a hash lookup replaces a linear scan of the whole list.
sw = set(stopwords.words('english'))

In [193]:
# One shared Porter stemmer instance, used by data_cleaning for every token.
ps = PorterStemmer()

# Implementing the data cleaning function

In [194]:
def data_cleaning(x):
    """Normalize one raw message into a space-separated string of stemmed tokens.

    For each whitespace-delimited token of ``x``:

    1. lower-case it;
    2. drop it if it is in the module-level stop-word collection ``sw``;
    3. delete every character listed in the module-level ``tokill``;
    4. stem what is left with the module-level stemmer ``ps``.

    The stop-word test runs before punctuation is removed, so a token is
    compared with any attached punctuation still in place.

    Tokens that contain nothing but ``tokill`` characters are discarded.
    Previously they were emitted as empty strings, which left doubled,
    leading or trailing spaces in the result.

    Parameters
    ----------
    x : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; ``""`` if none survive.
    """
    # Build the deletion table once per call; str.translate then removes the
    # unwanted characters without a per-character Python loop.
    strip_table = str.maketrans("", "", tokill)

    wordlist = []
    for raw_token in x.split():
        word = raw_token.lower()
        if word in sw:
            continue
        stripped = word.translate(strip_table)
        if not stripped:
            # The token was pure punctuation -- nothing left to stem.
            continue
        wordlist.append(ps.stem(stripped))
    return " ".join(wordlist)

In [195]:
# Replace every raw message with its cleaned, stemmed form.
# NOTE: this overwrites the "mail" column, so re-running the cell feeds
# already-cleaned text through data_cleaning again.
df["mail"] = df["mail"].map(data_cleaning)

In [196]:
# Inspect the "mail" column after data_cleaning has been applied.
df.head()

Unnamed: 0,Target,mail
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


# Encoding the target: converting the categorical labels (ham/spam) to numeric values (0/1)

In [197]:
# Encode the label column numerically: ham -> 0, spam -> 1.
LABEL_TO_ID = {"ham": 0, "spam": 1}
encoded_target = df["Target"].map(LABEL_TO_ID)

# Series.map yields NaN for any value missing from the mapping. That happens
# for unexpected labels, and also when this cell is run a second time (the
# column then already holds 0/1). Fail loudly instead of silently replacing
# the target with NaN.
unmapped = encoded_target.isna()
if unmapped.any():
    raise ValueError(
        "Target contains values other than 'ham'/'spam' "
        f"(was this cell already run?): {df.loc[unmapped, 'Target'].unique().tolist()}"
    )

df["Target"] = encoded_target.astype(int)

# Training and testing models

In [198]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for evaluation; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    df["mail"],
    df["Target"],
    test_size=0.3,
    random_state=42,
)

In [199]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [200]:
# TF-IDF features are used for the first experiments; the CountVectorizer
# variant is built in its own cell further down.
tf = TfidfVectorizer()

In [201]:
# Learn the vocabulary and IDF weights on the training messages only.
# The result is kept as the sparse matrix fit_transform returns: the original
# .toarray() materialised a dense (n_messages x vocabulary) array that is almost
# entirely zeros, and it was inconsistent with the test matrix, which is passed
# to the models in sparse form. The estimators used below accept sparse input.
xtrainout = tf.fit_transform(xtrain)

In [202]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features, with scikit-learn's default settings.
lr = LogisticRegression()
lr.fit(xtrainout, ytrain)

In [203]:
# Vectorize the held-out messages with the vocabulary fitted on the training
# set (transform only -- the vectorizer is not refitted on test data).
xtestout = tf.transform(xtest)

In [204]:
from sklearn.metrics import accuracy_score, classification_report

# Score the TF-IDF logistic regression on the held-out set: overall accuracy
# first, then the per-class precision/recall/F1 table.
pred = lr.predict(xtestout)
for metric in (accuracy_score, classification_report):
    print(metric(ytest, pred))

0.9521531100478469
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1453
           1       0.97      0.65      0.78       219

    accuracy                           0.95      1672
   macro avg       0.96      0.83      0.88      1672
weighted avg       0.95      0.95      0.95      1672



In [205]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features.
# The forest is randomised, so without a seed every run prints slightly
# different scores. Pin random_state (42, the same seed as the train/test split)
# so the reported metrics can be reproduced.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(xtrainout, ytrain)

pred = rfc.predict(xtestout)
print(accuracy_score(ytest, pred))
print(classification_report(ytest, pred))

0.9742822966507177
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1453
           1       1.00      0.80      0.89       219

    accuracy                           0.97      1672
   macro avg       0.99      0.90      0.94      1672
weighted avg       0.98      0.97      0.97      1672



In [206]:
# Same random-forest experiment, but on raw term counts instead of TF-IDF weights.
cv = CountVectorizer()

# Keep both matrices sparse: the original densified only the training matrix
# with .toarray(), which costs memory for no benefit and left train and test
# in different formats.
xtrainout_2 = cv.fit_transform(xtrain)
xtestout_2 = cv.transform(xtest)

# Seeded so the printed scores are reproducible (same seed as the data split).
rfc = RandomForestClassifier(random_state=42)
rfc.fit(xtrainout_2, ytrain)

pred = rfc.predict(xtestout_2)
print(accuracy_score(ytest, pred))
print(classification_report(ytest, pred))

0.9736842105263158
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1453
           1       1.00      0.80      0.89       219

    accuracy                           0.97      1672
   macro avg       0.99      0.90      0.94      1672
weighted avg       0.97      0.97      0.97      1672



In [210]:
# Logistic regression on raw term counts (CountVectorizer features).
cv = CountVectorizer()

# Keep both matrices sparse: the original densified only the training matrix
# with .toarray(), which costs memory for no benefit and left train and test
# in different formats.
xtrainout_2 = cv.fit_transform(xtrain)
xtestout_2 = cv.transform(xtest)

# The original cell instantiated LogisticRegression twice; the first instance
# was discarded unused, so a single one is created here.
lr = LogisticRegression()
lr.fit(xtrainout_2, ytrain)

pred = lr.predict(xtestout_2)
print(accuracy_score(ytest, pred))
print(classification_report(ytest, pred))

0.9802631578947368
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1453
           1       0.99      0.85      0.92       219

    accuracy                           0.98      1672
   macro avg       0.99      0.93      0.95      1672
weighted avg       0.98      0.98      0.98      1672

