In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw spam dataset; the CSV is decoded as latin1.
spam_df = pd.read_csv(filepath_or_buffer="./spam.csv", encoding="latin1")

spam_df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the text and label columns under descriptive names.
# Rename first, then select: rename() skips labels that are not present, so
# this cell can be re-run on the already-renamed frame without a KeyError
# on "v2"/"v1".
spam_df = spam_df.rename(columns={"v2":"message", "v1":"class"})[["message", "class"]]

spam_df.head()

Unnamed: 0,message,class
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [4]:
# Encode the label as an integer: 1 for "spam", 0 for everything else.
# Already-encoded 1s are matched as well, so re-running this cell keeps the
# encoding instead of resetting every row to 0.
labels = spam_df["class"]
spam_df["class"] = ((labels == "spam") | (labels == 1)).astype("int64")

spam_df.head()

Unnamed: 0,message,class
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
# Number of rows per encoded label (0 / 1), to check the class balance.
class_counts = spam_df["class"].value_counts()
class_counts

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
0,4825
1,747


In [6]:
!pip install nltk



In [7]:
import nltk
# Download the NLTK 'stopwords' corpus so nltk.corpus.stopwords can be read in a later cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
from nltk.corpus import stopwords

In [10]:
import re

In [12]:
# English stop words as a set, for fast membership tests in clean_text.
STOPWORDS = {*stopwords.words('english')}

def clean_text(text, stop_words=None):
  """Normalize a raw message into lowercase, space-separated alphanumeric tokens.

  Args:
    text: Message to clean. ``None`` and float ``NaN`` (a missing cell)
      yield an empty string; any other non-string value is converted
      with ``str()`` first.
    stop_words: Optional collection of lowercase words to drop. Defaults
      to the module-level ``STOPWORDS`` set.

  Returns:
    The cleaned text: lowercased, every character outside ``0-9a-zA-Z``
    replaced by a space, stop words removed, and the remaining tokens
    joined by single spaces.
  """
  if stop_words is None:
    stop_words = STOPWORDS

  # a missing message has nothing to clean; NaN is the only float unequal to itself
  if text is None or (isinstance(text, float) and text != text):
    return ""

  # converts all characters to lower case
  text = str(text).lower()

  # removes special characters
  text = re.sub(r'[^0-9a-zA-Z]', ' ', text)

  # removes stop words; split() already discards runs of spaces, so no
  # separate "collapse extra spaces" pass is needed before joining
  return " ".join(word for word in text.split() if word not in stop_words)

In [14]:
# Clean every message and store the result next to the original text.
cleaned_messages = spam_df["message"].apply(clean_text)
spam_df["clean_message"] = cleaned_messages

spam_df.head(n=5)

Unnamed: 0,message,class,clean_message
0,"Go until jurong point, crazy.. Available only ...",0,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,0,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",0,nah think goes usf lives around though


## Data Splitting into Train and Test

In [38]:
from sklearn.model_selection import train_test_split

# Features are the cleaned messages; the target is the encoded class label.
X, y = spam_df["clean_message"], spam_df["class"]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Shapes of the full frame and of the train / test feature splits.
tuple(part.shape for part in (spam_df, X_train, X_test))

((5572, 3), (4457,), (1115,))

In [26]:
# Peek at the first training messages.
X_train.head(n=5)

Unnamed: 0,clean_message
1978,boat still moms check yo half naked
3989,bank granite issues strong buy explosive pick ...
3935,r giving second chance rahul dengra
4078,played smash bros lt gt religiously
4086,private 2003 account statement 07973788240 sho...


In [40]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import classification_report

def model_building(X_train, X_test, y_train, y_test, model):
  """Fit a count-vector -> TF-IDF -> classifier pipeline and print test metrics.

  Args:
    X_train: Training documents passed to the pipeline's ``fit``.
    X_test: Held-out documents to predict on.
    y_train: Labels for ``X_train``.
    y_test: Labels for ``X_test``.
    model: Estimator used as the final pipeline step; it is fitted as part
      of the pipeline.

  Returns:
    None. The accuracy (as a percentage) and the classification report
    are printed.
  """
  # local import: accuracy_score is not among the names imported above
  from sklearn.metrics import accuracy_score

  model_pipeline = Pipeline([
      ("vec", CountVectorizer()),
      ("tfid", TfidfTransformer()),
      ("clf_model", model)
  ])

  model_pipeline.fit(X_train, y_train)

  # predict once and reuse the result for both metrics, instead of letting
  # score() transform and predict the test set a second time
  y_pred = model_pipeline.predict(X_test)

  print("Accuracy: ", accuracy_score(y_test, y_pred) * 100)

  print("Classification report: \n", classification_report(y_test, y_pred))


In [42]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with a fixed seed.
lg = LogisticRegression(random_state=42)

model_building(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model=lg,
)

Accuracy:  95.42600896860986
Classification report: 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       965
           1       0.95      0.69      0.80       150

    accuracy                           0.95      1115
   macro avg       0.95      0.84      0.89      1115
weighted avg       0.95      0.95      0.95      1115



In [43]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with a fixed seed.
forest = RandomForestClassifier(random_state=42)

model_building(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model=forest,
)

Accuracy:  98.02690582959642
Classification report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.85      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [44]:
from sklearn.svm import SVC

# Support vector classifier with a fixed seed.
svm = SVC(random_state=42)

model_building(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model=svm,
)

Accuracy:  97.9372197309417
Classification report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.85      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

