In [2]:
%pip install -r requirements.txt
# importing the Dataset
import numpy as np
import pandas as pd

df = pd.read_csv('SMSSpamCollection', sep='\t',
                           names=["label", "message"])






In [3]:
df.sample(5)

Unnamed: 0,label,message
4651,ham,Finally it has happened..! Aftr decades..! BEE...
4008,ham,I'm reaching home in 5 min.
4141,ham,Leave it wif me lar... Ü wan to carry meh so h...
3624,ham,"Damn, poor zac doesn't stand a chance"
4383,ham,yeah sure thing mate haunt got all my stuff so...


In [4]:
df.shape

(5572, 2)

In [5]:
# 1. Data cleaning
# 2. EDA
# 3. Text Preprocessing
# 4. Model building
# 5. Evaluation
# 6. Improvement
# 7. Website
# 8. Deploy

1. Data Cleaning

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# renaming the cols
df.rename(columns={'label':'target','message':'text'},inplace=True)
df.sample(5)

Unnamed: 0,target,text
3369,ham,"Hey elaine, is today's meeting still on?"
3791,ham,I love you !!! You know? Can you feel it? Does...
1930,spam,Free 1st week entry 2 TEXTPOD 4 a chance 2 win...
1411,ham,Pls speak to that customer machan.
5383,ham,Good day to You too.Pray for me.Remove the tee...


In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [9]:
df['target'] = encoder.fit_transform(df['target'])

In [10]:
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# missing values
df.isnull().sum()

target    0
text      0
dtype: int64

In [12]:
# check for duplicate values
df.duplicated().sum()

403

In [13]:
# remove duplicates
df = df.drop_duplicates(keep='first')

In [14]:
df.duplicated().sum()

0

In [15]:
df.shape

(5169, 2)

2.EDA

In [16]:
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
df['target'].value_counts()

target
0    4516
1     653
Name: count, dtype: int64

In [18]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Create an instance of PorterStemmer
ps = PorterStemmer()


3. Data Preprocessing

In [19]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
    
            
    return " ".join(y)

In [20]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

'gon na home soon want talk stuff anymor tonight k cri enough today'

In [21]:
df['text'][10]

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [22]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('loving')

'love'

In [23]:
df['transformed_text'] = df['text'].apply(transform_text)

In [24]:
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [25]:
spam_corpus = []
for msg in df[df['target'] == 1]['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)

In [26]:
len(spam_corpus)

9982

In [27]:
ham_corpus = []
for msg in df[df['target'] == 0]['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)

In [28]:
len(ham_corpus)

35939

In [29]:
# Text Vectorization
# using Bag of Words
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


4. Model Building

In [30]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=2500)

In [31]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [32]:
X.shape

(5169, 2500)

In [33]:
y = df['target'].values

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [36]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [37]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [38]:
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

0.8558994197292069
[[783 125]
 [ 24 102]]
0.44933920704845814


In [39]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

0.971953578336557
[[908   0]
 [ 29  97]]
1.0


In [40]:
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

0.9796905222437138
[[906   2]
 [ 19 107]]
0.981651376146789


In [41]:
# tfidf --> MNB

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [43]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [44]:
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': mnb, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [45]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    
    return accuracy,precision

In [46]:
train_classifier(svc,X_train,y_train,X_test,y_test)

(0.9738878143133463, 0.9805825242718447)

In [47]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

For  SVC
Accuracy -  0.9738878143133463
Precision -  0.9805825242718447
For  KN
Accuracy -  0.9187620889748549
Precision -  1.0
For  NB
Accuracy -  0.971953578336557
Precision -  1.0
For  DT
Accuracy -  0.9410058027079303
Precision -  0.891566265060241
For  LR
Accuracy -  0.9497098646034816
Precision -  0.9302325581395349
For  RF
Accuracy -  0.9671179883945842
Precision -  0.9693877551020408
For  AdaBoost
Accuracy -  0.9661508704061895
Precision -  0.9333333333333333
For  BgC
Accuracy -  0.9613152804642167
Precision -  0.8771929824561403
For  ETC
Accuracy -  0.9758220502901354
Precision -  0.9719626168224299
For  GBDT
Accuracy -  0.9458413926499033
Precision -  0.9375
For  xgb
Accuracy -  0.9661508704061895
Precision -  0.9333333333333333


In [48]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)


In [49]:
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision
1,KN,0.918762,1.0
2,NB,0.971954,1.0
0,SVC,0.973888,0.980583
8,ETC,0.975822,0.971963
5,RF,0.967118,0.969388
9,GBDT,0.945841,0.9375
6,AdaBoost,0.966151,0.933333
10,xgb,0.966151,0.933333
4,LR,0.94971,0.930233
3,DT,0.941006,0.891566


In [50]:
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")

In [51]:
performance_df1

Unnamed: 0,Algorithm,variable,value
0,KN,Accuracy,0.918762
1,NB,Accuracy,0.971954
2,SVC,Accuracy,0.973888
3,ETC,Accuracy,0.975822
4,RF,Accuracy,0.967118
5,GBDT,Accuracy,0.945841
6,AdaBoost,Accuracy,0.966151
7,xgb,Accuracy,0.966151
8,LR,Accuracy,0.94971
9,DT,Accuracy,0.941006


In [52]:
# model improve
# 1. Change the max_features parameter of TfIdf

In [53]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)


In [54]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)


In [55]:
new_df = performance_df.merge(temp_df,on='Algorithm')


In [56]:
new_df_scaled = new_df.merge(temp_df,on='Algorithm')


In [57]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)


In [58]:
new_df_scaled.merge(temp_df,on='Algorithm')


Unnamed: 0,Algorithm,Accuracy,Precision,Accuracy_scaling_x,Precision_scaling_x,Accuracy_scaling_y,Precision_scaling_y,Accuracy_num_chars,Precision_num_chars
0,KN,0.918762,1.0,0.918762,1.0,0.918762,1.0,0.918762,1.0
1,NB,0.971954,1.0,0.971954,1.0,0.971954,1.0,0.971954,1.0
2,SVC,0.973888,0.980583,0.973888,0.980583,0.973888,0.980583,0.973888,0.980583
3,ETC,0.975822,0.971963,0.975822,0.971963,0.975822,0.971963,0.975822,0.971963
4,RF,0.967118,0.969388,0.967118,0.969388,0.967118,0.969388,0.967118,0.969388
5,GBDT,0.945841,0.9375,0.945841,0.9375,0.945841,0.9375,0.945841,0.9375
6,AdaBoost,0.966151,0.933333,0.966151,0.933333,0.966151,0.933333,0.966151,0.933333
7,xgb,0.966151,0.933333,0.966151,0.933333,0.966151,0.933333,0.966151,0.933333
8,LR,0.94971,0.930233,0.94971,0.930233,0.94971,0.930233,0.94971,0.930233
9,DT,0.941006,0.891566,0.941006,0.891566,0.941006,0.891566,0.941006,0.891566


In [59]:
# Voting Classifier
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

In [60]:
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],voting='soft')

In [61]:
voting.fit(X_train,y_train)

In [62]:
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

Accuracy 0.9777562862669246
Precision 0.9904761904761905


In [63]:
# Applying stacking
estimators=[('svm', svc), ('nb', mnb), ('et', etc)]
final_estimator=RandomForestClassifier()

In [64]:
from sklearn.ensemble import StackingClassifier

In [65]:
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [66]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

Accuracy 0.9845261121856866
Precision 0.9824561403508771


from sklearn.naive_bayes import MultinomialNB

# Load your training data and preprocess it

# Instantiate the MultinomialNB classifier
model = MultinomialNB()

# Fit the model on your training data
model.fit(X_train, y_train)


In [67]:
from sklearn.naive_bayes import MultinomialNB

# Load your training data and preprocess it

# Instantiate the MultinomialNB classifier
model = MultinomialNB()

# Fit the model on your training data
mnb=model.fit(X_train, y_train)

In [68]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))