<a href="https://colab.research.google.com/github/nitish-pandey/Spam-Classifier/blob/main/Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Basic Imports

In [1]:
from os import remove
import pandas as pd
import numpy 
import sklearn
import re

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,f1_score,log_loss,precision_score,recall_score,classification_report

from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier,RandomForestClassifier



# Data imports and preprocessing

In [5]:
dataset=pd.read_csv(r'/content/spam.csv',encoding='latin-1')
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
X=dataset['v2']
y=dataset['v1']

y=pd.get_dummies(y,drop_first=True)
print(y.head(),'\n\n',X.head())

   spam
0     0
1     0
2     1
3     0
4     0 

 0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object


# Data Cleaning

In [7]:
stopword=set(stopwords.words('english'))

lemmatizer=WordNetLemmatizer()

n,=X.shape
n

5572

In [8]:
for i in range(n):
    temp=re.sub('[^a-zA-Z]',' ',X[i])
    temp=temp.lower()
    temp=temp.split()
    temp=[lemmatizer.lemmatize(word) for word in temp if word not in set(stopword)]
    temp=' '.join(temp)
    X[i]=temp


In [9]:
cv=CountVectorizer()

X=cv.fit_transform(X).toarray()

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [10]:
classifier=[GaussianNB(),MultinomialNB(),GradientBoostingClassifier(),RandomForestClassifier(),SVC(),BaggingClassifier()]
names=["GaussianNb",'MultiNomialNB','Gradient Boosting','Random Forest','Support Vector','bagging']
accuracy=[]
rocscore=[]
f1=[]
log=[]
precision=[]
recall=[]

for i in range(len(classifier)):
    model=classifier[i]
    model.fit(X_train,y_train.values.ravel())
    y_pred=model.predict(X_test)
    acc=accuracy_score(y_test,y_pred)
    accuracy.append(acc)
    roc=roc_auc_score(y_test,y_pred)
    rocscore.append(roc)
    matrix=confusion_matrix(y_test,y_pred)
    print('\n\n','\033[1m',names[i],'\033[0m')
    loss=log_loss(y_test,y_pred)
    f1s=f1_score(y_test,y_pred)
    log.append(loss)
    f1.append(f1s)
    prec=precision_score(y_test,y_pred)
    precision.append(prec)
    rec=recall_score(y_test,y_pred)
    recall.append(rec)
    print('Accuracy score : ',acc)
    print('ROC score : ',roc)
    print('F1 score : ',f1s)
    print('Precision score : ',prec)
    print('Recall score : ',rec)
    print('Log-Loss : ',loss)
    print('Confusion Matrix : \n',matrix)



 [1m GaussianNb [0m
Accuracy score :  0.8947368421052632
ROC score :  0.884892644212272
F1 score :  0.697594501718213
Precision score :  0.5816618911174785
Recall score :  0.871244635193133
Log-Loss :  3.635730494455197
Confusion Matrix : 
 [[1293  146]
 [  30  203]]


 [1m MultiNomialNB [0m
Accuracy score :  0.9754784688995215
ROC score :  0.9605755666041332
F1 score :  0.9144050104384134
Precision score :  0.8902439024390244
Recall score :  0.9399141630901288
Log-Loss :  0.8469565916997337
Confusion Matrix : 
 [[1412   27]
 [  14  219]]


 [1m Gradient Boosting [0m
Accuracy score :  0.9629186602870813
ROC score :  0.8669527896995708
F1 score :  0.8465346534653465
Precision score :  1.0
Recall score :  0.7339055793991416
Log-Loss :  1.2807441007682197
Confusion Matrix : 
 [[1439    0]
 [  62  171]]


 [1m Random Forest [0m
Accuracy score :  0.9706937799043063
ROC score :  0.8948497854077253
F1 score :  0.882494004796163
Precision score :  1.0
Recall score :  0.78969957081545

In [11]:
performance=pd.DataFrame({'Name':names,'Accuracy Score':acc,'ROC AUC score':roc,'F1 Score':f1,'Log-Loss':log,'Precision':precision,'Recall':recall})
performance.head(10)

Unnamed: 0,Name,Accuracy Score,ROC AUC score,F1 Score,Log-Loss,Precision,Recall
0,GaussianNb,0.965909,0.886675,0.697595,3.63573,0.581662,0.871245
1,MultiNomialNB,0.965909,0.886675,0.914405,0.846957,0.890244,0.939914
2,Gradient Boosting,0.965909,0.886675,0.846535,1.280744,1.0,0.733906
3,Random Forest,0.965909,0.886675,0.882494,1.012201,1.0,0.7897
4,Support Vector,0.965909,0.886675,0.871671,1.09483,1.0,0.772532
5,bagging,0.965909,0.886675,0.863962,1.177461,0.973118,0.776824


# Support vector Machine 

In [None]:
## Run this if you want to tune the parameters else run the cell below this.

from sklearn.model_selection import GridSearchCV
 
# defining parameter range
parameters = {'C': [0.1, 1, 10, 100,1000],
              'gamma': [1, 0.1, 0.01,0.001],
              'kernel': ['rbf','poly','sigmoid','linear']
              }
 
model = GridSearchCV(SVC(), parameters, refit = True, verbose = 3)
 
# fitting the model for grid search
model.fit(X_train, y_train.values.ravel())

print(model.best_params_)
print(model.best_estimator_)

In [12]:
model=SVC()
model.fit(X_train,y_train.values.ravel())

SVC()

In [13]:
y_pred = model.predict(X_test)
print('Classification Report : \n',classification_report(y_test,y_pred))
print('\nConfusion Matrix\n',confusion_matrix(y_test,y_pred))

Classification Report : 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1439
           1       1.00      0.77      0.87       233

    accuracy                           0.97      1672
   macro avg       0.98      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672


Confusion Matrix
 [[1439    0]
 [  53  180]]


# Saving the Model

In [14]:
import joblib

joblib.dump(model,'mymodel.pkl')

['mymodel.pkl']