**Workflow:-**
1. Data Cleaning and Preprocessing
2. Train, Test Split
3. TF-IDF/BOW
4. Model Training
5. Making Predictions
6. Performance Evaluation

In [1]:
# importing basic libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# importing our dataset

messages=pd.read_csv('SMSSpamCollection.csv')
messages.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# importing NLP libraries

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
ps=PorterStemmer()

In [5]:
corpus=[]

for i in range(0, len(messages)):
    review=re.sub('[^a-zA-Z]', ' ', messages['Message'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in stopwords.words()]
    review=' '.join(review)
    corpus.append(review)

In [6]:
corpus

['jurong point crazi avail bugi great world buffet amor',
 'lar joke wif',
 'free entri wkli comp win cup final tkt st text receiv entri question std txt rate appli',
 'dun earli',
 'usf live',
 'freemsg hey darl week word back fun tb xxx std chg send rcv',
 'brother speak treat aid patent',
 'request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home talk stuff anymor tonight cri today',
 'chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim www dbuk net lccltd pobox ldnw rw',
 'search word breather promis grant fulfil promis wonder bless time',
 'date sunday',
 'xxxmobilemovieclub credit click wap link txt messag click http wap xxxmobilemovieclub qjkgighjjgcbl',
 '

In [7]:
# Label Encoding our Output Variable

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(messages['Label'])

In [8]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [9]:
# Train, Test, Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.25, random_state=33)

In [10]:
# creating our Bag Of Words Model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500, ngram_range=(1,2))

In [11]:
X_train=cv.fit_transform(X_train).toarray()
X_test=cv.transform(X_test).toarray()

In [12]:
np.set_printoptions(edgeitems=30, linewidth=1000000, formatter=dict(float=lambda x:'%.3g'%x) )

In [14]:
cv.vocabulary_

{'ask': 98,
 'call': 246,
 'place': 1572,
 'room': 1763,
 'cheap': 347,
 'friend': 804,
 'order': 1510,
 'gram': 888,
 'lt': 1236,
 'gt': 900,
 'lt gt': 1238,
 'princess': 1626,
 'make': 1256,
 'happi': 941,
 'princess make': 1627,
 'make happi': 1260,
 'sura': 2051,
 'sun': 2039,
 'tv': 2220,
 'month': 1377,
 'ur': 2268,
 'standard': 1979,
 'network': 1438,
 'charg': 340,
 'activ': 14,
 'net': 1435,
 'st': 1974,
 'term': 2104,
 'pobox': 1586,
 'uz': 2307,
 'cost': 467,
 'pobox uz': 1588,
 'meet': 1295,
 'work': 2427,
 'tel': 2095,
 'tomorrow': 2180,
 'work tomorrow': 2431,
 'april': 81,
 'real': 1684,
 'date': 528,
 'today': 2163,
 'give': 857,
 'special': 1951,
 'treat': 2203,
 'secret': 1804,
 'wish': 2406,
 'buy': 236,
 'insid': 1056,
 'bedroom': 155,
 'orchard': 1509,
 'dad': 519,
 'car': 302,
 'dinner': 589,
 'leh': 1167,
 'free': 778,
 'tonight': 2188,
 'hey': 968,
 'gave': 831,
 'photo': 1553,
 'regist': 1706,
 'drive': 625,
 'tmr': 2156,
 'wanna': 2351,
 'wait': 2339,
 'bu': 2

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
spam_detect_model=MultinomialNB()
spam_detect_model.fit(X_train, y_train)
spam_detect_model

In [17]:
y_pred=spam_detect_model.predict(X_test)

In [18]:
# performance metrics

from sklearn.metrics import confusion_matrix, classification_report
cm=confusion_matrix(y_pred=y_pred, y_true=y_test)
cr=classification_report(y_pred=y_pred, y_true=y_test)
print(cm)
print(cr)

[[1179   16]
 [  16  182]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1195
           1       0.92      0.92      0.92       198

    accuracy                           0.98      1393
   macro avg       0.95      0.95      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [19]:
# Train, Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.25, random_state=23)

In [20]:
# creating the TF-IDF model

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
X_train=tfidf.fit_transform(X_train).toarray()
X_test=tfidf.transform(X_test).toarray()

In [21]:
tfidf.vocabulary_

{'fine': 1561,
 'love': 2573,
 'safe': 3770,
 'ride': 3702,
 'equal': 1385,
 'unev': 4705,
 'peski': 3255,
 'cyclist': 1022,
 'time': 4524,
 'night': 2965,
 'guarante': 1871,
 'cash': 675,
 'prize': 3445,
 'claim': 791,
 'yr': 5122,
 'call': 631,
 'custom': 1012,
 'servic': 3879,
 'repres': 3660,
 'read': 3580,
 'shame': 3902,
 'take': 4370,
 'run': 3757,
 'blame': 466,
 'long': 2544,
 'neshanth': 2938,
 'tel': 4414,
 'wan': 4861,
 'meet': 2712,
 'pm': 3325,
 'costa': 929,
 'sol': 4082,
 'holiday': 2018,
 'await': 308,
 'collect': 843,
 'toclaim': 4551,
 'sae': 3768,
 'tc': 4399,
 'pobox': 3329,
 'stockport': 4224,
 'sk': 4006,
 'xh': 5064,
 'cost': 928,
 'max': 2689,
 'min': 2751,
 'hot': 2042,
 'live': 2517,
 'fantasi': 1492,
 'ntt': 3019,
 'box': 526,
 'croydon': 978,
 'cr': 951,
 'wb': 4889,
 'lanr': 2424,
 'fakey': 1481,
 'eckankar': 1314,
 'detail': 1119,
 'mail': 2637,
 'nokia': 2988,
 'colour': 846,
 'phone': 3271,
 'deliveredtomorrow': 1093,
 'free': 1652,
 'minut': 2763,
 'mo

In [22]:
# training our model

from sklearn.naive_bayes import MultinomialNB
spam_detect_model_tfidf=MultinomialNB()
spam_detect_model_tfidf.fit(X_train, y_train)

In [23]:
# making predictions

y_pred=spam_detect_model_tfidf.predict(X_test)

In [24]:
# performance evaluation

from sklearn.metrics import confusion_matrix, classification_report
cm=confusion_matrix(y_pred=y_pred, y_true=y_test)
cr=classification_report(y_pred=y_pred, y_true=y_test)
print(cm)
print(cr)

[[1206    1]
 [  41  145]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1207
           1       0.99      0.78      0.87       186

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393

