In [29]:
#required liberaries
import pandas as pd
import numpy as np
import  re # regular expression
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

In [31]:
#data gathering 
df=pd.read_csv("SMSSpamCollection", sep='\t', names = ['lable','mesg'])
df.head()

Unnamed: 0,lable,mesg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#EDA(Exploratory Data Analysis)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lable   5572 non-null   object
 1   mesg    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
df.isna().sum()

lable    0
mesg     0
dtype: int64

In [None]:
df['lable'].value_counts()

lable
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
corpus = []
lm = WordNetLemmatizer()
for i in range(len(df)):
    review = re.sub(r'[^a-zA-Z0-9]', ' ', df['mesg'][i])
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = "".join(review)
    corpus.append(review)

In [None]:
df['mesg'][0]


'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
len(df['mesg'])

5572

In [None]:
len(corpus)

5572

In [None]:
df['mesg']=corpus
df.head()

Unnamed: 0,lable,mesg
0,ham,gojurongpointcrazyavailablebugisngreatworldlae...
1,ham,oklarjokingwifuoni
2,spam,freeentry2wklycompwinfacupfinaltkts21stmay2005...
3,ham,udunsayearlyhorucalreadysay
4,ham,nahthinkgousflifearoundthough


In [None]:
# model building
## data splitting
x = df['mesg']
y = df['lable']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 10)

In [None]:
len(x_train), len(y_train)

(3900, 3900)

In [None]:
len(x_test), len(y_test)

(1672, 1672)

In [None]:
## vectorization (convert text data into the vectors)
tf_obj = TfidfVectorizer()
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
x_train_tfidf.shape

(3900, 3639)

In [None]:
##pipeline
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [None]:
text_mnb.fit(x_train,y_train)

In [None]:
#acuucracy score on testing data
y_pred_test = text_mnb.predict(x_test)
print("Acurracy Score: ",accuracy_score(y_test,y_pred_test)*100)

Acurracy Score:  87.14114832535886


In [None]:
#acuucracy score on training data
y_pred_train = text_mnb.predict(x_train)
print("Acurracy Score: ",accuracy_score(y_train,y_pred_train)*100)

Acurracy Score:  86.66666666666667


In [None]:
#confusion matrix on testing data
y_pred_test = text_mnb.predict(x_test)
print("Confusion matrix on test data:\n",confusion_matrix(y_test,y_pred_test))

Confusion matrix on test data:
 [[1457    0]
 [ 215    0]]


In [None]:
#classification report on testing data
y_pred_test = text_mnb.predict(x_test)
print("Classifcation report on test data:\n",classification_report(y_test,y_pred_test))

Classifcation report on test data:
               precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1457
        spam       0.00      0.00      0.00       215

    accuracy                           0.87      1672
   macro avg       0.44      0.50      0.47      1672
weighted avg       0.76      0.87      0.81      1672



In [None]:
#prediction on user data
def preprocess_data(text):
    review = re.sub('^a-zA-Z0-9',' ',text)
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = " ".join(review)
    return[review]
    

In [40]:
user_data = df['mesg'][2]
print(user_data)
user_data = preprocess_data(user_data)
user_data


Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


["free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's"]

In [41]:
 text_mnb.predict(user_data)[0]

'ham'

In [None]:
class prediction:
    def __init__(self,data):
        self.data = data
    def user_data_preprocessing(self):
        lm=WordNetLemmatizer()
        review = re.sub('^a-zA-Z0-9',' ',self.data)
        review = review.lower()
        review = review.split()
        review = [data for data in review if data not in stopwords.words('english')]
        review = [lm.lemmatize(data) for data in review]
        review = " ".join(review)
        return[review]
    
    def user_data_prediction(self):
        preprocess_data = self.user_data_preprocessing()
        if text_mnb.predict(preprocess_data)[0] == 'spam':
            print('This Message is spam')
        else:
            print('This Message is Ham')

In [None]:
df.head()

Unnamed: 0,lable,mesg
0,ham,gojurongpointcrazyavailablebugisngreatworldlae...
1,ham,oklarjokingwifuoni
2,spam,freeentry2wklycompwinfacupfinaltkts21stmay2005...
3,ham,udunsayearlyhorucalreadysay
4,ham,nahthinkgousflifearoundthough


In [32]:
user_data = df['lable'][2]
print(user_data)
prediction(user_data).user_data_prediction()

spam
This Message is Ham
