# Libraries Required


In [85]:
import os
import re

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# 1. Data Gathering

In [86]:
# Location of the tab-separated SMS dataset. Set the SMS_SPAM_DATA environment
# variable to point at your own copy of the file; the fallback is the original
# machine-specific path, kept so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "SMS_SPAM_DATA",
    "C:/Users/achut/AppData/Roaming/Microsoft/Windows/Start Menu/Programs/Visual Studio Code/SMSSpamCollection_new.txt",
)

# names= supplies the column labels, so the first line of the file is read as data.
df = pd.read_csv(DATA_PATH, sep='\t', names=['Label', 'Msg'])
df.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# 2. Exploratory Data Analysis


In [87]:
# Column dtypes and non-null counts; a non-null count below the row count
# means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5606 entries, 0 to 5605
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5606 non-null   object
 1   Msg     5605 non-null   object
dtypes: object(2)
memory usage: 87.7+ KB


In [88]:
# Number of missing values in each column.
df.isna().sum()

Label    0
Msg      1
dtype: int64

In [89]:
# Class balance of the target column.
# NOTE(review): presumably only 'ham' and 'spam' are valid labels; any other
# value listed here likely comes from a malformed row in the raw file - confirm.
df['Label'].value_counts()

Label
ham                        4836
spam                        769
District Administration       1
Name: count, dtype: int64

# 3. Data Preprocessing

In [90]:
# Ensure stopwords and wordnet are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

lm = WordNetLemmatizer()

# Build the stop-word collection once. Calling stopwords.words('english')
# inside the loop rebuilt the full list for every single token; a set also
# makes each membership test a constant-time lookup.
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in df['Msg']:
    # str() keeps the existing handling of missing messages: a NaN becomes the
    # literal text "nan". Every character that is not a letter or digit is
    # replaced by a space before lower-casing and tokenising on whitespace.
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', str(message))
    tokens = cleaned.lower().split()

    # Drop stop words, then lemmatize each remaining token.
    tokens = [lm.lemmatize(token) for token in tokens if token not in english_stopwords]

    # One cleaned string per input row, in the same order as df.
    corpus.append(" ".join(tokens))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\achut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\achut\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [91]:
# First message as loaded; the column has not yet been replaced by the cleaned corpus.
df['Msg'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [92]:
# Number of rows in the message column.
len(df['Msg'])

5606

In [93]:
# Number of cleaned messages; it has to equal len(df['Msg']) for corpus to be
# assigned back as a column.
len(corpus)

5606

In [94]:
# Replace the raw text with the cleaned corpus. From here on df['Msg'] holds
# lower-cased, stop-word-free, lemmatized text; the raw messages are no longer
# available from df.
df['Msg']=corpus
df.head()

Unnamed: 0,Label,Msg
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


# 4. Model Building

## 4.1 Data Splitting

In [95]:
# Features are the cleaned message strings; the target is the label column.
x = df['Msg']
y = df['Label']

In [96]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 10)

In [97]:
# Training features and labels must have the same length.
len(x_train), len(y_train)

(3924, 3924)

In [98]:
# Test features and labels must have the same length.
len(x_test),len(y_test)

(1682, 1682)

## 4.2 Vectorization (Convert Text Data Into The Vectors)

In [99]:
# Fit a standalone TF-IDF vectorizer on the training messages to inspect the
# resulting matrix. .toarray() turns the sparse result into a dense array.
# In the cells shown, the pipelines further down construct their own
# TfidfVectorizer rather than reusing tf_obj.
tf_obj = TfidfVectorizer()
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [100]:
# (number of training messages, number of vocabulary terms)
x_train_tfidf.shape

(3924, 6782)

## 4.3 Pipeline

### Naive Bayes

In [101]:
# Chain TF-IDF vectorization with a multinomial Naive Bayes classifier so that
# plain text strings can be passed directly to fit() and predict().
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [102]:
# Fit the vectorizer and the classifier on the training split only.
text_mnb.fit(x_train,y_train)

In [103]:
# Accuracy (in percent) of the Naive Bayes pipeline on the held-out test split.
y_pred_test = text_mnb.predict(x_test)
print("Accuracy Score:", accuracy_score(y_test,y_pred_test)*100)

Accuracy Score: 96.43281807372176


In [104]:
# Accuracy (in percent) on the training split, for comparison with the test score.
y_pred_train = text_mnb.predict(x_train)
print("Accuracy Score:",accuracy_score(y_train,y_pred_train)*100)

Accuracy Score: 98.11416921508665


In [105]:
# Confusion matrix on the test split: rows are the true labels, columns the
# predicted labels. The predictions are recomputed so this cell runs on its own.
y_pred_test = text_mnb.predict(x_test)
print("Confusion Matrix on Test Data:\n", confusion_matrix(y_test,y_pred_test))

Confusion Matrix on Test Data:
 [[1478    0]
 [  60  144]]


In [106]:
# Per-class precision, recall and F1 for the Naive Bayes pipeline on the test split.
y_pred_test = text_mnb.predict(x_test)
print("Classification Reportx on Test Data:\n", classification_report(y_test,y_pred_test))

Classification Reportx on Test Data:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1478
        spam       1.00      0.71      0.83       204

    accuracy                           0.96      1682
   macro avg       0.98      0.85      0.90      1682
weighted avg       0.97      0.96      0.96      1682



### Support Vector Classifier (SVC)

In [107]:
# Same TF-IDF front end as the Naive Bayes pipeline, followed by a
# linear-kernel support vector classifier; fitted on the training split.
text_svc = Pipeline([('tfidf', TfidfVectorizer()), ('svc', SVC(kernel='linear'))])
text_svc.fit(x_train, y_train)

In [108]:
# Accuracy (in percent) of the SVC pipeline on the held-out test split.
y_pred_svc = text_svc.predict(x_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svc) * 100)

SVM Accuracy: 98.21640903686088


In [109]:
# Accuracy (in percent) of the SVC pipeline on the training split, for
# comparison with the test score. The predictions get their own name so that
# y_pred_svc keeps holding the test-set predictions instead of being
# overwritten with training-set ones.
y_pred_svc_train = text_svc.predict(x_train)
print("SVM Accuracy:", accuracy_score(y_train, y_pred_svc_train) * 100)

SVM Accuracy: 99.66870540265036


In [110]:
# Per-class precision, recall and F1 for the SVC pipeline on the test split.
y_pred_svc = text_svc.predict(x_test)
print("Classification Reportx on Test Data:\n", classification_report(y_test,y_pred_svc))

Classification Reportx on Test Data:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1478
        spam       0.98      0.87      0.92       204

    accuracy                           0.98      1682
   macro avg       0.98      0.93      0.96      1682
weighted avg       0.98      0.98      0.98      1682



# 5. Prediction on User Data

In [111]:
def preprocess_data(text, stop_words=None, lemmatizer=None):
    """Clean one message with the same steps used to build the training corpus.

    Steps: replace every character that is not a letter or digit with a space,
    lower-case, split on whitespace, drop stop words, lemmatize each token and
    join the tokens back into one string.

    Parameters
    ----------
    text : str
        The message to clean.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``lm`` lemmatizer.

    Returns
    -------
    list of str
        A one-element list holding the cleaned message, ready to be passed to
        a pipeline's ``predict``.
    """
    if stop_words is None:
        # Built once per call instead of once per token.
        stop_words = set(stopwords.words('english'))
    if lemmatizer is None:
        lemmatizer = lm

    # The pattern must be a bracketed character class. Without the brackets,
    # '^a-zA-Z0-9' only matches that literal text at the start of the string,
    # so punctuation was never removed.
    review = re.sub('[^a-zA-Z0-9]', ' ', text)
    tokens = review.lower().split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return [" ".join(tokens)]

In [112]:
# Try preprocess_data on row 0. df['Msg'] was overwritten with the cleaned
# corpus earlier, so this input is already lower-cased and free of punctuation;
# it does not exercise the cleaning of raw text.
user_data = df['Msg'][0]
print(user_data)
user_data = preprocess_data(user_data)
user_data

go jurong point crazy available bugis n great world la e buffet cine got amore wat


['go jurong point crazy available bugis n great world la e buffet cine got amore wat']

In [113]:
# user_data is a one-element list, so [0] picks the single predicted label.
text_mnb.predict(user_data)[0]

np.str_('ham')

In [114]:
class prediction:
    """Clean a single message and classify it as spam or ham.

    By default the Naive Bayes pipeline ``text_mnb`` is used. Note that a later
    cell in this notebook defines another class with the same name; once that
    cell has run, ``prediction`` refers to the later definition.

    Parameters
    ----------
    data : str
        The message to classify.
    model : object with a ``predict(list_of_str)`` method, optional
        Classifier to use. Defaults to the notebook-level ``text_mnb``.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a new ``WordNetLemmatizer``.
    """

    def __init__(self, data, model=None, stop_words=None, lemmatizer=None):
        self.data = data
        self.model = model
        self.stop_words = stop_words
        self.lemmatizer = lemmatizer

    def user_data_preprocessing(self):
        """Return the cleaned message as a one-element list of str."""
        lemmatizer = self.lemmatizer if self.lemmatizer is not None else WordNetLemmatizer()
        if self.stop_words is not None:
            stop_words = self.stop_words
        else:
            # Built once per call instead of once per token.
            stop_words = set(stopwords.words('english'))

        # The pattern must be a bracketed character class. Without the
        # brackets, '^a-zA-Z0-9' only matches that literal text at the start
        # of the string, so punctuation was never removed.
        review = re.sub('[^a-zA-Z0-9]', ' ', self.data)
        tokens = review.lower().split()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        return [" ".join(tokens)]

    def user_data_prediction(self):
        """Return 'This Message is Spam' or 'This Message is Ham'."""
        # The default model is looked up at call time, as the original code did.
        model = self.model if self.model is not None else text_mnb
        cleaned = self.user_data_preprocessing()

        if model.predict(cleaned)[0] == 'spam':
            return 'This Message is Spam'

        else:
            return 'This Message is Ham'

In [115]:
# Same one-element input as before, classified with the SVC pipeline.
text_svc.predict(user_data)[0]

'ham'

In [116]:
class prediction:
    """Clean a single message and classify it as spam or ham.

    By default the SVC pipeline ``text_svc`` is used. An earlier cell in this
    notebook defines a class with the same name; running this cell replaces it.

    Parameters
    ----------
    data : str
        The message to classify.
    model : object with a ``predict(list_of_str)`` method, optional
        Classifier to use. Defaults to the notebook-level ``text_svc``.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a new ``WordNetLemmatizer``.
    """

    def __init__(self, data, model=None, stop_words=None, lemmatizer=None):
        self.data = data
        self.model = model
        self.stop_words = stop_words
        self.lemmatizer = lemmatizer

    def user_data_preprocessing(self):
        """Return the cleaned message as a one-element list of str."""
        lemmatizer = self.lemmatizer if self.lemmatizer is not None else WordNetLemmatizer()
        if self.stop_words is not None:
            stop_words = self.stop_words
        else:
            # Built once per call instead of once per token.
            stop_words = set(stopwords.words('english'))

        # The pattern must be a bracketed character class. Without the
        # brackets, '^a-zA-Z0-9' only matches that literal text at the start
        # of the string, so punctuation was never removed.
        review = re.sub('[^a-zA-Z0-9]', ' ', self.data)
        tokens = review.lower().split()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        return [" ".join(tokens)]

    def user_data_prediction(self):
        """Return 'This Message is Spam' or 'This Message is Ham'."""
        # The default model is looked up at call time, as the original code did.
        model = self.model if self.model is not None else text_svc
        cleaned = self.user_data_preprocessing()

        if model.predict(cleaned)[0] == 'spam':
            return 'This Message is Spam'

        else:
            return 'This Message is Ham'

In [117]:
# Reminder of the first rows; Msg holds the cleaned text at this point.
df.head()

Unnamed: 0,Label,Msg
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [118]:
# Classify row 2 of the (already cleaned) message column with the prediction class.
user_data = df['Msg'][2]
print(user_data)
prediction(user_data).user_data_prediction()

free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18


'This Message is Spam'

In [119]:
# Classify row 3 of the (already cleaned) message column with the prediction class.
user_data = df['Msg'][3]
print(user_data)
prediction(user_data).user_data_prediction()

u dun say early hor u c already say


'This Message is Ham'

In [120]:
# Classify row 111 of the (already cleaned) message column with the prediction class.
user_data = df['Msg'][111]
print(user_data)
prediction(user_data).user_data_prediction()

plural noun research


'This Message is Ham'