# Spam Classifier

- Goal: classify each received SMS message as either spam or non-spam (ham).

In [1]:
# importing important libraries
import numpy as np
import pandas as pd

In [2]:
# Importing dataset
# The location can be overridden with the SMS_SPAM_DATA environment variable so the
# notebook runs on other machines without editing this cell; the original local path
# is kept as the fallback.
import os

DATA_PATH = os.environ.get('SMS_SPAM_DATA', 'F:/College/ML/Datasets/SMSSpamCollection')
# The file is tab-separated and has no header row, so the column names are supplied here.
df = pd.read_csv(DATA_PATH, sep='\t', names=["label", "message"])

In [3]:
# Preview the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


- ham : the message is not spam

- spam : the message is spam

In [4]:
# Summarise the columns: dtype and non-null count for each.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# List the distinct values present in the label column.
df['label'].unique()

array(['ham', 'spam'], dtype=object)

Unique values in the 'label' column

In [6]:
# Count the missing values in each column.
df.isnull().sum()

label      0
message    0
dtype: int64

Checking for null values if any.
- no null values found

# Data Cleaning and Preprocessing

- **Lowercasing** the message column

In [7]:
# .str.lower() returns a new Series and leaves df untouched, so the result is assigned
# back; otherwise the lowercasing is lost as soon as the cell finishes.
# Re-running the cell is harmless: lowercasing already-lowercase text changes nothing.
df['message'] = df['message'].str.lower()
df['message']

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                 will ü b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: message, Length: 5572, dtype: object

- Removing the **Stop Words** from messages

In [8]:
# importing Natural Language Tool-kit to remove Stop Words
import nltk
# Fetch the stop-word corpus into the local nltk_data directory; when the package is
# already present NLTK only reports that it is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords

In [10]:
_ENGLISH_STOPWORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stopwords():
    """Return NLTK's English stop words as a cached frozenset."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A frozenset gives constant-time membership tests; calling
        # stopwords.words() per word re-reads the corpus and scans a list each time.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    Parameters
    ----------
    text : str
        Message to clean. It is split on whitespace.
    stop_words : iterable of str, optional
        Lowercase words to drop. Defaults to NLTK's English stop-word list,
        which is loaded once and cached.

    Returns
    -------
    str
        The remaining words, in their original casing, joined by single spaces.
        Removed words leave no empty placeholders behind.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    # Compare on the lowercased word: the stop-word list is lowercase, so "The"
    # would otherwise slip through when the text has not been lowercased first.
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# .apply returns a new Series; assign it back so the vectorizer further down works on
# the cleaned text instead of the untouched messages.
# Re-running is safe: removing stop words from already-cleaned text changes nothing.
df['message'] = df['message'].apply(remove_stopwords)
df['message']

# Text Vectorization

- using **Bag Of Words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings. By default CountVectorizer lowercases
# its input and keeps tokens of two or more word characters.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from every message and build the document-term count matrix.
# NOTE(review): this fit runs before the train/test split, so the vocabulary of `x`
# also contains words that occur only in rows later held out for testing.
x = cv.fit_transform(df['message'])

In [None]:
# (rows, columns) of the count matrix: one row per message, one column per vocabulary term.
x.shape

- x : the independent features (the bag-of-words count matrix)


In [None]:
# Sum of every count in the matrix, i.e. the total number of tokens that were counted.
x.sum()

Total number of word occurrences (the sum of all counts) after vectorization

In [None]:
# Example matrices after vectorization: the first five rows, shown as dense arrays.

print(*(x[row].toarray() for row in range(5)), sep="\n")

Encoding the 'label' column as integers:

- ham = 0

- spam = 1

In [None]:
# Encode the target as integers: ham -> 0, spam -> 1.
# The guard makes the cell safe to re-run. The previous lambda compared the already
# encoded 0/1 values against 'ham' on a second run and turned every row into 1.
# With an explicit mapping, any label other than ham/spam becomes NaN and the integer
# cast raises, instead of the row being silently counted as spam.
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1}).astype('int64')

In [None]:
# Inspect the label column after encoding.
df['label']

In [None]:
# Target vector for the classifier: the 'label' column of df.
y = df['label']

- y : the dependent variable (the target to predict)

# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the raw messages first, then learn the vocabulary from the training rows only.
# Fitting the vectorizer on the full corpus before splitting lets information from the
# test messages leak into the features the model is trained on.
# The same test_size and random_state are used, so the rows land in the same partitions.
msg_train, msg_test, y_train, y_test = train_test_split(
    df['message'], y, test_size=0.20, random_state=10
)

# Re-fit `cv` on the training messages; from here on `cv` matches the feature columns
# the model is trained on and can be used to transform new messages.
cv = CountVectorizer()
x_train = cv.fit_transform(msg_train)
# transform (not fit_transform): test messages are encoded with the training vocabulary,
# and words not seen during training are ignored.
x_test = cv.transform(msg_test)

# Training Model Using Naive Bayes

**Multinomial Naive Bayes**, widely used for text classification

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build the classifier, then fit it on the training counts and labels.
model = MultinomialNB()
model = model.fit(x_train, y_train)  # the fitted estimator is what gets bound to `model`

In [None]:
# Predicted labels for the held-out test rows.
y_pred = model.predict(x_test)

In [None]:
# Show the predictions.
y_pred

# Accuracy

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Cross-tabulate true against predicted labels on the test set.
# Rows are the true labels and columns the predicted labels, both ordered 0 (ham) then 1 (spam).
confusion_m = confusion_matrix(y_test,y_pred)

In [None]:
# Display the confusion matrix.
confusion_m

In [None]:
# Accuracy: the fraction of test messages whose predicted label equals the true label.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)