In [1]:
import pandas as pd
# vectorizer - it will take our training value and convert it into frequency table , like Sparse table
from sklearn.feature_extraction.text import CountVectorizer
# importing naive_bayes multinomial because it works well with text data
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score , f1_score , confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Read the CSV as Latin-1: per the original note, the file is not UTF-8
# encoded, so the encoding has to be passed explicitly.
df = pd.read_csv(
    "spam.csv",
    encoding='latin-1',
)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# Domain Analysis

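1] The file has 2 meaningful features (`v1`, `v2`), but it loads with 5 columns: pandas also creates three `Unnamed` columns because some rows contain extra comma-separated fields. The Latin-1 encoding only controls how the bytes are decoded into characters; it does not add columns.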

2] Three of them are corrupted features, which we will drop later.

3] The input feature is the raw text of each message (data), and the output is Spam/Ham (label).

# Basic Check

1] `v1` and `v2` contain no null values; the three `Unnamed` columns are almost entirely null (5522 to 5566 of 5572 rows).

In [3]:
# Summarise column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
# Count missing values per column.
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [5]:
# Display the frame again (unchanged since it was loaded) before feature engineering.
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# no EDA required

# Feature Engineering

In [6]:
# Encode the text labels in 'v1' as integers. LabelEncoder numbers the classes
# in sorted order, so 'ham' -> 0 and 'spam' -> 1.
# LabelEncoder is already imported in the first cell, so it is not re-imported here.
le = LabelEncoder()

df['v1'] = le.fit_transform(df['v1'])

In [7]:
# Sanity check: the encoded label column should contain exactly two classes.
df['v1'].unique()

array([0, 1])

# Feature Selection

1] Dropped unnecessary columns

2] renamed both columns

In [8]:
# Drop the three mostly-empty 'Unnamed' columns.
# errors='ignore' makes this cell safe to re-run: once the columns are gone,
# a second run is a no-op instead of raising KeyError. The result is rebound
# to `df` rather than mutating the frame in place.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [9]:
# Give the label and text columns descriptive names.
column_names = {'v1': 'Label', 'v2': 'Data'}
df = df.rename(columns=column_names)

# Creating Model

In [10]:
# Separate the message text (model input) from the encoded label (target).
x = df['Data']
y = df['Label']

In [11]:
# train-test split: hold out 30% of the rows for testing; random_state pins the
# shuffle so the split is reproducible.
# NOTE(review): the split is not stratified — consider stratify=y if the label
# classes are imbalanced, so both splits keep the same spam/ham ratio.

x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.3 , random_state=40)

In [12]:
# Learn the vocabulary from the training messages only, then convert those
# same messages into a sparse document-term count matrix.
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

vcized = vectorizer.transform(x_train)

In [13]:
# Inspect the shape and storage format of the training count matrix.
vcized

<3900x7329 sparse matrix of type '<class 'numpy.int64'>'
	with 52165 stored elements in Compressed Sparse Row format>

# Selecting model

In [14]:
# Fit a multinomial naive Bayes classifier on the training word counts.
n_bay = MultinomialNB()
n_bay.fit(X=vcized, y=y_train)

In [15]:
# Prediction: vectorise the held-out messages with the already-fitted
# vocabulary, then predict their labels.
x_test_counts = vectorizer.transform(x_test)
pred = n_bay.predict(x_test_counts)

In [16]:
# Class balance of the full label column (before the train/test split).
y.value_counts()

Label
0    4825
1     747
Name: count, dtype: int64

In [17]:
# Evaluating Model

In [18]:
# Rows are true labels, columns are predicted labels (in sorted label order).
confusion_matrix(y_test , pred)

array([[1451,    6],
       [  17,  198]], dtype=int64)

In [19]:
# Fraction of test messages classified correctly.
accuracy_score(y_test , pred)

0.986244019138756

In [20]:
# F1 score for the positive class (label 1, scikit-learn's default pos_label).
f1_score(y_test , pred)

0.9451073985680191