# Detecting the spam and legitimate messages in SMS using ML

## Loading data

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
data = pd.read_csv('./data/spam.txt', delimiter='\t' , header=None)

In [7]:
df = pd.DataFrame(data)

In [14]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
df.rename(columns={0: 'label', 1: 'text'}, inplace=True)

In [17]:
df = df[['text', 'label']]

In [18]:
df.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [19]:
X = df['text']
y = df['label']

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train.head()

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
Name: text, dtype: object

In [25]:
y_train.head()

1978    spam
3989     ham
3935     ham
4078     ham
4086    spam
Name: label, dtype: object

# we will use CountVectorizer to vectorize our text

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
vectorizer = CountVectorizer()

In [29]:
X_train = vectorizer.fit_transform(X_train)

In [30]:
X_train

<4457x7702 sparse matrix of type '<class 'numpy.int64'>'
	with 59296 stored elements in Compressed Sparse Row format>

In [31]:
X_test = vectorizer.transform(X_test)

In [32]:
X_train.shape

(4457, 7702)

In [33]:
X_test.shape

(1115, 7702)

In [35]:
pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,02,0207,...,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


 ###  our features have just 2 different values: 0 and 1 and also our Target variable have just 2 different values spam and ham . So --> The best model will be Beroulli Naive Bayes.

In [36]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [37]:
clf = BernoulliNB()
clf.fit(X_train, y_train)

In [38]:
y_perdict = clf.predict(X_test)

In [39]:
y_perdict

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [40]:
accuracy_score(y_test, y_perdict)

0.9820627802690582