In [53]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [54]:
# Load the dataset from the local CSV file.
# NOTE(review): the 'latin' encoding is presumably needed because the file is
# not valid UTF-8 — confirm against the data source.
df = pd.read_csv('spam.csv', encoding='latin')

In [55]:
# Preview the first rows to see which columns the CSV produced.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Preprocessing

In [56]:
# Give the raw CSV columns descriptive names. The three "Unnamed" columns are
# kept as text1..text3 so their contents can be appended to the message text.
column_names = {
    'v1': 'label',
    'v2': 'text',
    'Unnamed: 2': 'text1',
    'Unnamed: 3': 'text2',
    'Unnamed: 4': 'text3',
}
df = df.rename(columns=column_names)

In [57]:
# Replace missing overflow fragments with empty strings so that string
# concatenation with the main text does not yield NaN.
for overflow_col in ('text1', 'text2', 'text3'):
    df[overflow_col] = df[overflow_col].replace(np.nan, "")

In [58]:
# Append the overflow fragments to the main message (no separator is inserted).
df['text'] = df['text'].str.cat([df['text1'], df['text2'], df['text3']])

In [59]:
# Keep only the class label and the merged message text.
df = df.loc[:, ['label', 'text']]

In [60]:
# Preview the frame after reducing it to the label and text columns.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [61]:
# (rows, columns) of the cleaned frame.
df.shape

(5572, 2)

In [62]:
# Column dtypes and non-null counts, to confirm no missing values remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [63]:
# Separate the message text (features) from the ham/spam label (target),
# selecting by column position: 0 = label, 1 = text.
X, y = df.iloc[:, 1], df.iloc[:, 0]
print(X, y, sep="\n")

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object
0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object


In [64]:
## Convert ham/spam to 0/1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Always encode from the original string labels in `df` rather than from `y`
# itself: `y = le.fit_transform(y)` refits the encoder on already-encoded
# integers when this cell is re-run, after which `le.inverse_transform`
# returns 0/1 instead of 'ham'/'spam'.
y = le.fit_transform(df['label'])

In [65]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only, then encode both
# splits as dense token-count matrices.
cv = CountVectorizer()
train_docs = X_train.values.ravel()
test_docs = X_test.values.ravel()
X_train = cv.fit_transform(train_docs).toarray()
X_test = cv.transform(test_docs).toarray()

In [None]:
# Bar chart of the number of messages per class label.
sns.set_theme(style="darkgrid")
ax = sns.countplot(data=df, x="label")

## Oversampling

In [68]:
from imblearn.over_sampling import SMOTE
# Balance the training set by synthesising additional minority-class samples.
# `fit_resample` is the current imbalanced-learn API (the old `fit_sample`
# alias has been removed), and the fixed seed makes the synthetic samples
# reproducible across runs.
smt = SMOTE(random_state=0)
X_train, y_train = smt.fit_resample(X_train, y_train)

In [69]:
# Shapes of the training features and labels after resampling.
print(X_train.shape, y_train.shape, sep="\n")

(6782, 7158)
(6782,)


In [70]:
# Count how many training samples each encoded class has.
unique, counts = np.unique(y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 3391, 1: 3391}

## Training (SVM)

In [71]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier fitted on the training set.
svm = SVC(
    kernel='rbf',
    random_state=0,
)
svm.fit(X_train, y_train)

SVC(random_state=0)

In [72]:
# Predict labels for the held-out test messages with the trained SVM.
y_pred = svm.predict(X_test)

In [73]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (rows: true class, columns: predicted class), followed by
# the overall accuracy as the cell's displayed value.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1348   86]
 [  43  195]]


0.9228468899521531

## Training (Naive Bayes)

In [74]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier (fit returns the estimator itself)
# and predict on the test split.
nb = MultinomialNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [75]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (rows: true class, columns: predicted class) for the
# Naive Bayes predictions, followed by the overall accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1420   14]
 [  23  215]]


0.9778708133971292

## Prediction

In [76]:
# Classify a hand-written example message with the Naive Bayes model and
# map the numeric prediction back to its string label.
test = [
    'Earn cashback of flat  500 on your online international transactions* '
    'on cumulative spends of  15,000 and above on your ICICI Bank Credit or Debit Card.'
]
test_vec = cv.transform(test).toarray()
result = nb.predict(test_vec)
print(le.inverse_transform(result))

['spam']


## Saving the Model

In [77]:
import pickle
# Persist the classifier together with the fitted vectorizer and label
# encoder as a single tuple.
pkl_filename = "spam_model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump((nb, cv, le), model_file)

#