In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv('spam.csv')

In [4]:
df.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.describe()

Unnamed: 0,Label,EmailText
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
x = df['EmailText']
y = df['Label']

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test,y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

In [8]:
print(x_train.shape)
print(y_train.shape)
x_test.shape

(4457,)
(4457,)


(1115,)

In [10]:
tuned_parameters = {'kernel':['linear','rbf'],'gamma':[1e-3,1e-4],'C':[1,10,100,1000]}

In [11]:
cv = CountVectorizer()
features = cv.fit_transform(x_train)


In [12]:
model = GridSearchCV( svm.SVC(),tuned_parameters)

model.fit(features,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [13]:
model.best_params_

{'C': 1, 'gamma': 0.001, 'kernel': 'linear'}

In [14]:
features_test = cv.transform(x_test)

In [15]:
model.score(features_test,y_test)

0.979372197309417

In [18]:
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(features, y_train)
Spam_model.score(features_test,y_test)


0.9766816143497757

In [None]:
#Using NLTK

In [74]:
import nltk
from nltk.corpus import stopwords
import string

In [81]:
dataframe = pd.read_csv('spam.csv')

In [82]:
dataframe.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [87]:
dataframe.shape

(5572, 2)

In [88]:
df.columns

Index(['Label', 'EmailText'], dtype='object')

In [89]:
df.drop_duplicates(inplace = True)

In [90]:
df.shape

(5169, 2)

In [91]:
df.isnull().sum()

Label        0
EmailText    0
dtype: int64

In [93]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo-i5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
def process_text(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    cleanwords = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

    return cleanwords

In [98]:
dataframe['EmailText'].head().apply(process_text)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: EmailText, dtype: object

In [99]:
message_bow = CountVectorizer(analyzer=process_text).fit_transform(dataframe['EmailText'])

In [100]:
x_train,x_test,y_train,y_test = train_test_split(message_bow,dataframe['Label'],test_size = 0.20,random_state = 0)

In [102]:
classifier = MultinomialNB().fit(x_train,y_train)

In [103]:
classifier.score(x_test,y_test)

0.957847533632287

In [104]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
prediction = classifier.predict(x_train)
print(classification_report(y_train,prediction))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3876
        spam       0.98      0.98      0.98       581

    accuracy                           0.99      4457
   macro avg       0.99      0.99      0.99      4457
weighted avg       0.99      0.99      0.99      4457



In [105]:
print(confusion_matrix(y_train,prediction))

[[3864   12]
 [  12  569]]


In [106]:
prediction = classifier.predict(x_test)
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

         ham       0.99      0.96      0.97       949
        spam       0.81      0.93      0.87       166

    accuracy                           0.96      1115
   macro avg       0.90      0.95      0.92      1115
weighted avg       0.96      0.96      0.96      1115



In [107]:
print(confusion_matrix(y_test,prediction))

[[914  35]
 [ 12 154]]
