In [1]:
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
import string

# SUPPORT VECTOR MACHINE

In [2]:
df = pd.read_csv("SMSSpamCollection.txt", sep="\t", header=None,  names=["label", "Content"])

In [3]:
en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
data = df.copy()

In [4]:
## count words in the email
data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

In [5]:
def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation]
    nb_punctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_punctuation/(total), 4)*100 

In [6]:
# Count the rate of punctuation in email
data['punctuation_rate'] = data['Content'].apply(lambda x : count_punctuation(x))

In [7]:
## cleaning and stemming emails
def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split('\W+', result)
    final_result = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return final_result

## SVM WITH COUNTVECTORIZER

In [9]:
# Vectorisation using CountVectorizer
vectorisation_full = CountVectorizer(analyzer=clean_email)
vect_fnal = vectorisation_full.fit_transform(data['Content'])

In [10]:
## creating new dataframe with all the information that we need
all_data = pd.concat([pd.DataFrame(vect_fnal.toarray()),data['Content_len'], data['punctuation_rate']], axis=1)

In [8]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

In [25]:
X_train, X_test, y_train, y_test = train_test_split(all_data,
                                                   data['label'],
                                                   test_size=0.2)

In [11]:
from sklearn import svm

In [28]:
alg_svm = svm.SVC(kernel= 'linear')
alg_svm.fit(X_train, y_train)

SVC(kernel='linear')

In [29]:
predictions = alg_svm.predict(X_test)

In [30]:
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

In [31]:
len(predictions)

1115

In [34]:
precision, recall, fscore, _ = score(y_test, predictions, pos_label='spam', average='binary')

In [37]:
print('Precission: {} / Recall: {} / Accuracy : {}'.format(round(precision, 3),
                                                          round(recall, 3),
                                                          round((predictions==y_test).sum()/len(predictions),3)))

Precission: 0.979 / Recall: 0.89 / Accuracy : 0.982


In [27]:
def generate_model_report(y_actual, y_predicted):
    print("Accuracy = " , accuracy_score(y_actual, y_predicted))
    print("Precision = " ,precision_score(y_actual, y_predicted, pos_label='spam'))
    print("Recall = " ,recall_score(y_actual, y_predicted, pos_label='spam'))
    print("F1 Score = " ,f1_score(y_actual, y_predicted, pos_label='spam'))
    pass

In [48]:
generate_model_report(y_test, predictions)

Accuracy =  0.9820627802690582
Precision =  0.9785714285714285
Recall =  0.8896103896103896
F1 Score =  0.9319727891156462


* **With CROSS-VALIATION**

In [12]:
from sklearn.model_selection import KFold, cross_val_score

In [13]:
alg_svm_cross_validation = svm.SVC(kernel = 'linear')

k_fold = KFold(n_splits=4)
cross = cross_val_score(alg_svm_cross_validation, all_data, data['label'], cv=k_fold, scoring='accuracy', n_jobs=-1)

In [14]:
cross

array([0.97774587, 0.9856425 , 0.9798995 , 0.98420675])

In [15]:
cross.mean()

0.9818736539842067

## SVM WITH TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Vectorisation using TfidfVectorizer
vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_fnal = vectorisation_full.fit_transform(data['Content'])

In [21]:
## creating new dataframe with all the information that we need
all_data2 = pd.concat([pd.DataFrame(vect_fnal.toarray()),data['Content_len'], data['punctuation_rate']], axis=1)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(all_data2,
                                                   data['label'],
                                                   test_size=0.2)

In [24]:
alg_svm = svm.SVC(kernel= 'linear')
alg_svm.fit(X_train, y_train)

SVC(kernel='linear')

In [25]:
predictions = alg_svm.predict(X_test)

In [28]:
generate_model_report(y_test, predictions)

Accuracy =  0.9838565022421525
Precision =  1.0
Recall =  0.88
F1 Score =  0.9361702127659575


# RANDOM FOREST

In [29]:
X_train, X_test, y_train, y_test = train_test_split(all_data,
                                                   data['label'],
                                                   test_size=0.2)

In [31]:
from sklearn.ensemble import RandomForestClassifier

alg_RandomForest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
model = alg_RandomForest.fit(X_train, y_train)

In [32]:
predictions = model.predict(X_test)

In [33]:
generate_model_report(y_test, predictions)

Accuracy =  0.9345291479820628
Precision =  1.0
Recall =  0.54375
F1 Score =  0.7044534412955465


* **With CROSS-VALIATION**

In [34]:
from sklearn.model_selection import KFold, cross_val_score

In [35]:
alg_RandomForest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)

k_fold = KFold(n_splits=4)
cross = cross_val_score(alg_RandomForest, all_data, data['label'], cv=k_fold, scoring='accuracy', n_jobs=-1)

In [36]:
cross

array([0.94831299, 0.94544149, 0.93826274, 0.94400574])

In [37]:
cross.mean()

0.9440057430007178

* **Hyper parameters tunning**

In [39]:
from sklearn.model_selection import GridSearchCV

In [None]:
alg_svm_params = svm.SVC()
params = {'kernel': ['linear', 'rbf'],
          'gamma': [0.01, 0.001]}
hyper_params_grid = GridSearchCV(alg_svm_params, param_grid=params, cv=4, n_jobs=-1)
hyper_params_models = hyper_params_grid.fit(all_data, data['label'])

In [None]:
pd.DataFrame(hyper_params_models)