In [1]:
# loading libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import string
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
# loading data
data = pd.read_csv('spam_data.csv', header=None, names=['message_class', 'message'])
message_class = {'spam':0, 'ham':1}

data['message_class'] = data['message_class'].map(message_class) 

text_data = data['message']
target_data = data['message_class']

In [3]:
# data cleaning
def clean_data(text_data):
    #Converting to lowercase
    text_data = [x.lower() for x in text_data]
    # Removing numbers
    digits = '0123456789'
    text_data = [''.join(c for c in x if c not in digits) for x in text_data]
    # Removing punctuation
    text_data = [''.join(c for c in x if c not in string.punctuation)
                 for x in text_data]
    # Trimming extra whitespace
    text_data = [' '.join(x.split()) for x in text_data]
    return text_data
text_data = clean_data(text_data)

In [4]:
# lemmatizing tokenized texts
en_nlp = spacy.load('en_core_web_sm')
def tokenizer(text_data):
    spacy_data = en_nlp(text_data)
    lemmatized_data = [token.lemma_ for token in spacy_data]
    return lemmatized_data 

In [5]:
# vectorizing texts for model training
min_word_frequency = 10
max_features = 1000
vocab_processor = TfidfVectorizer(min_df=min_word_frequency, max_features=max_features, 
                             tokenizer=tokenizer, stop_words='english')
processed_data = vocab_processor.fit_transform(text_data)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [6]:
# splitting into train and test sets
texts_train, texts_test, target_train, target_test = train_test_split (processed_data, target_data,
                                                                       test_size=0.20, random_state=42)  

In [7]:
# model selection
models = []
names = []

models.append(('LR', LogisticRegression()))
models.append(('smv', SVC()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DCtree', DecisionTreeClassifier()))
models.append(('RmForest', RandomForestClassifier()))  

for name, model in models:
    classifier = model
    classifier.fit(texts_train, target_train)
    kfold = KFold(n_splits=10, random_state=7)
    val_scores = cross_val_score(classifier, texts_train, target_train, cv=kfold)
    msg = "validation score of {} is {}".format(name, val_scores)
    print(msg)
    print('mean:', val_scores.mean())
    print('standard deviation:', val_scores.std())

validation score of LR is [0.97309417 0.96860987 0.96860987 0.9573991  0.96412556 0.97533632
 0.94843049 0.96412556 0.97085202 0.94831461]
mean: 0.9638897566382829
standard deviation: 0.009124642219438373
validation score of smv is [0.88789238 0.867713   0.88340807 0.83856502 0.867713   0.89461883
 0.84977578 0.87892377 0.87219731 0.84494382]
mean: 0.8685750995112611
standard deviation: 0.017904212999400874
validation score of KNN is [0.93497758 0.92600897 0.94618834 0.92152466 0.91928251 0.93721973
 0.89686099 0.91928251 0.94394619 0.88988764]
mean: 0.9235179120270065
standard deviation: 0.01771431369080569
validation score of DCtree is [0.95964126 0.95964126 0.95067265 0.96636771 0.9573991  0.9529148
 0.93721973 0.96636771 0.95067265 0.94157303]
mean: 0.9542469894694412
standard deviation: 0.009164274113505136
validation score of RmForest is [0.97533632 0.97309417 0.97309417 0.97982063 0.96188341 0.97085202
 0.96188341 0.97757848 0.97085202 0.94831461]
mean: 0.9692709225575653
standa

In [8]:
selected_models = []
selected_models.append(('LR', LogisticRegression()))
selected_models.append(('RmForest', RandomForestClassifier()))
msg = 'Logistic Regression'
for name, selected_model in selected_models:
    print (msg)
    classifier = selected_model
    classifier.fit(texts_train, target_train)
    pred_ = classifier.predict(texts_test)
    confusion = confusion_matrix(target_test, pred_)
    print("Confusion matrix:\n{}".format(confusion))
    F1Score = f1_score(target_test, pred_)
    print("f1 score: {:.2f}".format(F1Score))
    clf_auc = roc_auc_score(target_test, classifier.predict_proba(texts_test)[:, 1]) 
    print("AUC: {:.3f}".format(clf_auc))
    msg = 'RMF'  

Logistic Regression
Confusion matrix:
[[134  27]
 [  2 952]]
f1 score: 0.98
AUC: 0.989
RMF
Confusion matrix:
[[140  21]
 [ 13 941]]
f1 score: 0.98
AUC: 0.983
