In [1]:
import string
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

from collections import Counter
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Python script for confusion matrix creation 
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

np.set_printoptions(precision=5)
%matplotlib inline
pd.options.display.max_columns=1000

In [2]:
def updated_dict(d, **kwargs):
    updated_d = d.copy()
    updated_d.update(kwargs)
    return updated_d

def clean_func_names(df):
    df = df.copy()   
    columns = [c for c in df.columns if callable(df[c][0])] 
    for c in columns:
        funcs = []
        for train in df[c]:
            funcs.append(train.__name__)
        df[c] = funcs
    return df
    

In [3]:
# setting hyperparamaters:
SCALED = True
PUNC = False

In [4]:
# Load data
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['spam', 'text'])

# set categorical values of spam to 0 or 1
df['spam'] = df['spam'] == 'spam' # makes True/False instead of "spam" and "ham"
df['spam'] = df['spam'].astype(int)  # number values instead of boolean value

# Adding new feature 'length'
L = []
for i in df.text:
    L.append(len(i))
df['length'] = L

# Add second engineered feature 'num_words'
words = df.copy()
num_words = []
for i in range(len(words.text)):
    value = words['text'][i].split(' ')
    num_words.append(len(value))
num_words
df['num_words'] = num_words

df.head()

Unnamed: 0,spam,text,length,num_words
0,0,"Go until jurong point, crazy.. Available only ...",111,20
1,0,Ok lar... Joking wif u oni...,29,6
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,28
3,0,U dun say so early hor... U c already then say...,49,11
4,0,"Nah I don't think he goes to usf, he lives aro...",61,13


Our first simple model will predict whether message is a spam or ham, just using feature 'lenght' and 'num_words'.

In [5]:
df_len = df[['spam', 'length', 'num_words']]
df_len.head()

Unnamed: 0,spam,length,num_words
0,0,111,20
1,0,29,6
2,1,155,28
3,0,49,11
4,0,61,13


In [6]:
if PUNC == True:
# Get rid of the punctuation
    translator = str.maketrans('', '', string.punctuation)
    df_len.text = df_len.text.apply(lambda x: x.translate(translator))

In [7]:
X_train,  X_test, y_train, y_test =  train_test_split(df_len.length.values, df_len.spam.values, test_size=0.2, random_state=42)

In [8]:
type(X_train)

numpy.ndarray

We don't need to scale features, as we have just one. However, in later models we use more than just this features and therefore as exercise, we do it right now as well.

In [9]:
# We will used MinMaxScaler, which scales values in a way that our new values will be within itnerval <0,1>.
# ATTENTION! With train set we use .fit_transform method(), with test set only .transform()!!!
if SCALED == True:
    scaler = MinMaxScaler()
    X_train_sc = scaler.fit_transform(X_train.reshape(-1, 1))
    X_test_sc = scaler.transform(X_test.reshape(-1, 1))
else:
    X_train_sc = X_train.reshape(-1, 1)
    X_test_sc = X_test.reshape(-1, 1)



## Train and model

In [10]:
def train_logreg(X_train_sc, y_train, **kwargs):
    clf = LogisticRegression(random_state=0, 
                             class_weight='balanced',
                             solver=kwargs.get('solver', 'sag'), # instead of defaults you can have {} which would rise exception
                             penalty=kwargs.get('penalty', 'l2'), 
                             C=kwargs.get('C', 1.0))
    model = clf.fit(X_train_sc, y_train)
    return model 

def train_SVM(X_train_sc, y_train, **kwargs):
    clf = SVC(random_state=0, 
                             class_weight='balanced',
                             kernel=kwargs.get('kernel', 'rbf'), 
                             coef0=kwargs.get('coef0', 0.0), 
                             C=kwargs.get('C', 1.0))
    model = clf.fit(X_train_sc, y_train)
    return model 

def train_random(X_train_sc, y_train, **kwargs):
    clf = RandomForestClassifier(random_state=0, class_weight='balanced', 
                                 bootstrap=kwargs.get('bootstrap', 'True'),
                                 n_estimators=kwargs.get('n_estimators', 100))
   
    model = clf.fit(X_train_sc, y_train)
    return model 


In [11]:
# return as pandas series, with multiple evaulation metrcis (fp, tn, fn, tp)
def eval_model(X_test_sc, y_test, X_train_sc, y_train):
    hp['test_score'] =  model.score(X_test_sc, y_test)
    hp['train_score'] = model.score(X_train_sc, y_train) 
    hp['tn'], hp['fp'], hp['fn'], hp['tp'] = confusion_matrix(y_test, model.predict(X_test_sc)).ravel()
    hp['auc score'] =   roc_auc_score(y_test, model.predict(X_test_sc))
    hp['f1_score'] = f1_score(y_test, model.predict(X_test_sc), average='weighted', labels=np.unique(model.predict(X_test_sc)))
    hp['recall'] = recall_score(y_test, model.predict(X_test_sc), average='weighted', labels=np.unique(model.predict(X_test_sc)))
    hp['precision'] = precision_score(y_test, model.predict(X_test_sc), average='weighted', labels=np.unique(model.predict(X_test_sc)))
    return hp

In [12]:
scores = []
logreg_def_hyperpar = dict(train_function=train_logreg, solver='liblinear', penalty='l1', C=1.0)
SVM_def_hyperpar = dict(train_function=train_SVM, kernel='poly', C=1.0, coef0=0.0)
RF_def_hyperpar = dict(train_function=train_random, bootstrap=True, n_estimators=100)

SVM_hyperparameters = [SVM_def_hyperpar, updated_dict(SVM_def_hyperpar, coef0=0.5),
                       updated_dict(SVM_def_hyperpar, C=0.5),
                       updated_dict(SVM_def_hyperpar,C=0.5, coef0=0.5),
                       updated_dict(SVM_def_hyperpar, C=0.1),
                       updated_dict(SVM_def_hyperpar, coef0=0.5),
                       updated_dict(SVM_def_hyperpar, kernel='linear'),
                       updated_dict(SVM_def_hyperpar,kernel='sigmoid')
                      ]   

RF_hyperparameters =[RF_def_hyperpar, updated_dict(RF_def_hyperpar,bootstrap=False),
                    updated_dict(RF_def_hyperpar,n_estimators=1),
                     updated_dict(RF_def_hyperpar,n_estimators=1000),
                     updated_dict(RF_def_hyperpar,n_estimators=50),
                     updated_dict(RF_def_hyperpar,n_estimators=1000, bootstrap=False)
                    ]
logreg_hyperparameters = [logreg_def_hyperpar, updated_dict(logreg_def_hyperpar, C=0.5),
                         updated_dict(logreg_def_hyperpar, C=0.1),
                         updated_dict(logreg_def_hyperpar,solver='sag', penalty='l2'),
                         updated_dict(logreg_def_hyperpar, solver='sag', penalty='l2', C=0.5),
                         updated_dict(logreg_def_hyperpar,solver='sag', penalty='l2', C=0.1),
                         updated_dict(logreg_def_hyperpar,solver='newton-cg', penalty='l2'),
                         updated_dict(logreg_def_hyperpar,solver='newton-cg', penalty='l2', C=0.5),
                         updated_dict(logreg_def_hyperpar, solver='newton-cg', penalty='l2', C=0.1),
                         updated_dict(logreg_def_hyperpar,solver='lbfgs', penalty='l2'),  
                         updated_dict(logreg_def_hyperpar,solver='lbfgs', penalty='l2', C=0.5),
                          updated_dict(logreg_def_hyperpar,solver='lbfgs', penalty='l2', C=0.1)
                         ]


for hyperparameters in (logreg_hyperparameters + SVM_hyperparameters + RF_hyperparameters):
    hp = logreg_def_hyperpar.copy()
    hp.update(hyperparameters)
    train = hp.get('train_function')
    print(hp)
    model = train(X_train_sc, y_train, **hp)
    hp = eval_model(X_test_sc, y_test, X_train_sc, y_train)
    scores.append(hp)

{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'liblinear', 'penalty': 'l1', 'C': 1.0}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'liblinear', 'penalty': 'l1', 'C': 0.5}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'sag', 'penalty': 'l2', 'C': 1.0}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'sag', 'penalty': 'l2', 'C': 0.5}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'sag', 'penalty': 'l2', 'C': 0.1}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.0}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.5}
{'train_function': <function train_logreg at 0x0000020B08977048>, 'solver': 'newton-cg', '

In [34]:
clean_func_names(pd.DataFrame(scores))
df.to_csv('results_len_numwords.csv', index=False)
df

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,kernel,penalty,bootstrap,coef0,n_estimators
0,train_logreg,0.895977,0.799103,0.826875,0.827296,762,204,20,129,0.799103,0.802333,1.0,liblinear,,l1,,,
1,train_logreg,0.895977,0.799103,0.826875,0.827296,762,204,20,129,0.799103,0.802333,0.5,liblinear,,l1,,,
2,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,0.1,liblinear,,l1,,,
3,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,sag,,l2,,,
4,train_logreg,0.899244,0.798206,0.826547,0.835293,758,208,17,132,0.798206,0.799417,0.5,sag,,l2,,,
5,train_logreg,0.900041,0.790135,0.820389,0.83631,747,219,15,134,0.790135,0.792013,0.1,sag,,l2,,,
6,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,newton-cg,,l2,,,
7,train_logreg,0.899244,0.798206,0.826547,0.835293,758,208,17,132,0.798206,0.799417,0.5,newton-cg,,l2,,,
8,train_logreg,0.899244,0.798206,0.826547,0.835293,758,208,17,132,0.798206,0.799417,0.1,newton-cg,,l2,,,
9,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,lbfgs,,l2,,,


In [14]:
df = clean_func_names(pd.DataFrame(scores))

df = df[['train_function','precision', 'recall', 'f1_score',  'auc score', 'tn', 'fp', 'fn', 'tp', 'test_score', 
        'train_score', 'C', 'solver', 'kernel', 'penalty', 'bootstrap', 'coef0',  
       'n_estimators']]



Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,kernel,penalty,bootstrap,coef0,n_estimators
0,train_logreg,0.895977,0.799103,0.826875,0.827296,762,204,20,129,0.799103,0.802333,1.0,liblinear,,l1,,,
1,train_logreg,0.895977,0.799103,0.826875,0.827296,762,204,20,129,0.799103,0.802333,0.5,liblinear,,l1,,,
2,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,0.1,liblinear,,l1,,,
3,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,sag,,l2,,,
4,train_logreg,0.899244,0.798206,0.826547,0.835293,758,208,17,132,0.798206,0.799417,0.5,sag,,l2,,,
5,train_logreg,0.900041,0.790135,0.820389,0.83631,747,219,15,134,0.790135,0.792013,0.1,sag,,l2,,,
6,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,newton-cg,,l2,,,
7,train_logreg,0.899244,0.798206,0.826547,0.835293,758,208,17,132,0.798206,0.799417,0.5,newton-cg,,l2,,,
8,train_logreg,0.899244,0.798206,0.826547,0.835293,758,208,17,132,0.798206,0.799417,0.1,newton-cg,,l2,,,
9,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,lbfgs,,l2,,,


In [27]:
s = df[['train_function', 'f1_score', 'precision', 'recall', 'C', 'coef0', 'kernel', 'solver', 'penalty', 'bootstrap', 'coef0',  
       'n_estimators']]
s

Unnamed: 0,train_function,f1_score,precision,recall,C,coef0,kernel,solver,penalty,bootstrap,coef0.1,n_estimators
0,train_logreg,0.826875,0.895977,0.799103,1.0,,,liblinear,l1,,,
1,train_logreg,0.826875,0.895977,0.799103,0.5,,,liblinear,l1,,,
2,train_logreg,0.827256,0.899422,0.799103,0.1,,,liblinear,l1,,,
3,train_logreg,0.827256,0.899422,0.799103,1.0,,,sag,l2,,,
4,train_logreg,0.826547,0.899244,0.798206,0.5,,,sag,l2,,,
5,train_logreg,0.820389,0.900041,0.790135,0.1,,,sag,l2,,,
6,train_logreg,0.827256,0.899422,0.799103,1.0,,,newton-cg,l2,,,
7,train_logreg,0.826547,0.899244,0.798206,0.5,,,newton-cg,l2,,,
8,train_logreg,0.826547,0.899244,0.798206,0.1,,,newton-cg,l2,,,
9,train_logreg,0.827256,0.899422,0.799103,1.0,,,lbfgs,l2,,,


In [33]:
s['solver'][15] = np.nan
s['solver'][15]

'liblinear'

In [15]:
df[df['train_function']=='train_logreg'].nlargest(3, 'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,kernel,penalty,bootstrap,coef0,n_estimators
2,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,0.1,liblinear,,l1,,,
3,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,sag,,l2,,,
6,train_logreg,0.899422,0.799103,0.827256,0.83581,759,207,17,132,0.799103,0.801212,1.0,newton-cg,,l2,,,


In [16]:
df[df['train_function']=='train_SVM'].nlargest(3, 'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,kernel,penalty,bootstrap,coef0,n_estimators
12,train_SVM,0.866368,1.0,0.9284,0.5,966,0,149,0,0.866368,0.864932,1.0,liblinear,poly,l1,,0.0,
14,train_SVM,0.866368,1.0,0.9284,0.5,966,0,149,0,0.866368,0.86538,0.5,liblinear,poly,l1,,0.0,
16,train_SVM,0.866368,1.0,0.9284,0.5,966,0,149,0,0.866368,0.865829,0.1,liblinear,poly,l1,,0.0,


In [17]:
df[df['train_function']=='train_random'].nlargest(3,'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,kernel,penalty,bootstrap,coef0,n_estimators
24,train_random,0.899823,0.842152,0.85993,0.835112,816,150,26,123,0.842152,0.849226,1.0,liblinear,,l1,True,,50.0
20,train_random,0.898811,0.842152,0.859758,0.832274,817,149,27,122,0.842152,0.849675,1.0,liblinear,,l1,True,,100.0
23,train_random,0.898811,0.842152,0.859758,0.832274,817,149,27,122,0.842152,0.849675,1.0,liblinear,,l1,True,,1000.0


In [18]:
df.nlargest(3, 'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,kernel,penalty,bootstrap,coef0,n_estimators
12,train_SVM,0.866368,1.0,0.9284,0.5,966,0,149,0,0.866368,0.864932,1.0,liblinear,poly,l1,,0.0,
14,train_SVM,0.866368,1.0,0.9284,0.5,966,0,149,0,0.866368,0.86538,0.5,liblinear,poly,l1,,0.0,
16,train_SVM,0.866368,1.0,0.9284,0.5,966,0,149,0,0.866368,0.865829,0.1,liblinear,poly,l1,,0.0,
