In [21]:
import time
import string
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score,  roc_curve

from collections import Counter
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Python script for confusion matrix creation 
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

np.set_printoptions(precision=5)
%matplotlib inline
pd.options.display.max_columns=1000

In [22]:
def updated_dict(d, **kwargs):
    updated_d = d.copy()
    updated_d.update(kwargs)
    return updated_d

def clean_func_names(df):
    df = df.copy()   
    columns = [c for c in df.columns if callable(df[c][0])] 
    for c in columns:
        funcs = []
        for train in df[c]:
            funcs.append(train.__name__)
        df[c] = funcs
    return df
    

In [23]:
# setting hyperparamaters:
SCALED = True

In [24]:
# Load data
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['spam', 'text'])
df = df[:3000]
# set categorical values of spam to 0 or 1
df['spam'] = df['spam'] == 'spam' # makes True/False instead of "spam" and "ham"
df['spam'] = df['spam'].astype(int)  # number values instead of boolean value


# Adding new feature 'length'
L = []
for i in df.text:
    L.append(len(i))
df['length'] = L

# Add second engineered feature 'num_words'
words = df.copy()
num_words = []
for i in range(len(words.text)):
    value = words['text'][i].split(' ')
    num_words.append(len(value))
num_words
df['num_words'] = num_words

# TODO : creating features as #punctuation, <>, ...

#Create sub DataFrame
sub_df = df[['text', 'length', 'num_words']]

In [25]:
# Split train test
X_train,  X_test, y_train, y_test =  train_test_split(sub_df, df.spam.values, test_size=0.2, random_state=42)


We don't need to scale features, as we have just one. However, in later models we use more than just this features and therefore as exercise, we do it right now as well.

In [26]:
# Use TweetTokenizer 
tknzr = TweetTokenizer()
X_train['text'] = X_train.text.apply(tknzr.tokenize)
X_test['text'] = X_test.text.apply(tknzr.tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [27]:
text_train = X_train.text
text_test = X_test.text

# We splitted data and therefore we need to get list of indices to iterate over if we want to create 'documents' variable
id_tr = list(text_train.index.values)
id_tr.sort()
doc_train = []
for i in id_tr:
    doc_train.append(' '.join(text_train[i]))
    
# The same for test set
id_ts = list(text_test.index.values)
id_ts.sort()

doc_test = []
for i in id_ts:
    doc_test.append(' '.join(text_test[i]))      


In [28]:
# Method .toarray() assures that we gain dense matrix
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_text_tr = tfidf.fit_transform(doc_train).toarray()
X_text_ts = tfidf.transform(doc_test).toarray()

#TODO  
DO_PCA = True
```python
pca = PCA(n_components=100)
pca.fit(X_text_tr)
X_text_tr_pca = pca.transform(X_text_tr)
X_text_ts_pca = pca.transform(X_text_ts)
```

In [29]:
X_train = X_train[['length', 'num_words']].values
X_test = X_test[['length', 'num_words']].values

In [30]:
# We will used MinMaxScaler, which scales values in a way that our new values will be within itnerval <0,1>.
# ATTENTION! With train set we use .fit_transform method(), with test set only .transform()!!!
if SCALED == True:
    scaler = MinMaxScaler()
    X_train_sc = scaler.fit_transform(X_train)#.reshape(-1, 1))
    X_test_sc = scaler.transform(X_test)#.reshape(-1, 1))
#else:
 #   X_train_sc = X_train.reshape(-1, 1)
     #X_test_sc = X_test.reshape(-1, 1)



In [31]:
X_train_sc = np.concatenate((X_train_sc, X_text_tr), axis=1)
X_test_sc = np.concatenate((X_test_sc, X_text_ts), axis=1)

## Train and model

In [32]:
def train_logreg(X_train_sc, y_train, **kwargs):
    clf = LogisticRegression(random_state=0, 
                             class_weight='balanced',
                             solver=kwargs.get('solver', 'sag'), # instead of defaults you can have {} which would rise exception
                             penalty=kwargs.get('penalty', 'l2'), 
                             C=kwargs.get('C', 1.0))
    model = clf.fit(X_train_sc, y_train)
    return model 

def train_SVM(X_train_sc, y_train, **kwargs):
    clf = SVC(random_state=0, 
                             class_weight='balanced',
                             kernel=kwargs.get('kernel', 'rbf'), # instead of defaults you can have {} which would rise exception
                             coef0=kwargs.get('coef0', 0.0), 
                             C=kwargs.get('C', 1.0))
    model = clf.fit(X_train_sc, y_train)
    return model 

def train_random(X_train_sc, y_train, **kwargs):
    clf = RandomForestClassifier(random_state=0, class_weight='balanced', 
                                 bootstrap=kwargs.get('bootstrap', 'True'),
                                 n_estimators=kwargs.get('n_estimators', 100))
   
    model = clf.fit(X_train_sc, y_train)
    return model 


In [33]:
# return as pandas series, with multiple evaulation metrcis (fp, tn, fn, tp)
def eval_model(X_test_sc, y_test, X_train_sc, y_train):
    hp['test_score'] =  model.score(X_test_sc, y_test)
    hp['train_score'] = model.score(X_train_sc, y_train) 
    hp['tn'], hp['fp'], hp['fn'], hp['tp'] = confusion_matrix(y_test, model.predict(X_test_sc)).ravel()
    hp['auc score'] =  roc_auc_score(y_test, model.predict(X_test_sc))
    hp['f1_score'] = f1_score(y_test, model.predict(X_test_sc), average='weighted', labels=np.unique(model.predict(X_test_sc)))
    hp['recall'] = recall_score(y_test, model.predict(X_test_sc), average='weighted', labels=np.unique(model.predict(X_test_sc)))
    hp['precision'] = precision_score(y_test, model.predict(X_test_sc), average='weighted', labels=np.unique(model.predict(X_test_sc)))
    return hp

```python
hp['test_score'] =  eval_model(X_test_sc, y_test)
    hp['train_score'] = eval_model(X_train_sc, y_train) 
    hp['tn'], hp['fp'], hp['fn'], hp['tp'] = confusion_matrix(y_test, model.predict(X_test_sc)).ravel()
    hp['auc score'] =   roc_auc_score(y_test, model.predict(X_test_sc))
    scores.append(hp)
```

In [38]:
scores = []
logreg_def_hyperpar = dict(train_function=train_logreg, solver='liblinear', penalty='l1', C=1.0)
SVM_def_hyperpar = dict(train_function=train_SVM, kernel='rbf', C=1.0, coef0=0.0)
RF_def_hyperpar = dict(train_function=train_random, bootstrap=True, n_estimators=1)

SVM_hyperparameters = [SVM_def_hyperpar, updated_dict(SVM_def_hyperpar, coef0=0.5),
                       updated_dict(SVM_def_hyperpar, C=0.5),
                       updated_dict(SVM_def_hyperpar,C=0.5, coef0=0.5),
                       updated_dict(SVM_def_hyperpar, C=0.1),
                       updated_dict(SVM_def_hyperpar, coef0=0.5),
                       updated_dict(SVM_def_hyperpar, kernel='linear'),
                       updated_dict(SVM_def_hyperpar,kernel='sigmoid')
                      ]   

RF_hyperparameters =[RF_def_hyperpar,updated_dict(RF_def_hyperpar, n_estimators=10),
                     updated_dict(RF_def_hyperpar, n_estimators=20),
                     updated_dict(RF_def_hyperpar, n_estimators=50),
                     updated_dict(RF_def_hyperpar, n_estimators=100), 
                     updated_dict(RF_def_hyperpar,bootstrap=False),
                     updated_dict(RF_def_hyperpar,n_estimators=1),
                     updated_dict(RF_def_hyperpar,n_estimators=1000),
                     updated_dict(RF_def_hyperpar,n_estimators=50),
                     updated_dict(RF_def_hyperpar,n_estimators=1000, bootstrap=False)
                    ]
logreg_hyperparameters = [logreg_def_hyperpar, updated_dict(logreg_def_hyperpar, C=0.5),
                         updated_dict(logreg_def_hyperpar, C=0.1),
                         updated_dict(logreg_def_hyperpar,solver='sag', penalty='l2'),
                         updated_dict(logreg_def_hyperpar, solver='sag', penalty='l2', C=0.5),
                         updated_dict(logreg_def_hyperpar,solver='sag', penalty='l2', C=0.1),
                         updated_dict(logreg_def_hyperpar,solver='newton-cg', penalty='l2'),
                         updated_dict(logreg_def_hyperpar,solver='newton-cg', penalty='l2', C=0.5),
                         updated_dict(logreg_def_hyperpar, solver='newton-cg', penalty='l2', C=0.1),
                         updated_dict(logreg_def_hyperpar,solver='lbfgs', penalty='l2'),  
                         updated_dict(logreg_def_hyperpar,solver='lbfgs', penalty='l2', C=0.5),
                         updated_dict(logreg_def_hyperpar,solver='lbfgs', penalty='l2', C=0.1)
                        ]

start_time = time.time()
for hyperparameters in (logreg_hyperparameters + RF_hyperparameters): #+ SVM_hyperparameters
    hp = logreg_def_hyperpar.copy()
    hp.update(hyperparameters)
    train = hp.get('train_function')
    print(hp)
    model = train(X_train_sc, y_train, **hp)
    hp = eval_model(X_test_sc, y_test, X_train_sc, y_train)
    scores.append(hp)
print("I am done!")
print("--- %s seconds ---" % (time.time() - start_time))

{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'liblinear', 'penalty': 'l1', 'C': 1.0}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'liblinear', 'penalty': 'l1', 'C': 0.5}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'sag', 'penalty': 'l2', 'C': 1.0}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'sag', 'penalty': 'l2', 'C': 0.5}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'sag', 'penalty': 'l2', 'C': 0.1}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.0}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.5}
{'train_function': <function train_logreg at 0x00000144B4BD9598>, 'solver': 'newton-cg', '

In [39]:
clean_func_names(pd.DataFrame(scores))

Unnamed: 0,C,auc score,bootstrap,f1_score,fn,fp,n_estimators,penalty,precision,recall,solver,test_score,tn,tp,train_function,train_score
0,1.0,0.855238,,0.858337,9,89,,l1,0.910529,0.836667,liblinear,0.836667,436,66,train_logreg,0.860833
1,0.5,0.835238,,0.831016,9,110,,l1,0.903302,0.801667,liblinear,0.801667,415,66,train_logreg,0.83625
2,0.1,0.818095,,0.799906,8,135,,l1,0.898872,0.761667,liblinear,0.761667,390,67,train_logreg,0.796667
3,1.0,0.76381,,0.830901,22,94,,l2,0.877574,0.806667,sag,0.806667,431,53,train_logreg,0.911667
4,0.5,0.778095,,0.82081,18,107,,l2,0.882321,0.791667,sag,0.791667,418,57,train_logreg,0.875417
5,0.1,0.811429,,0.813666,11,121,,l2,0.89505,0.78,sag,0.78,404,64,train_logreg,0.837083
6,1.0,0.76381,,0.830901,22,94,,l2,0.877574,0.806667,newton-cg,0.806667,431,53,train_logreg,0.911667
7,0.5,0.778095,,0.82081,18,107,,l2,0.882321,0.791667,newton-cg,0.791667,418,57,train_logreg,0.875417
8,0.1,0.811429,,0.813666,11,121,,l2,0.89505,0.78,newton-cg,0.78,404,64,train_logreg,0.837083
9,1.0,0.76381,,0.830901,22,94,,l2,0.877574,0.806667,lbfgs,0.806667,431,53,train_logreg,0.911667


In [46]:
df = clean_func_names(pd.DataFrame(scores))

df = df[['train_function','precision', 'recall', 'f1_score',  'auc score', 'tn', 'fp', 'fn', 'tp', 'test_score', 
        'train_score', 'C',  'solver', 'penalty', 'bootstrap',  
       'n_estimators']]

df

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,penalty,bootstrap,n_estimators
0,train_logreg,0.910529,0.836667,0.858337,0.855238,436,89,9,66,0.836667,0.860833,1.0,liblinear,l1,,
1,train_logreg,0.903302,0.801667,0.831016,0.835238,415,110,9,66,0.801667,0.83625,0.5,liblinear,l1,,
2,train_logreg,0.898872,0.761667,0.799906,0.818095,390,135,8,67,0.761667,0.796667,0.1,liblinear,l1,,
3,train_logreg,0.877574,0.806667,0.830901,0.76381,431,94,22,53,0.806667,0.911667,1.0,sag,l2,,
4,train_logreg,0.882321,0.791667,0.82081,0.778095,418,107,18,57,0.791667,0.875417,0.5,sag,l2,,
5,train_logreg,0.89505,0.78,0.813666,0.811429,404,121,11,64,0.78,0.837083,0.1,sag,l2,,
6,train_logreg,0.877574,0.806667,0.830901,0.76381,431,94,22,53,0.806667,0.911667,1.0,newton-cg,l2,,
7,train_logreg,0.882321,0.791667,0.82081,0.778095,418,107,18,57,0.791667,0.875417,0.5,newton-cg,l2,,
8,train_logreg,0.89505,0.78,0.813666,0.811429,404,121,11,64,0.78,0.837083,0.1,newton-cg,l2,,
9,train_logreg,0.877574,0.806667,0.830901,0.76381,431,94,22,53,0.806667,0.911667,1.0,lbfgs,l2,,


In [41]:
df[df['train_function']=='train_logreg'].nlargest(3, 'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,penalty,bootstrap,n_estimators
0,train_logreg,0.910529,0.836667,0.858337,0.855238,436,89,9,66,0.836667,0.860833,1.0,liblinear,l1,,
1,train_logreg,0.903302,0.801667,0.831016,0.835238,415,110,9,66,0.801667,0.83625,0.5,liblinear,l1,,
3,train_logreg,0.877574,0.806667,0.830901,0.76381,431,94,22,53,0.806667,0.911667,1.0,sag,l2,,


In [42]:
df[df['train_function']=='train_SVM'].nlargest(3, 'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,penalty,bootstrap,n_estimators


In [43]:
df[df['train_function']=='train_random'].nlargest(3,'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,penalty,bootstrap,n_estimators
12,train_random,0.838291,0.831667,0.834854,0.635238,471,54,47,28,0.831667,0.95,1.0,liblinear,l1,True,1.0
18,train_random,0.838291,0.831667,0.834854,0.635238,471,54,47,28,0.831667,0.95,1.0,liblinear,l1,True,1.0
13,train_random,0.825542,0.871667,0.83121,0.532381,517,8,69,6,0.871667,0.989167,1.0,liblinear,l1,True,10.0


In [44]:
df.nlargest(3, 'f1_score')

Unnamed: 0,train_function,precision,recall,f1_score,auc score,tn,fp,fn,tp,test_score,train_score,C,solver,penalty,bootstrap,n_estimators
0,train_logreg,0.910529,0.836667,0.858337,0.855238,436,89,9,66,0.836667,0.860833,1.0,liblinear,l1,,
12,train_random,0.838291,0.831667,0.834854,0.635238,471,54,47,28,0.831667,0.95,1.0,liblinear,l1,True,1.0
18,train_random,0.838291,0.831667,0.834854,0.635238,471,54,47,28,0.831667,0.95,1.0,liblinear,l1,True,1.0
