In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

# scikit-learn's KNN functions emit FutureWarning messages, so the two lines below suppress that warning category
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  

In [7]:
## The text file holding these data is tab-delimited: the label comes first, then the message
## NOTE(review): read_csv's default quote handling is in effect here; if any message
## contains an unbalanced double quote, rows could be merged -- confirm the row count.
sms = pd.read_csv("sms_spam.txt", sep='\t', names=['Label','Message'])

## Train-test split (20% held out; the fixed random_state keeps the split repeatable)
from sklearn.model_selection import train_test_split
train, test = train_test_split(sms, test_size=0.2, random_state=8)

## Separate the predictors (raw message text) from the outcome (1 = 'spam', 0 = anything else)
train_msg = train['Message']
train_y = train['Label'].eq('spam').astype(int)

## Feature engineering functions
def get_num(text):
    """Return the fraction of characters in ``text`` that are digits.

    An empty string yields 0.0 rather than raising ZeroDivisionError.
    """
    # Guard the division: an empty message has no characters to count.
    if len(text) == 0:
        return 0.0
    return sum(map(str.isdigit, text))/len(text)
def cap_percent(text):
    """Return the fraction of characters in ``text`` that are uppercase.

    An empty string yields 0.0 rather than raising ZeroDivisionError.
    """
    # Guard the division: an empty message has no characters to count.
    if len(text) == 0:
        return 0.0
    return sum(map(str.isupper, text))/len(text)
def alpha_percent(text):
    """Return the fraction of characters in ``text`` that are alphanumeric.

    Despite the name, digits are counted as well as letters because the
    check is ``str.isalnum``. An empty string yields 0.0 rather than
    raising ZeroDivisionError.
    """
    # Guard the division: an empty message has no characters to count.
    if len(text) == 0:
        return 0.0
    return sum(map(str.isalnum, text))/len(text)

## Define "first_word" function
def first_word(text):
    """Return the text before the first space, lower-cased with every '!' removed.

    Only a literal space counts as a separator, so a message that starts
    with a space gives an empty string.
    """
    head = text.partition(' ')[0]
    lowered = head.lower()
    return lowered.replace('!','')

## Create data frame with these features
## Map each feature name to the function that derives it from a raw message
feature_funcs = {'prop_num': get_num,
                 'prop_cap': cap_percent,
                 'prop_alp': alpha_percent,
                 'first': first_word}

## Apply every feature function to the training messages and collect the columns
d = {col: train_msg.apply(func) for col, func in feature_funcs.items()}
train_X = pd.DataFrame(d)

## One-hot encode the first word, then keep only the columns the models use
train_X_ohe = pd.get_dummies(train_X, columns=['first'])
kept_columns = ['prop_num','prop_cap', 'prop_alp', 'first_urgent','first_free']
train_X = train_X_ohe[kept_columns]

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler

## Base learners
## NOTE(review): newer scikit-learn releases deprecate the string 'none' for `penalty`
## in favour of None -- confirm against the installed version before upgrading.
## NOTE(review): the tree has no random_state, so tie-breaking between equally good
## splits may vary between runs -- confirm whether exact repeatability matters here.
model1 = LogisticRegression(penalty='none')
model2 = DecisionTreeClassifier(max_depth=5)

## KNN is wrapped in a pipeline so the features are standardized before it is fit
knn_steps = [('scaler', StandardScaler()),
             ('model', KNeighborsClassifier())]
model3 = Pipeline(steps=knn_steps)

## Soft-voting ensemble over the three base learners
base_learners = [('logr', model1),
                 ('tree', model2),
                 ('knn', model3)]
my_ensemble = VotingClassifier(estimators=base_learners, voting='soft')

In [9]:
from sklearn.model_selection import cross_val_score

## Ensemble: mean F1 across 5 cross-validation folds
ensemble_f1 = cross_val_score(my_ensemble, train_X, train_y, scoring='f1', cv=5)
print(np.average(ensemble_f1))

## Individual models, scored the same way and in the same order (logr, tree, knn)
print([np.average(cross_val_score(single_model, train_X, train_y, scoring='f1', cv=5))
       for single_model in (model1, model2, model3)])

0.9128026222243044
[0.8453849485158784, 0.906203023823853, 0.9084395064728428]


In [10]:
#1-a


from sklearn.model_selection import GridSearchCV

## Search space: each key is "<estimator name>__<parameter>", with an extra
## "model__" level for the KNN step that sits inside the pipeline
params = {
    'logr__penalty': ['none','l2'],
    'tree__max_depth': [4,5,6,7],
    'knn__model__n_neighbors': [3,6,10,15],
    'knn__model__weights': ['distance','uniform'],
    'voting': ['soft','hard'],
}

## Exhaustive search over the grid, scored by 5-fold cross-validated F1
grid = GridSearchCV(my_ensemble, param_grid=params, cv=5, scoring = 'f1')
grid.fit(train_X, train_y)

print(grid.best_estimator_)
print(grid.best_score_)

VotingClassifier(estimators=[('logr', LogisticRegression(penalty='none')),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('knn',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('model',
                                               KNeighborsClassifier(n_neighbors=3,
                                                                    weights='distance'))]))],
                 voting='soft')
0.9167655692653746


In [12]:
#1-b
## Split outcome from predictors for the held-out test set
test_y = (test['Label'] == 'spam').astype(int)
test_msg = test['Message']

## Create data frame with the same engineered features that were built for training
d = {'prop_num': test_msg.apply(get_num),
     'prop_cap': test_msg.apply(cap_percent),
     'prop_alp': test_msg.apply(alpha_percent),
    'first': test_msg.apply(first_word)}

test_X = pd.DataFrame(d)

## Assemble final test X data
## get_dummies only creates a column for first words that actually occur in the
## test messages, so 'first_urgent' or 'first_free' can be absent. reindex (rather
## than plain column selection) fills any absent column with zeros instead of
## raising a KeyError, and keeps the column order the fitted model expects.
model_columns = ['prop_num','prop_cap', 'prop_alp', 'first_urgent','first_free']
test_X_ohe = pd.get_dummies(test_X, columns=['first'])
test_X = test_X_ohe.reindex(columns=model_columns, fill_value=0)

## Predict with the best ensemble found by the grid search
y_pred = grid.best_estimator_.predict(test_X)

## F1 on the test set (the last expression is displayed as the cell output)
from sklearn.metrics import f1_score
f1_score(test_y, y_pred)


0.9328859060402686