## Load Data

In [1]:
# NOTE(review): IPython line magic that opens a Qt console attached to this kernel.
# It is an interactive debugging aid; nothing below depends on it, so consider
# removing this cell before sharing (it presumably needs a GUI session to run).
%qtconsole

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


from sklearn import metrics

from sklearn.model_selection import cross_val_score

In [3]:
# Column labels for the CSV: F0..F56 for the 57 features, then 'spam' for the target.
header = ['F{}'.format(i) for i in range(57)]
header.append('spam')

In [4]:
# The file is read with header=None (no header row is consumed), so the generated
# labels are supplied explicitly as the column names.
spam = pd.read_csv("spambase.data", names=header, header=None)

In [5]:
# Preview the first rows to confirm the columns were parsed and labelled as intended.
spam.head()

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F48,F49,F50,F51,F52,F53,F54,F55,F56,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [6]:
# (rows, columns) of the loaded frame: 57 features plus the target column.
spam.shape

(4601, 58)

### Check for missing values

In [7]:
# True if at least one cell anywhere in the frame is null.
spam.isnull().values.any()

False

### no missing values

## Simple EDA

### train/test split

In [8]:
# Features are the 57 columns F0..F56; the target is the `spam` column.
X = spam[['F{}'.format(i) for i in range(57)]]
y = spam['spam']

# Hold out 33% of the rows as a test set; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

### target distribution

In [9]:
# Class counts in the training split, to gauge how imbalanced the target is.
y_train.value_counts()

0    1902
1    1180
Name: spam, dtype: int64

## First try with all variables

### Logistic regression

In [10]:
# Baseline classifier on all 57 features. The solver is pinned to 'liblinear', the
# solver reported in the recorded fit output, because newer scikit-learn releases
# default to 'lbfgs'. Without the pin, re-running on a newer release would change the
# results (and may not converge within the default iteration budget on these
# unscaled features).
logitModel = LogisticRegression(solver='liblinear')

In [11]:
# Fit on the training split. As the cell's last expression, the fitted estimator is
# echoed, which is where the hyperparameter listing in the output comes from.
logitModel.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Hard class predictions (0 / 1) for the held-out test rows.
y_predict = logitModel.predict(X_test)

In [13]:
# Per-class precision / recall / F1 on the test split for the baseline model.
print(metrics.classification_report(y_test, y_predict))

             precision    recall  f1-score   support

          0       0.93      0.96      0.94       886
          1       0.94      0.90      0.92       633

avg / total       0.93      0.93      0.93      1519



In [14]:
# Confusion matrix on the test split: rows are true classes, columns are predicted
# classes, each in the order 0 then 1.
metrics.confusion_matrix(y_test, y_predict)

array([[847,  39],
       [ 66, 567]])

### Try giving class 0 a heavier weight

In [15]:
# class_weight mapping reused by the weighted models below: samples of class 0
# (spam == 0) are weighted twice as heavily as samples of class 1.
weights = {0:2, 1:1}

In [16]:
# Same logistic regression, but with class 0 weighted more heavily via `weights`.
# The solver is pinned to 'liblinear', the solver reported in the recorded fit
# output, because newer scikit-learn releases default to 'lbfgs' and would give
# different results on re-run.
logitWeightModel = LogisticRegression(class_weight=weights, solver='liblinear')

In [17]:
# Fit the class-weighted model on the training split; the echoed repr confirms the
# class_weight that was applied.
logitWeightModel.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight={0: 2, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [18]:
# Test-set predictions from the class-weighted logistic regression.
y_predict_w = logitWeightModel.predict(X_test)

In [19]:
# Per-class precision / recall / F1 for the class-weighted model, to compare with
# the unweighted baseline report.
print(metrics.classification_report(y_test, y_predict_w))

             precision    recall  f1-score   support

          0       0.88      0.97      0.92       886
          1       0.95      0.81      0.87       633

avg / total       0.91      0.90      0.90      1519



In [20]:
# Confusion matrix for the class-weighted model (rows: true class, columns:
# predicted class, order 0 then 1).
metrics.confusion_matrix(y_test, y_predict_w)

array([[859,  27],
       [122, 511]])

### Seems a bit costly to increase recall for class 0: it rises only from 0.96 to 0.97, while class 1 recall drops from 0.90 to 0.81

### Cross validation

In [21]:
# Fresh, unfitted logistic regression used only for cross-validation. The solver is
# pinned to 'liblinear' (the default at the time the recorded scores were produced)
# because newer scikit-learn releases default to 'lbfgs' and would score differently.
clg = LogisticRegression(solver='liblinear')

In [22]:
# 10-fold cross-validation over the full dataset X, y (this includes the rows held
# out as the test split above). With no `scoring` argument the estimator's own
# `score` method is used, which is accuracy for scikit-learn classifiers.
scores = cross_val_score(clg, X, y, cv = 10)

In [23]:
# One score per fold.
# NOTE(review): the last two folds score noticeably lower than the others. The folds
# are presumably taken in file order without shuffling, so check whether the file is
# ordered and consider a shuffled StratifiedKFold.
scores

array([ 0.93058568,  0.92407809,  0.9175705 ,  0.94565217,  0.9326087 ,
        0.93478261,  0.95217391,  0.93913043,  0.8496732 ,  0.85620915])

In [24]:
# Same 10-fold cross-validation, scored by recall instead of the default metric
# (for a binary target, 'recall' is the recall of the positive class, label 1).
recalls = cross_val_score(clg, X,y, cv = 10, scoring= 'recall')

In [25]:
# One recall value per fold.
recalls

array([ 0.86813187,  0.86813187,  0.87912088,  0.9281768 ,  0.91160221,
        0.94475138,  0.90055249,  0.91160221,  0.85635359,  0.79558011])

### Random Forest

In [26]:
# Random forest with 500 trees; all other hyperparameters are left at library defaults.
# NOTE(review): no random_state is passed (the recorded repr shows random_state=None),
# so the forest, and the metrics reported for it, will vary slightly between runs.
rf = RandomForestClassifier(n_estimators=500)

In [27]:
# Fit the forest on the training split; the echoed repr lists the hyperparameters used.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [28]:
# Test-set predictions from the random forest.
# NOTE(review): this reuses the name `y_predict`, replacing the logistic-regression
# predictions assigned earlier in the notebook.
y_predict = rf.predict(X_test)

In [29]:
# Per-class precision / recall / F1 on the test split for the random forest.
print(metrics.classification_report(y_test, y_predict))

             precision    recall  f1-score   support

          0       0.95      0.97      0.96       886
          1       0.96      0.93      0.95       633

avg / total       0.96      0.96      0.96      1519



In [30]:
# Confusion matrix for the random forest (rows: true class, columns: predicted
# class, order 0 then 1).
metrics.confusion_matrix(y_test, y_predict)

array([[863,  23],
       [ 42, 591]])

In [31]:
# Random forest with class 0 weighted more heavily via `weights`.
# NOTE(review): only class_weight is set, so the forest size falls back to the
# library default (n_estimators=10 in the recorded repr) rather than the 500 trees
# used for `rf`. The weighted vs. unweighted comparison is therefore not
# like-for-like. No random_state is set either.
rfw = RandomForestClassifier(class_weight= weights)

In [32]:
# Fit the class-weighted forest on the training split; the echoed repr confirms the
# class_weight and the number of trees actually used.
rfw.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight={0: 2, 1: 1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
# Test-set predictions from the class-weighted forest.
# NOTE(review): this reuses the name `y_predict_w`, replacing the weighted
# logistic-regression predictions assigned earlier in the notebook.
y_predict_w = rfw.predict(X_test)

In [34]:
# Per-class precision / recall / F1 on the test split for the class-weighted forest.
print(metrics.classification_report(y_test, y_predict_w))

             precision    recall  f1-score   support

          0       0.94      0.97      0.96       886
          1       0.96      0.91      0.93       633

avg / total       0.95      0.95      0.95      1519



In [35]:
# Confusion matrix for the class-weighted forest (rows: true class, columns:
# predicted class, order 0 then 1).
metrics.confusion_matrix(y_test, y_predict_w)

array([[863,  23],
       [ 58, 575]])

### Random Forest performs better than logistic regression (avg F1 0.95–0.96 vs 0.90–0.93 on the test split)

In [36]:
# 10-fold cross-validated score (accuracy, the classifier default) for a
# default-configured random forest on the full dataset. The estimator is built inline
# rather than bound to `rf`, so the fitted 500-tree model from the cells above is not
# replaced by an unfitted one (which would make re-running `rf.predict(X_test)` fail).
# NOTE(review): this evaluates the default forest size, not the 500-tree configuration
# fitted above. Pass n_estimators=500 here if that is the model being assessed.
scores = cross_val_score(RandomForestClassifier(), X, y, cv = 10)

In [37]:
# One score per fold for the random forest.
scores

array([ 0.94143167,  0.94360087,  0.93058568,  0.94347826,  0.94347826,
        0.94782609,  0.95434783,  0.95217391,  0.90631808,  0.85620915])

### Gradient Boosting

In [38]:
# Gradient boosting with library defaults (the recorded repr shows 100 estimators,
# learning_rate=0.1, max_depth=3).
# NOTE(review): no random_state is set (random_state=None in the recorded repr).
gb = GradientBoostingClassifier()

In [39]:
# Fit on the training split; the echoed repr lists the hyperparameters used.
gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [40]:
# Test-set predictions from gradient boosting.
# NOTE(review): this reuses the name `y_predict`, replacing the random-forest
# predictions assigned earlier in the notebook.
y_predict = gb.predict(X_test)

In [41]:
# Per-class precision / recall / F1 on the test split for gradient boosting.
print(metrics.classification_report(y_test, y_predict))

             precision    recall  f1-score   support

          0       0.95      0.97      0.96       886
          1       0.95      0.93      0.94       633

avg / total       0.95      0.95      0.95      1519



In [42]:
# Separate, unfitted gradient-boosting estimator used only for cross-validation, so
# the fitted `gb` model is left intact.
gbc = GradientBoostingClassifier()

In [43]:
# 10-fold cross-validation over the full dataset X, y with the default score
# (accuracy for classifiers).
# NOTE(review): this reuses the name `scores`, replacing the random-forest scores.
scores = cross_val_score(gbc, X, y, cv =10)

In [44]:
# One score per fold for gradient boosting.
scores

array([ 0.95444685,  0.94577007,  0.93926247,  0.94565217,  0.94782609,
        0.95217391,  0.9673913 ,  0.97391304,  0.88235294,  0.85620915])