In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score

In [2]:
# Read the training data and the quiz data from the ./data directory
# (training file first, quiz file second).
df, df_quiz = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/data.csv", "./data/quiz.csv")
)

In [3]:
# Split the target off the feature frame: `df_y` receives the 'label' column
# and that column is removed from `df`, leaving only feature columns.
# NOTE: this cell is not idempotent -- running it a second time raises
# KeyError because 'label' is already gone from `df`.
df_y = df.pop('label')

In [4]:
# Remove the same set of columns from the training frame and the quiz frame
# so that both keep an identical set of feature columns.
cols_to_delete = ['18', '25', '29', '31', '32', '35', '23', '26', '58']
df = df.drop(cols_to_delete, axis=1)
df_quiz = df_quiz.drop(cols_to_delete, axis=1)

In [5]:
# Start from every remaining column of the training frame and take out
# '59' and '60', which are purely numeric; what is left is the list of
# columns to be treated as categorical.
categorical_cols_enhanced = df.columns.tolist()
for numeric_col in ('59', '60'):
    categorical_cols_enhanced.remove(numeric_col)

In [6]:
# One-hot encode the training features: each column listed in
# categorical_cols_enhanced is expanded into indicator columns, while the
# columns left out of that list are passed to get_dummies unlisted.
# NOTE(review): `sparse=` is not passed here, so the result is presumably a
# regular (dense) DataFrame rather than a sparse one -- confirm against the
# pandas version in use.
df_one_hot = pd.get_dummies(df, columns=categorical_cols_enhanced)

In [7]:
# One-hot encode the quiz features with the same categorical column list
# that is used for the training frame.
df_one_hot_quiz = pd.get_dummies(df_quiz, columns=categorical_cols_enhanced)

# Indicator columns present in the training frame but absent from the quiz
# frame. Kept as a variable so the size of the gap can be inspected.
col_to_add = np.setdiff1d(df_one_hot.columns, df_one_hot_quiz.columns)

# Align the quiz frame to the training schema in one step:
#   * columns missing from the quiz frame are created and filled with 0,
#   * columns that only exist in the quiz frame are dropped,
#   * the column order matches df_one_hot.
# A single reindex avoids inserting the missing columns one at a time.
df_one_hot_quiz = df_one_hot_quiz.reindex(columns=df_one_hot.columns, fill_value=0)

In [8]:
# Hold-out split of the one-hot encoded training data: 60% of the rows go
# to the training part, and random_state=1 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df_one_hot,
    df_y,
    train_size=0.6,
    random_state=1,
)
# Alternative (disabled): train on every labelled row instead of a split.
# X_train, y_train = df_one_hot, df_y

In [9]:
def get_predictions(cls, test_features):
    """Return predicted labels and second-column probabilities.

    Parameters
    ----------
    cls
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    test_features
        Feature table with an ``index`` attribute; the index is reused for
        the returned label series.

    Returns
    -------
    tuple
        ``(predicted, predicted_probs)`` where ``predicted`` is a pandas
        Series named ``'predicted'`` aligned to ``test_features.index`` and
        ``predicted_probs`` is column 1 of the ``predict_proba`` output.
    """
    labels = pd.Series(
        cls.predict(test_features),
        index=test_features.index,
        name='predicted',
    )
    # Row 1 of the transposed matrix is column 1 of predict_proba's output.
    second_column_probs = cls.predict_proba(test_features).T[1]
    return labels, second_column_probs

In [10]:
def run(cls):
    """Fit ``cls`` on the training split and print its metrics.

    Works on the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test``. After fitting, the estimator itself is printed,
    followed by accuracy, precision and recall for the training split and
    then for the test split. Nothing is returned.
    """
    cls.fit(X_train, y_train)
    print(cls)

    # Predictions are computed for the test split first, then for the
    # training split; reporting below goes train first, test second.
    test_preds, _ = get_predictions(cls, X_test)
    train_preds, _ = get_predictions(cls, X_train)

    splits = (
        (('train accuracy:', 'train precision:', 'train recall:'),
         X_train, y_train, train_preds),
        (('test accuracy:', 'test precision:', 'test recall:'),
         X_test, y_test, test_preds),
    )
    for (acc_label, prec_label, rec_label), features, targets, preds in splits:
        print(acc_label, cls.score(features, targets))
        print(prec_label, precision_score(targets, preds))
        print(rec_label, recall_score(targets, preds))

In [11]:
# AdaBoost ensemble of 200 depth-2 decision trees; random_state=1 is set on
# the ensemble (the base tree is constructed without its own random_state).
cls = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=2),
    n_estimators=200,
    random_state=1,
)

In [12]:
# Fit the classifier on the training split and print train/test accuracy,
# precision and recall (see run()).
run(cls)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=200, random_state=1)
train accuracy: 0.913642217025
train precision: 0.906849232074
train recall: 0.89540274579
test accuracy: 0.907696856214
test precision: 0.895708560364
test recall: 0.893656213496


In [13]:
# Predict labels for the quiz set using the one-hot quiz frame, which was
# aligned to the training columns in an earlier cell.
quiz_preds = cls.predict(df_one_hot_quiz)

In [14]:
def submission(preds, ver):
    """Write predictions to ``./submissions/submission_<ver>.csv``.

    The file starts with the header line ``Id,Prediction`` followed by one
    row per prediction; ids are the 1-based positions within ``preds``.

    Parameters
    ----------
    preds : iterable
        Predicted values, in row order.
    ver : object
        Version tag interpolated into the file name via ``str.format``.

    Notes
    -----
    The target directory is created when it does not exist yet, so the call
    does not fail with ``FileNotFoundError`` in a fresh working directory.
    An existing file with the same name is overwritten.
    """
    import os  # local import keeps this cell self-contained

    out_path = "./submissions/submission_{}.csv".format(ver)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as f:
        f.write("Id,Prediction\n")
        # start=1 yields the 1-based ids directly.
        for i, pred in enumerate(preds, start=1):
            f.write("{},{}\n".format(i, pred))

In [15]:
#submission(quiz_preds, "09")

# AdaBoost-DecisionTreeClassifier Results

### train_size=0.2
### n_estimators=20
### max_depth=1
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=20, random_state=1)
          
train accuracy: 0.855126739465

train precision: 0.828982976866

train recall: 0.846949696753

test accuracy: 0.846870996354

test precision: 0.818011999209

test recall: 0.836844353515

### train_size=0.2
### n_estimators=200
### max_depth=1
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=200, random_state=1)
          
train accuracy: 0.892340442307

train precision: 0.879259458009

train recall: 0.876828398145

test accuracy: 0.882704247561

test precision: 0.866849845724

test recall: 0.865329706153

### train_size=0.4
### n_estimators=200
### max_depth=1
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=200, random_state=1)
          
train accuracy: 0.889265581267

train precision: 0.876271566211

train recall: 0.87251795993

test accuracy: 0.883539413689

test precision: 0.866764803076

test recall: 0.866947115385


### train_size=0.6
### n_estimators=200
### max_depth=2
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=200, random_state=1)
          
train accuracy: 0.913642217025

train precision: 0.906849232074

train recall: 0.89540274579

test accuracy: 0.907696856214

test precision: 0.895708560364

test recall: 0.893656213496
