In [1]:
# Import everything we need

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn import svm
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Helper functions

# binary error definition
def bin_classification_err(real_y, y):
    len_data = y.size
    num_diff = 0.0
    for i in range(len_data):
        if (y[i] != real_y[i]):
            num_diff += 1.0
    return (num_diff / len_data)


# Handle missing values in data
def missing_values(original_data, method='omit', 
                   supply_data=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
                                15,16,17,18,19,20,21,22,23,24,25,26]):
    """
    Replace missing values in original data according to given rules.
    Parameters
    ----------
    original_data : numpy array
        The data set containing NaN.
    method : str, optional
        'omit' : remove rows containing NaN. Default.
        'mean' : replace NaN by the mean of its column.
        'median' : replace NaN by the median of its column.
        'zeros' : replace NaN by 0.
        'change_and_add_flags' : replace NaN by the values specified in 
         supply_data at each corresponding columns. Then add new columns 
         with 0 = not NaN and 1 = is NaN.
    supply_data : list of floats, optional
        values to replace NaN in each column. The default is 
        [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26].
        'imputation' : fill in missing values by simple machine learning
    Returns
    -------
    new_data : numpy array of size (row_original_data, 
               column_original_data + n_column_containing_missing)
        The processed data array.
    """
    if method == 'omit':
        new_data = original_data[~np.isnan(original_data).any(axis=1)]
        
    elif method == 'mean':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        mean_row = np.mean(non_nan_data, axis=0)
        for i_column in range(len(mean_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=mean_row[i_column])
            new_data = original_data
            
    elif method == 'median':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        median_row = np.median(non_nan_data, axis=0)
        for i_column in range(len(median_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=median_row[i_column])
            new_data = original_data
            
    elif method == 'zeros':
        new_data = np.nan_to_num(original_data, nan=0.0)
        
    elif method == 'change_and_add_flags':
        import numpy.ma as ma
        for i_column in range(27): # 27 columns in total, not including y
            new_column = np.zeros(len(original_data[:,i_column]))
            mask = np.ma.masked_invalid(original_data[:,i_column]).mask
            new_column[mask] = 1
            if np.sum(new_column) != 0:
                new_column = np.expand_dims(new_column, axis=0)
                new_column = new_column.transpose()
                original_data = np.insert(original_data, [-1], new_column, axis=1)
                original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=supply_data[i_column])
                new_data = original_data
                
    elif method == 'imputation':
        # to do
        pass
    
    else: 
        print('Invalid option for treating missing data.')
    
    return new_data

In [3]:
# Get data

# Parse from csv files
test = pd.read_csv("data/test.csv")
df_train = pd.read_csv('data/train.csv', index_col=0)
df_test = pd.read_csv('data/test.csv', index_col=0)

# Convert data and deal with missing values
dtrain = df_train.values[1:]
dtest = df_test.values[:]
dtest = missing_values(dtest, method = "zeros")
dtrain = missing_values(dtrain, method = "zeros")

# Separate validation and training data
X_all, Y_all = dtrain[:, :-1], dtrain[:, -1]
X_val = X_all[0:10000]
X_train = X_all[10000:(len(X_all) - 1)]
Y_val = Y_all[0:10000]
Y_train = Y_all[10000:(len(Y_all) - 1)]
X_test = dtest

In [4]:
# Functions to make and train classifiers

# Make and train a RandomForestClassifier
def get_forest(n_estimators, depth, X, Y):
    clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = depth, criterion = 'gini')
    clf.fit(X, Y)
    return clf

In [10]:
# Try out Voting Classifier

X = X_train[:200000]
Y = Y_train[:200000]
random_state = 54321

clfs = []
wts = []

rf1 = RandomForestClassifier(n_estimators=250, criterion='entropy',  n_jobs = -1,  random_state=random_state)
rf1.fit(X, Y)
# Get validation error
val_probs = rf1.predict_proba(X_val)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("rf1 validation error:", val_err)
clfs.append(('rf1', rf1))
wts.append(1)

rf2 = RandomForestClassifier(n_estimators=250, criterion='gini',  n_jobs = -1, random_state=random_state)
rf2.fit(X, Y)
val_probs = rf2.predict_proba(X_val)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("rf2 validation error:", val_err)
clfs.append(('rf2', rf2))
wts.append(1)

gbc = GradientBoostingClassifier(random_state=random_state)
gbc.fit(X, Y)
val_probs = gbc.predict_proba(X_val)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("gbc validation error:", val_err)
clfs.append(('gbc', gbc))
wts.append(3)

xgb = XGBClassifier(seed=random_state)
xgb.fit(X, Y)
val_probs = xgb.predict_proba(X_val)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("xgb validation error:", val_err)
clfs.append(('xgb',xgb))
wts.append(3)

train_pool = Pool(data=X,label = Y)
cat = CatBoostClassifier(logging_level='Silent')
cat.fit(train_pool, plot=False,silent=True)
val_probs = cat.predict_proba(X_val)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("cat validation error:", val_err)
clfs.append(('cat', cat))
wts.append(2)

eclf = VotingClassifier(estimators=clfs, voting='soft')
eclf.fit(X, Y)
val_probs = eclf.predict_proba(X_val)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("eclf validation error:", val_err)

# Make submission
test_probs = eclf.predict_proba(X_test)[:,1]
test["Predicted"] = test_probs
test[["id","Predicted"]].to_csv("submission.csv",index=False)

rf1 validation error: 0.602106069174252
rf2 validation error: 0.5992959889924397
gbc validation error: 0.6103070636459774
xgb validation error: 0.6098181558715685
cat validation error: 0.6127258088735674
eclf validation error: 0.6113878620774853
