In [20]:
# Import everything we need

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
import random
import torch
import torchvision
import torchvision.transforms as transforms

np.random.seed(1)

In [70]:
# Classification error definition
def classification_err(y, real_y):
    len_data = y.size
    num_diff = 0.0
    for i in range(len_data):
        if (y[i] != real_y[i]):
            num_diff += 1.0
    return (num_diff / len_data)

def missing_values(original_data, method='omit', 
                   supply_data=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
                                15,16,17,18,19,20,21,22,23,24,25,26]):
    """
    Replace missing values in original data according to given rules.
    Parameters
    ----------
    original_data : numpy array
        The data set containing NaN.
    method : str, optional
        'omit' : remove rows containing NaN. Default.
        'mean' : replace NaN by the mean of its column.
        'median' : replace NaN by the median of its column.
        'zeros' : replace NaN by 0.
        'change_and_add_flags' : replace NaN by the values specified in 
         supply_data at each corresponding columns. Then add new columns 
         with 0 = not NaN and 1 = is NaN.
    supply_data : list of floats, optional
        values to replace NaN in each column. The default is 
        [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26].
        'imputation' : fill in missing values by simple machine learning
    Returns
    -------
    new_data : numpy array of size (row_original_data, 
               column_original_data + n_column_containing_missing)
        The processed data array.
    """
    if method == 'omit':
        new_data = original_data[~np.isnan(original_data).any(axis=1)]
        
    elif method == 'mean':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        mean_row = np.mean(non_nan_data, axis=0)
        for i_column in range(len(mean_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=mean_row[i_column])
            new_data = original_data
            
    elif method == 'median':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        median_row = np.median(non_nan_data, axis=0)
        for i_column in range(len(median_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=median_row[i_column])
            new_data = original_data
            
    elif method == 'zeros':
        new_data = np.nan_to_num(original_data, nan=0.0)
        
    elif method == 'change_and_add_flags':
        import numpy.ma as ma
        for i_column in range(27): # 27 columns in total, not including y
            new_column = np.zeros(len(original_data[:,i_column]))
            mask = np.ma.masked_invalid(original_data[:,i_column]).mask
            new_column[mask] = 1
            if np.sum(new_column) != 0:
                new_column = np.expand_dims(new_column, axis=0)
                new_column = new_column.transpose()
                original_data = np.insert(original_data, [-1], new_column, axis=1)
                original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=supply_data[i_column])
                new_data = original_data
                
    elif method == 'imputation':
        # to do
        pass
    
    else: 
        print('Invalid option for treating missing data.')
    
    return new_data

In [93]:
# Get data

df_train = pd.read_csv('data/train.csv', index_col=0)
df_test = pd.read_csv('data/test.csv', index_col=0)


df_train = df_train.dropna()
# don't need labels
dtrain = df_train.values[1:]
dtest = df_test.values[:]
dtest = missing_values(dtest, method = "mean")

random.shuffle(dtrain)

X_all, Y_all = dtrain[:, :-1], dtrain[:, -1]

X_val = X_all[0:10000]
X_train = X_all[10000:200000]
Y_val = Y_all[0:10000]
Y_train = Y_all[10000:200000]

X_test = dtest


In [94]:
# Try out random forest

n_estimators = 1000
depth = 15

clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = depth, criterion = 'gini')
clf.fit(X_train, Y_train)
print("Done training")

y_val_pred = clf.predict(X_val)
#print(y_val_pred)
val_err = classification_err(y_val_pred, Y_val)
print("Validation error:", val_err)

# Make submission
test_probs = clf.predict_proba(X_test)[:, 1]
print(test_probs)
test["Predicted"] = test_probs
test[["id","Predicted"]].to_csv("submission.csv",index=False)


Done training
Validation error: 0.2
[0.54263202 0.24408337 0.3127382  ... 0.41443658 0.39582491 0.50916129]
