In [15]:
# Import everything we need

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

np.random.seed(1)

In [23]:
# Classification error definition
def classification_err(y, real_y):
    len_data = y.size
    num_diff = 0.0
    for i in range(len_data):
        if (y[i] != real_y[i]):
            num_diff += 1.0
    return (num_diff / len_data)

def missing_values(original_data, method='omit', 
                   supply_data=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
                                15,16,17,18,19,20,21,22,23,24,25,26]):
    """
    Replace missing values in original data according to given rules.
    Parameters
    ----------
    original_data : numpy array
        The data set containing NaN.
    method : str, optional
        'omit' : remove rows containing NaN. Default.
        'mean' : replace NaN by the mean of its column.
        'median' : replace NaN by the median of its column.
        'zeros' : replace NaN by 0.
        'change_and_add_flags' : replace NaN by the values specified in 
         supply_data at each corresponding columns. Then add new columns 
         with 0 = not NaN and 1 = is NaN.
    supply_data : list of floats, optional
        values to replace NaN in each column. The default is 
        [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26].
        'imputation' : fill in missing values by simple machine learning
    Returns
    -------
    new_data : numpy array of size (row_original_data, 
               column_original_data + n_column_containing_missing)
        The processed data array.
    """
    if method == 'omit':
        new_data = original_data[~np.isnan(original_data).any(axis=1)]
        
    elif method == 'mean':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        mean_row = np.mean(non_nan_data, axis=0)
        for i_column in range(len(mean_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=mean_row[i_column])
            new_data = original_data
            
    elif method == 'median':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        median_row = np.median(non_nan_data, axis=0)
        for i_column in range(len(median_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=median_row[i_column])
            new_data = original_data
            
    elif method == 'zeros':
        new_data = np.nan_to_num(original_data, nan=0.0)
        
    elif method == 'change_and_add_flags':
        import numpy.ma as ma
        for i_column in range(27): # 27 columns in total, not including y
            new_column = np.zeros(len(original_data[:,i_column]))
            mask = np.ma.masked_invalid(original_data[:,i_column]).mask
            new_column[mask] = 1
            if np.sum(new_column) != 0:
                new_column = np.expand_dims(new_column, axis=0)
                new_column = new_column.transpose()
                original_data = np.insert(original_data, [-1], new_column, axis=1)
                original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=supply_data[i_column])
                new_data = original_data
                
    elif method == 'imputation':
        # to do
        pass
    
    else: 
        print('Invalid option for treating missing data.')
    
    return new_data

In [24]:
# Get data

test = pd.read_csv("data/test.csv")
df_train = pd.read_csv('data/train.csv', index_col=0)
df_test = pd.read_csv('data/test.csv', index_col=0)


df_train = df_train.dropna()
# don't need labels
dtrain = df_train.values[1:]
dtest = df_test.values[:]
dtest = missing_values(dtest, method = "median")

#random.shuffle(dtrain)

X_all, Y_all = dtrain[:, :-1], dtrain[:, -1]

X_val = X_all[0:10000]
X_train = X_all[10000:100000]
Y_val = Y_all[0:10000]
Y_train = Y_all[10000:100000]

X_test = dtest


In [25]:
# Try out random forest

n_estimators = 1000
depth = 15

clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = depth, criterion = 'gini')
clf.fit(X_train, Y_train)
print("Done training")

y_val_pred = clf.predict_proba(X_val)[:,1]
#print(y_val_pred)
val_err = roc_auc_score(Y_val, y_val_pred)
print("Validation error:", val_err)

# Make submission
test_probs = clf.predict_proba(X_test)[:, 1]
print(test_probs)
test["Predicted"] = test_probs
test[["id","Predicted"]].to_csv("submission.csv",index=False)


Done training
Validation error: 0.6057279432421964
[0.4803708  0.34472494 0.4199356  ... 0.42304598 0.44360754 0.54446325]


In [18]:
# Try out SVM
# Idea is to train SVM's on random datasets, then aggregate predictions to get a probability
# Broken, not sure why. Going to try NN for now

num_svms = 10
set_size = 100

def make_data(size):
    X_set = []
    Y_set = []
    for i in range(size):
        idx = random.randint(0, len(X_train) - 1)
        X_set.append(X_train[idx])
        Y_set.append(Y_train[idx])
    return X_set, Y_set


# Train svms      
svms = []
for i in range(num_svms):
    #X_set, Y_set = make_data(set_size)
    clf = svm.SVC(kernel='rbf', gamma='auto')
    clf.fit(X_train, Y_train)
    svms.append(clf)
print("Done training")
    
# Make test predictions
test_probs = []
for i in range(10):
    print("On train pt:", i + 1)
    x = X_test[i]
    prob = 0.0
    for i in range(num_svms):
        y = clf.predict([x])
        print("I think it is:", y)
        prob += y[0]
    prob = prob / num_svms
    test_probs.append(prob)

clf = svm.SVC(kernel='rbf', random_state=0)
clf.fit(X_train, Y_train)

test_probs = clf.predict(X_test)
print(y_val_pred)
#val_err = classification_err(y_val_pred, Y_val)
#print("Validation error:", val_err)

# Make submission
#test_probs = clf.predict_proba(X_test)[:, 1]
print(test_probs)
#test["Predicted"] = test_probs
#test[["id","Predicted"]].to_csv("submission.csv",index=False)

Done training
On train pt: 1
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
On train pt: 2
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
On train pt: 3
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
On train pt: 4
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
On train pt: 5
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it is: [0.]
I think it 



[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [39]:
# Trying out NN

model = nn.Sequential(
    nn.Linear(26, 20),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(20, 15),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(15, 10),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(5, 2)
)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train model
model.train()

num_epochs = 20
X = X_train
Y = Y_train

for epoch in range(num_epochs):
    
    data = torch.tensor(X).float()
    target = torch.tensor(Y).long()
    
    # Erase accumulated gradients
    optimizer.zero_grad()

    # Forward pass
    output = model(data)

    # Calculate loss
    loss = loss_fn(output, target)

    # Backward pass
    loss.backward()
        
    # Weight update
    optimizer.step()

    # Track loss each epoch
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))
    
    
# Putting layers like Dropout into evaluation mode
model.eval()

#test_probs = model(torch.tensor(X_test).float())[:,1]
#print(test_probs)
#print(torch.exp(test_probs))
# Turning off automatic differentiation
with torch.no_grad():
    # Get validation error
    val_probs = model(torch.tensor(X_val).float())
    print(val_probs)
    #val_probs = torch.exp(val_probs)
    val_err = roc_auc_score(Y_val, val_probs)
    print("Validation error:", val_err)
    
    test_probs = model(torch.tensor(X_test).float())[:,1]
    print(test_probs)
    #test_probs = torch.exp(test_probs)
    #print(test_probs)
    test["Predicted"] = test_probs
    test[["id","Predicted"]].to_csv("submission.csv",index=False)




Train Epoch: 1  Loss: 8.9891
Train Epoch: 2  Loss: 7.9952
Train Epoch: 3  Loss: 7.2699
Train Epoch: 4  Loss: 6.8461
Train Epoch: 5  Loss: 6.3423
Train Epoch: 6  Loss: 5.8945
Train Epoch: 7  Loss: 5.4177
Train Epoch: 8  Loss: 4.9366
Train Epoch: 9  Loss: 4.4287
Train Epoch: 10  Loss: 4.0057
Train Epoch: 11  Loss: 3.6376
Train Epoch: 12  Loss: 3.2474
Train Epoch: 13  Loss: 2.9098
Train Epoch: 14  Loss: 2.6031
Train Epoch: 15  Loss: 2.3565
Train Epoch: 16  Loss: 2.1069
Train Epoch: 17  Loss: 1.8471
Train Epoch: 18  Loss: 1.6933
Train Epoch: 19  Loss: 1.5807
Train Epoch: 20  Loss: 1.4561
tensor([[1.2716, 0.7484],
        [1.3235, 0.8651],
        [1.3213, 0.8571],
        ...,
        [1.3668, 0.9806],
        [1.3562, 0.9438],
        [1.3680, 0.9792]])


ValueError: bad input shape torch.Size([10000, 2])

In [40]:
# Trying adaboost I guess?
# Achieving VERY low classification error, but does not receive a very good score....

clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=7),
    n_estimators=100
)
clf.fit(X_train, Y_train)
print("Done training")

y_val_pred = clf.predict_proba(X_val)[:,1]
val_err = roc_auc_score(Y_val, y_val_pred)
print("Validation error:", val_err)

# Make submission
test_probs = clf.predict_proba(X_test)[:,1]
print(test_probs)
test["Predicted"] = test_probs
test[["id","Predicted"]].to_csv("submission.csv",index=False)

Done training
Validation error: 0.5655748871256283
[0.40086314 0.70657799 0.55767697 ... 0.55307357 0.59432705 0.45561421]
