In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

from sklearn import preprocessing 

In [2]:
def convert_numerical_cols(data, numeric_cols):
    """Binarise numeric columns around their own median, in place.

    For every column index in ``numeric_cols`` the column is parsed as float,
    its median is computed, and each entry is overwritten with 0 when the
    value is less than or equal to that median and with 1 otherwise.

    Args:
        data: 2-D NumPy array. It is modified in place; when it holds
            strings the written values are stored as '0' / '1'.
        numeric_cols: iterable of column indices to binarise.

    Returns:
        The same ``data`` object that was passed in.
    """
    for col in numeric_cols:
        # Parse the column once and reuse it for both the median and the
        # comparison instead of calling float() on every cell in a Python loop.
        values = data[:, col].astype(float)
        median = np.median(values)
        data[:, col] = np.where(values <= median, 0, 1)
    return data

def convert_categorical_cols(data, categorical_cols):
    """Replace categorical columns with integer label codes, in place.

    Each column listed in ``categorical_cols`` is encoded independently with
    a fresh ``preprocessing.LabelEncoder`` fitted on that column's values,
    and the resulting codes are written back over the column.

    Args:
        data: 2-D NumPy array; modified in place.
        categorical_cols: iterable of column indices to encode.

    Returns:
        The same ``data`` object that was passed in.
    """
    row_count, _ = data.shape

    for col in categorical_cols:
        # Basic slicing yields a (row_count, 1) view, so writing into
        # `column` updates `data` itself.
        column = data[:, col:col + 1]
        encoder = preprocessing.LabelEncoder()
        codes = encoder.fit_transform(column.reshape(row_count))
        column[:] = codes.reshape(row_count, 1)

    return data

def process(file):
    """Load a labelled CSV and split it into features and labels.

    The file is read entirely as strings, the first row (header) is dropped,
    and the last column is separated from the rest.

    Args:
        file: path (or file-like object) accepted by ``np.genfromtxt``.

    Returns:
        A tuple ``(features, labels)``: ``features`` is the 2-D string array
        of every column but the last, ``labels`` is the 1-D string array of
        the last column.
    """
    table = np.genfromtxt(file, delimiter=',', dtype='str')

    # Skip the header row.
    rows = table[1:]
    _, width = rows.shape

    features = rows[:, :width - 1]
    labels = rows[:, width - 1]
    return features, labels

def process_test(file):
    """Load an unlabelled CSV and split it into an ID column and features.

    The file is read entirely as strings, the first row (header) is dropped,
    and the first column is separated from the rest.

    Args:
        file: path (or file-like object) accepted by ``np.genfromtxt``.

    Returns:
        A tuple ``(ids, features)``: ``ids`` is the first column kept as a
        2-D array of shape (rows, 1), ``features`` is the 2-D string array of
        all remaining columns.
    """
    table = np.genfromtxt(file, delimiter=',', dtype='str')

    # Skip the header row.
    rows = table[1:]
    _, width = rows.shape

    ids = rows[:, :1]
    features = rows[:, 1:width]
    return ids, features

In [3]:
# Zero-based feature-column indices, split by how each column is preprocessed.
# Columns passed to convert_numerical_cols (median binarisation):
numeric_cols = [
    0, 2, 4,
    10, 11, 12,
]
# Columns passed to convert_categorical_cols (label encoding):
categorical_cols = [
    1, 3, 5, 6, 7, 8, 9,
    13,
]

In [4]:
# read data
X, Y = process('./data/train_final.csv')

# Binarise the numeric columns around their median and integer-encode the
# categorical ones. Both helpers overwrite X in place and return it.
X = convert_numerical_cols(X, numeric_cols)
X = convert_categorical_cols(X, categorical_cols)


# create decision tree
# random_state is fixed so that a fresh "Restart & Run All" rebuilds the same
# tree; without it scikit-learn may break ties between equally good splits
# differently from run to run.
clf = DecisionTreeClassifier(random_state=0).fit(X, Y)

# tree.plot_tree(clf)  # needs `from sklearn import tree` before it can be enabled

In [5]:
test_numeric_cols = [0, 2, 4, 10, 11, 12]
test_categorical_cols = [1, 3, 5, 6, 7, 8, 9, 13]
_ids, TEST_X = process_test('./data/test_final.csv')

print(TEST_X.shape)

# The test features have to be transformed with statistics taken from the
# TRAINING data. Medians and label codes derived from the test file itself
# need not match what the tree was fitted on (e.g. a category present in only
# one of the two files shifts every later label code).
# X was overwritten in place while preparing the training set, so the raw
# training features are read again here.
train_raw = process('./data/train_final.csv')[0]

for col in test_numeric_cols:
    train_median = np.median(train_raw[:, col].astype(float))
    TEST_X[:, col] = np.where(TEST_X[:, col].astype(float) <= train_median, 0, 1)

for col in test_categorical_cols:
    # Sorted unique training values: the position of a value in this array is
    # the code LabelEncoder assigned to it when the training set was encoded.
    train_classes = np.unique(train_raw[:, col])
    test_values = TEST_X[:, col].copy()
    codes = np.minimum(
        np.searchsorted(train_classes, test_values),
        len(train_classes) - 1,
    )
    unseen = train_classes[codes] != test_values
    # NOTE(review): categories that never occur in the training file are
    # mapped to -1 - confirm this is the desired policy for unknown values.
    TEST_X[:, col] = np.where(unseen, -1, codes)

(23842, 14)


In [6]:
# Predicted probability of each class for every test row: `results` has one
# row per sample and one column per class, in the order of clf.classes_.
results = clf.predict_proba(TEST_X)

In [7]:
# generate result
# Column 0 holds the sample ID, column 1 the predicted probability of the
# positive class. The smaller of the two class probabilities (the previous
# np.amin) does not say which class is predicted and cannot be used as a
# ranking score.
# NOTE(review): assumes a binary problem whose positive label is the second
# entry of clf.classes_ - confirm against the label values in the training file.
auc_map = np.zeros((TEST_X.shape[0], 2))
auc_map[:, 0] = _ids.astype(float).ravel()
auc_map[:, 1] = results[:, 1]

auc_map.shape

(23842, 2)

In [8]:
import csv

# Write the submission file. The context manager closes the file even when a
# row fails to format, and newline='' lets the csv module control the line
# endings itself (otherwise extra blank lines can appear on some platforms).
with open('./result/result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["ID","Prediction"])

    for row in auc_map:
        # IDs are stored as floats in auc_map; write them back as integers.
        sample_id = int(row[0])
        pred = "{:.8f}".format(float(row[1]))
        writer.writerow([sample_id, pred])