In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import itertools
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

import time

In [5]:
def load_data(filename, test_size=0.2, random_state=None):
    """Read a labelled CSV and split it into train and test arrays.

    Parameters
    ----------
    filename : str or path-like
        CSV file read with ``pd.read_csv``; it must contain a 'LABEL' column.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed for the shuffled split. ``None`` keeps the original unseeded
        behaviour; pass an int to make the split (and therefore the reported
        metrics) reproducible across runs.

    Returns
    -------
    list
        ``[X_train, X_test, Y_train, Y_test]`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If the CSV has no 'LABEL' column.
    """
    frame = pd.read_csv(filename)
    # pop() removes the label column from the frame, leaving only features.
    labels = frame.pop('LABEL').values
    # train_test_split shuffles by default, so a separate
    # sample(frac=1) pass over the frame is not needed.
    return train_test_split(
        frame.values,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
  
def build_classifier(X_train, Y_train):
    """Fit a default-configured decision tree (CART) on the training data.

    Returns the fitted DecisionTreeClassifier.
    """
    # fit() hands back the estimator itself, so construction and training chain.
    return DecisionTreeClassifier().fit(X_train, Y_train)
    
def predict(classifier, X_test):
    """Return whatever ``classifier.predict`` produces for ``X_test``.

    Thin wrapper: the result of the classifier's own ``predict`` method is
    passed back unchanged.
    """
    predictions = classifier.predict(X_test)
    return predictions

def print_metrics(Y_test, Y_pred, Y_proba=None):
    """Print accuracy, macro precision/recall/F1 and log loss.

    Parameters
    ----------
    Y_test : array-like
        True labels.
    Y_pred : array-like
        Predicted class labels.
    Y_proba : array-like, optional
        Predicted class probabilities, e.g. the output of a classifier's
        ``predict_proba``. When supplied, the log loss is computed from these.
        When omitted, the log loss is computed from ``Y_pred`` exactly as
        before.

    Notes
    -----
    ``log_loss`` is defined on probability estimates. Feeding it hard labels
    (the fallback path) yields a value that mostly reflects the
    misclassification rate, so pass ``Y_proba`` whenever it is available.
    """
    print("Accuracy: %.2f" % accuracy_score(Y_test, Y_pred))
    # With average='macro' the fourth element (support) is None and is ignored.
    precision, recall, f1, _ = precision_recall_fscore_support(
        Y_test, Y_pred, average='macro')
    print('Precision: %.2f' % precision)
    print('Recall: %.2f' % recall)
    print('F1: %.2f' % f1)
    # Prefer real probabilities; fall back to labels for existing callers.
    loss_input = Y_pred if Y_proba is None else Y_proba
    print('Logarithmic loss: %.2f' % log_loss(Y_test, loss_input, normalize=True))
    
def dump_model(filename, classifier):
    """Serialize ``classifier`` to ``filename`` with pickle.

    Parameters
    ----------
    filename : str or path-like
        Destination path; opened in binary write mode, so an existing file is
        overwritten.
    classifier : object
        Any picklable object (here, the trained model).
    """
    # The context manager closes the handle even if pickling raises,
    # instead of leaving the file object open.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

<h2>3000 (conc_3000.csv)</h2>

In [6]:
# Train and evaluate a decision tree on conc_3000.csv, then save the model.
X_train, X_test, Y_train, Y_test = load_data('conc_3000.csv')
# Only the build_classifier call is timed. perf_counter() is a monotonic,
# high-resolution clock, so the interval is not affected by system clock changes.
start = time.perf_counter()
clf = build_classifier(X_train, Y_train)
end = time.perf_counter()
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_3000.sav', clf)
print(end-start)  # training time in seconds

Accuracy: 0.88
Precision: 0.88
Recall: 0.88
F1: 0.88
Logarithmic loss: 4.26
0.10577940940856934


In [7]:
# Train and evaluate a decision tree on conc_2000.csv, then save the model.
X_train, X_test, Y_train, Y_test = load_data('conc_2000.csv')
# Only the build_classifier call is timed. perf_counter() is a monotonic,
# high-resolution clock, so the interval is not affected by system clock changes.
start = time.perf_counter()
clf = build_classifier(X_train, Y_train)
end = time.perf_counter()
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_2000.sav', clf)
print(end-start)  # training time in seconds

Accuracy: 0.86
Precision: 0.86
Recall: 0.86
F1: 0.86
Logarithmic loss: 4.79
0.05771923065185547


In [8]:
# Train and evaluate a decision tree on conc_1000.csv, then save the model.
X_train, X_test, Y_train, Y_test = load_data('conc_1000.csv')
# Only the build_classifier call is timed. perf_counter() is a monotonic,
# high-resolution clock, so the interval is not affected by system clock changes.
start = time.perf_counter()
clf = build_classifier(X_train, Y_train)
end = time.perf_counter()
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_1000.sav', clf)
print(end-start)  # training time in seconds

Accuracy: 0.84
Precision: 0.84
Recall: 0.84
F1: 0.84
Logarithmic loss: 5.61
0.028880834579467773


In [9]:
# Train and evaluate a decision tree on conc_500.csv, then save the model.
X_train, X_test, Y_train, Y_test = load_data('conc_500.csv')
# Only the build_classifier call is timed. perf_counter() is a monotonic,
# high-resolution clock, so the interval is not affected by system clock changes.
start = time.perf_counter()
clf = build_classifier(X_train, Y_train)
end = time.perf_counter()
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_500.sav', clf)
print(end-start)  # training time in seconds

Accuracy: 0.82
Precision: 0.82
Recall: 0.82
F1: 0.82
Logarithmic loss: 6.22
0.012599945068359375


In [10]:
# Train and evaluate a decision tree on conc_100.csv, then save the model.
X_train, X_test, Y_train, Y_test = load_data('conc_100.csv')
# Only the build_classifier call is timed. perf_counter() is a monotonic,
# high-resolution clock, so the interval is not affected by system clock changes.
start = time.perf_counter()
clf = build_classifier(X_train, Y_train)
end = time.perf_counter()
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_100.sav', clf)
print(end-start)  # training time in seconds

Accuracy: 0.78
Precision: 0.79
Recall: 0.78
F1: 0.77
Logarithmic loss: 7.77
0.002386331558227539


In [11]:
# Train and evaluate a decision tree on restricted_conc_3000.csv.
# Unlike the other runs, this cell does not save the fitted model to disk.
X_train, X_test, Y_train, Y_test = load_data('restricted_conc_3000.csv')
# Only the build_classifier call is timed. perf_counter() is a monotonic,
# high-resolution clock, so the interval is not affected by system clock changes.
start = time.perf_counter()
clf = build_classifier(X_train, Y_train)
end = time.perf_counter()
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
print(end-start)  # training time in seconds

Accuracy: 0.58
Precision: 0.59
Recall: 0.58
F1: 0.57
Logarithmic loss: 14.51
0.019609451293945312
