In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import itertools
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

In [9]:
def load_data(filename, test_size=0.2, random_state=None):
    """Read a labelled CSV and split it into train and test arrays.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read. It must contain a 'LABEL' column; every other
        column is used as a feature.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same split
        on every run; ``None`` keeps the split random.

    Returns
    -------
    list
        ``[X_train, X_test, Y_train, Y_test]`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If the CSV has no 'LABEL' column.
    """
    frame = pd.read_csv(filename)
    labels = frame.pop('LABEL').values
    # train_test_split shuffles by default, so shuffling the frame beforehand
    # only costs an extra copy and adds a second, unseeded source of randomness.
    return train_test_split(
        frame.values,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
  
def build_classifier(X_train, Y_train, random_state=None):
    """Fit a decision tree (CART) on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    Y_train : array-like
        Training labels.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``DecisionTreeClassifier``. Pass an int to make the
        fitted tree repeatable across runs; ``None`` keeps the estimator's
        default behaviour.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    classifier = DecisionTreeClassifier(random_state=random_state)  # CART
    classifier.fit(X_train, Y_train)
    return classifier
    
def predict(classifier, X_test):
    """Return whatever ``classifier.predict`` produces for ``X_test``."""
    predicted_labels = classifier.predict(X_test)
    return predicted_labels

def print_metrics(Y_test, Y_pred, Y_proba=None):
    """Print accuracy, macro precision/recall/F1 and logarithmic loss.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth labels.
    Y_pred : array-like
        Hard class predictions; used for accuracy, precision, recall and F1.
    Y_proba : array-like, optional
        Predicted class probabilities, e.g. from ``classifier.predict_proba``.
        When given, the logarithmic loss is computed from them. When omitted,
        the loss is computed from ``Y_pred``, exactly as before this parameter
        existed.

    Returns
    -------
    None
        The metrics are written to stdout, one per line.
    """
    print("Accuracy: %.2f" % accuracy_score(Y_test, Y_pred))
    precision, recall, f1, _ = precision_recall_fscore_support(
        Y_test, Y_pred, average='macro')
    print('Precision: %.2f' % precision)
    print('Recall: %.2f' % recall)
    print('F1: %.2f' % f1)
    # log_loss is defined on probability estimates. Hard labels are accepted
    # only as a legacy fallback; the resulting number is not a calibrated
    # loss, so pass Y_proba whenever probabilities are available.
    loss_input = Y_pred if Y_proba is None else Y_proba
    print('Logarithmic loss: %.2f' % log_loss(Y_test, loss_input, normalize=True))
    
def dump_model(filename, classifier):
    """Pickle ``classifier`` to ``filename``, overwriting any existing file.

    Parameters
    ----------
    filename : str or path-like
        Destination path; opened in binary write mode.
    classifier : object
        Any picklable object (here, the fitted model).

    Returns
    -------
    None
    """
    # The context manager flushes and closes the file even if pickling fails;
    # a bare open() would leave that to garbage collection.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

<h2>3000 (conc_3000.csv)</h2>

In [11]:
# Pipeline run for 'conc_3000.csv': split the data, fit a decision tree on the
# training part, print metrics for the held-out part, and pickle the fitted
# model to 'tree_3000.sav'.
# Note: this rebinds the notebook-level names X_train, X_test, Y_train, Y_test,
# clf and Y_pred, so afterwards they refer to this dataset only.
X_train, X_test, Y_train, Y_test = load_data('conc_3000.csv')
clf = build_classifier(X_train, Y_train)
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_3000.sav', clf)

Accuracy: 0.88
Precision: 0.88
Recall: 0.88
F1: 0.88
Logarithmic loss: 4.09


In [12]:
# Pipeline run for 'conc_2000.csv': split the data, fit a decision tree on the
# training part, print metrics for the held-out part, and pickle the fitted
# model to 'tree_2000.sav'.
# Note: this rebinds the notebook-level names X_train, X_test, Y_train, Y_test,
# clf and Y_pred, so afterwards they refer to this dataset only.
X_train, X_test, Y_train, Y_test = load_data('conc_2000.csv')
clf = build_classifier(X_train, Y_train)
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_2000.sav', clf)

Accuracy: 0.88
Precision: 0.88
Recall: 0.88
F1: 0.88
Logarithmic loss: 4.23


In [15]:
# Pipeline run for 'conc_1000.csv': split the data, fit a decision tree on the
# training part, print metrics for the held-out part, and pickle the fitted
# model to 'tree_1000.sav'.
# Note: this rebinds the notebook-level names X_train, X_test, Y_train, Y_test,
# clf and Y_pred, so afterwards they refer to this dataset only.
X_train, X_test, Y_train, Y_test = load_data('conc_1000.csv')
clf = build_classifier(X_train, Y_train)
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_1000.sav', clf)

Accuracy: 0.84
Precision: 0.84
Recall: 0.84
F1: 0.84
Logarithmic loss: 5.53


In [16]:
# Pipeline run for 'conc_500.csv': split the data, fit a decision tree on the
# training part, print metrics for the held-out part, and pickle the fitted
# model to 'tree_500.sav'.
# Note: this rebinds the notebook-level names X_train, X_test, Y_train, Y_test,
# clf and Y_pred, so afterwards they refer to this dataset only.
X_train, X_test, Y_train, Y_test = load_data('conc_500.csv')
clf = build_classifier(X_train, Y_train)
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_500.sav', clf)

Accuracy: 0.83
Precision: 0.84
Recall: 0.83
F1: 0.83
Logarithmic loss: 5.87


In [17]:
# Pipeline run for 'conc_100.csv': split the data, fit a decision tree on the
# training part, print metrics for the held-out part, and pickle the fitted
# model to 'tree_100.sav'.
# Note: this rebinds the notebook-level names X_train, X_test, Y_train, Y_test,
# clf and Y_pred, so afterwards they refer to this dataset only.
X_train, X_test, Y_train, Y_test = load_data('conc_100.csv')
clf = build_classifier(X_train, Y_train)
Y_pred = predict(clf, X_test)
print_metrics(Y_test, Y_pred)
dump_model('tree_100.sav', clf)

Accuracy: 0.78
Precision: 0.78
Recall: 0.79
F1: 0.77
Logarithmic loss: 7.77
