In [1]:
from multiclass import ECOC
from sklearn import svm
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn.tree import export_text
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import time
import sys

In [2]:
dataset_feature_names=[ 
                "amount_usd",        #-- 0 *
                "client_age",        #-- 1 *
                "client_gender",     #-- 2
                "debit_type",        #-- 3 Credit or debit card
                "agency_region",     #-- 4
                "merchant_departement" #-- 5
              ]



In [3]:
def extractPrediction (cum_prediction_arr):
    """This method makes a weighted prediction usin a set of decision trees.

    Parameters
    ----------
    cum_prediction_arr : np.array,  conatining the probability of an element belonging to a class (shape: sample,nb_classes)

    Return
    ------
    max_predictions: ndarray, containing the predicted class (throught vote) class
    """
    max_predictions = []
    for row in cum_prediction_arr:
        max_predictions.append((np.argmax(row))+1)
        
    return max_predictions

In [4]:
def makeEnsamblePrediction (ensambleModels, X_test, num_clases):
    """This method makes a weighted prediction usin a set of decision trees.

    Parameters
    ----------
    ensambleModels : list, set of decision trees
    X_test : ndarray, feature matrix
    num_clases : number of classes to be predicted

    Return
    ------
    cum_prediction_arr: ndarray, weighted prediction matrix.
    """
    
    i = 1

    cum_prediction_arr = np.zeros((len(X_test), num_clases)).copy()
    for model in ensambleModels:
        prediction_arr = (model.estimator.predict_proba(X_test)).copy()
        cum_prediction_arr = cum_prediction_arr.copy().__add__(prediction_arr.copy())
        #r = export_text(model.estimator, feature_names=dataset_feature_names)
        i +=1
        #print ("model {}".format(i))
        #print (r)
            
    return cum_prediction_arr.copy()

# Ensemble Decision tree

In [6]:
#read dataset
ensamble=[]
predictions=[]
batch_size=10000
f= open("summary_ensemble_dt.txt","w+")
aux_list = []
X_ = []
y_ = []
i =0 
numLine = 0
mean_acc = 0
print ("numLine","acc","mean_acc", "train_time","test_time","total_time","memory_model (bytes)" )
f.write("id,acc,mean_acc,train_time,test_time,total_time,memory_model (bytes)\n")
#with open("../data_output/export_dataframe_0v4.csv") as infile:

with open("../data_output/balanced.csv") as infile:
    for line in infile:
        # windows is not full. We add records 
        if (i < batch_size):
            aux_list= (line.split(","))
            aux_y = int(aux_list.pop())
            aux_x = np.array(aux_list.copy(),dtype=np.float32) 
            y_.append( aux_y )        
            X_.append(aux_x)  

        else: #windows is full
            X = np.asarray(X_).copy()
            y = np.asarray(y_).copy()
            
            #Is the first mode?
            if (len(ensamble) == 0 ): 
                
                start_train = time.time()
                parameters = {'max_depth':[None]}
                clf = GridSearchCV(tree.DecisionTreeClassifier(class_weight='balanced'), parameters, n_jobs=-1)
                clf.fit(X, y)
                tree_model = clf.best_estimator_
                end_train = time.time()
                train_time = end_train - start_train
                
                ensamble.append( ECOC(tree_model) )
                ensamble[-1].estimator.classes_ = [1,2]
                ensamble[-1].estimator.n_classes_= 2
                memory_model = sys.getsizeof(ensamble)
            else:# It is not the first model
                start_test = time.time()
                pred_arr = (makeEnsamblePrediction(ensamble,X.copy(),2)).copy()
                end_test = time.time()
                test_time = end_test - start_test
                
                start_train = time.time()
                parameters = {'max_depth':[None]}
                clf = GridSearchCV(tree.DecisionTreeClassifier(class_weight='balanced'), parameters, n_jobs=-1)
                clf.fit(X, y)
                tree_model = clf.best_estimator_
                
                end_train = time.time()
                train_time = end_train - start_train
                
                ensamble.append( ECOC(tree_model) )
                ensamble[-1].estimator.classes_ = [1,2]
                ensamble[-1].estimator.n_classes_= 2
                memory_model = sys.getsizeof(ensamble)
                predictions.append(pred_arr)
                y_pred = extractPrediction(predictions[-1])
                acc = accuracy_score(y, y_pred)
                
                #compute mean accuracy
                if mean_acc == 0 :
                    mean_acc = acc
                else:
                    mean_acc = (acc + mean_acc)/2
                  
                print (numLine,acc,mean_acc, train_time,test_time,(train_time+test_time),memory_model)
                f.write("{},{},{},{},{},{},{}\n".format(numLine,acc,mean_acc, train_time,
                                  test_time,(train_time+test_time),memory_model))
            
            X_=[]
            y_=[]            
            i=0
            
        i+=1
        numLine += 1
f.close()        

numLine acc mean_acc train_time test_time total_time memory_model (bytes)
20000 0.0593059305930593 0.0593059305930593 0.3530611991882324 0.003069162368774414 0.35613036155700684 96
30000 0.09160916091609161 0.07545754575457546 0.3253049850463867 0.003612041473388672 0.3289170265197754 96
40000 0.0428042804280428 0.059130913091309134 0.1171419620513916 0.0049779415130615234 0.12211990356445312 96
50000 0.052805280528052806 0.05596809680968097 0.12107014656066895 0.006803989410400391 0.12787413597106934 128
60000 0.0406040604060406 0.048286078607860786 0.12015008926391602 0.008685111999511719 0.12883520126342773 128
70000 0.050505050505050504 0.04939556455645565 0.127061128616333 0.009621143341064453 0.13668227195739746 128
80000 0.034403440344034406 0.04189950245024503 0.11926007270812988 0.011661052703857422 0.1309211254119873 128
90000 0.04590459045904591 0.04390204645464547 0.12499308586120605 0.012517929077148438 0.1375110149383545 192
100000 0.0332033203320332 0.03855268339333934 0

760000 0.033003300330033 0.03319461014853943 0.11405706405639648 0.12659597396850586 0.24065303802490234 768
770000 0.0337033703370337 0.033448990242786567 0.1330118179321289 0.12252378463745117 0.2555356025695801 768
780000 0.0372037203720372 0.035326355307411884 0.1322019100189209 0.13034510612487793 0.26254701614379883 768
790000 0.034103410341034104 0.034714882824223 0.13257408142089844 0.12285900115966797 0.2554330825805664 768
800000 0.029402940294029404 0.0320589115591262 0.12517189979553223 0.1272900104522705 0.25246191024780273 768
810000 0.0372037203720372 0.0346313159655817 0.13176393508911133 0.132551908493042 0.2643158435821533 768
820000 0.032503250325032505 0.0335672831453071 0.11654210090637207 0.13544583320617676 0.25198793411254883 768
830000 0.0368036803680368 0.03518548175667195 0.1412949562072754 0.13341093063354492 0.2747058868408203 768
840000 0.0331033103310331 0.034144396043852526 0.125870943069458 0.13610100746154785 0.26197195053100586 768
850000 0.0337033703