In [35]:
from multiclass import ECOC
from sklearn import svm
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn.tree import export_text
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import time
import sys

In [36]:
dataset_feature_names=[ 
                "amount_usd",        #-- 0 *
                "client_age",        #-- 1 *
                "client_gender",     #-- 2
                "debit_type",        #-- 3 Credit or debit card
                "agency_region",     #-- 4
                "merchant_departement" #-- 5
              ]



In [37]:
def extractPrediction (cum_prediction_arr):
    """This method makes a weighted prediction usin a set of decision trees.

    Parameters
    ----------
    cum_prediction_arr : np.array,  conatining the probability of an element belonging to a class (shape: sample,nb_classes)

    Return
    ------
    max_predictions: ndarray, containing the predicted class (throught vote) class
    """
    max_predictions = []
    for row in cum_prediction_arr:
        max_predictions.append((np.argmax(row))+1)
        
    return max_predictions

In [38]:
def makeEnsamblePrediction (ensambleModels, X_test, num_clases):
    """This method makes a weighted prediction usin a set of decision trees.

    Parameters
    ----------
    ensambleModels : list, set of decision trees
    X_test : ndarray, feature matrix
    num_clases : number of classes to be predicted

    Return
    ------
    cum_prediction_arr: ndarray, weighted prediction matrix.
    """
    
    i = 1

    cum_prediction_arr = np.zeros((len(X_test), num_clases)).copy()
    for model in ensambleModels:
        prediction_arr = (model.estimator.predict_proba(X_test)).copy()
        cum_prediction_arr = cum_prediction_arr.copy().__add__(prediction_arr.copy())
        #r = export_text(model.estimator, feature_names=dataset_feature_names)
        i +=1
        #print ("model {}".format(i))
        #print (r)
            
    return cum_prediction_arr.copy()

# Ensemble Decision tree

In [46]:
#read dataset
ensamble=[]
predictions=[]
batch_size=10000
f= open("summary_ensemble_dt.txt","w+")
aux_list = []
X_ = []
y_ = []
i =0 
numLine = 0
mean_acc = 0
print ("numLine","acc","mean_acc", "train_time","test_time","total_time","memory_model (bytes)" )
f.write("id,acc,mean_acc,train_time,test_time,total_time,memory_model (bytes)\n")
with open("../data_output/export_dataframe_0v4.csv") as infile:
    for line in infile:
        # windows is not full. We add records 
        if (i < batch_size):
            aux_list= (line.split(","))
            aux_y = int(aux_list.pop())
            aux_x = np.array(aux_list.copy(),dtype=np.float32) 
            y_.append( aux_y )        
            X_.append(aux_x)  

        else: #windows is full
            X = np.asarray(X_).copy()
            y = np.asarray(y_).copy()
            
            #Is the first mode?
            if (len(ensamble) == 0 ): 
                
                start_train = time.time()
                parameters = {'max_depth':[None]}
                clf = GridSearchCV(tree.DecisionTreeClassifier(class_weight='balanced'), parameters, n_jobs=-1)
                clf.fit(X, y)
                tree_model = clf.best_estimator_
                end_train = time.time()
                train_time = end_train - start_train
                
                ensamble.append( ECOC(tree_model) )
                ensamble[-1].estimator.classes_ = [1,2]
                ensamble[-1].estimator.n_classes_= 2
                memory_model = sys.getsizeof(ensamble)
            else:# It is not the first model
                start_test = time.time()
                pred_arr = (makeEnsamblePrediction(ensamble,X.copy(),2)).copy()
                end_test = time.time()
                test_time = end_test - start_test
                
                start_train = time.time()
                parameters = {'max_depth':[None]}
                clf = GridSearchCV(tree.DecisionTreeClassifier(class_weight='balanced'), parameters, n_jobs=-1)
                clf.fit(X, y)
                tree_model = clf.best_estimator_
                
                end_train = time.time()
                train_time = end_train - start_train
                
                ensamble.append( ECOC(tree_model) )
                ensamble[-1].estimator.classes_ = [1,2]
                ensamble[-1].estimator.n_classes_= 2
                memory_model = sys.getsizeof(ensamble)
                predictions.append(pred_arr)
                y_pred = extractPrediction(predictions[-1])
                acc = accuracy_score(y, y_pred)
                
                #compute mean accuracy
                if mean_acc == 0 :
                    mean_acc = acc
                else:
                    mean_acc = (acc + mean_acc)/2
                  
                print (numLine,acc,mean_acc, train_time,test_time,(train_time+test_time),memory_model)
                f.write("{},{},{},{},{},{},{}\n".format(numLine,acc,mean_acc, train_time,
                                  test_time,(train_time+test_time),memory_model))
            
            X_=[]
            y_=[]            
            i=0
            
        i+=1
        numLine += 1
f.close()        

numLine acc mean_acc train_time test_time total_time memory_model (bytes)
20000 0.7201720172017202 0.7201720172017202 0.11984705924987793 0.0026886463165283203 0.12253570556640625 96
30000 0.6637663766376638 0.6919691969196919 0.11939382553100586 0.0037407875061035156 0.12313461303710938 96
40000 0.8511851185118512 0.7715771577157715 0.0979311466217041 0.005682945251464844 0.10361409187316895 96
50000 0.7886788678867886 0.7801280128012801 0.11021065711975098 0.006439924240112305 0.11665058135986328 128
60000 0.8161816181618162 0.7981548154815481 0.11646580696105957 0.009161233901977539 0.1256270408630371 128
70000 0.7999799979998 0.799067406740674 0.11784482002258301 0.014043807983398438 0.13188862800598145 128
80000 0.8324832483248324 0.8157753275327533 0.1190497875213623 0.01172494888305664 0.13077473640441895 128
90000 0.8093809380938094 0.8125781328132813 0.11916279792785645 0.013423919677734375 0.13258671760559082 192
100000 0.8098809880988099 0.8112295604560456 0.1101770401000976

780000 0.8572857285728572 0.8525207568413229 0.11770987510681152 0.13086295127868652 0.24857282638549805 768
790000 0.8808880888088809 0.866704422825102 0.11143112182617188 0.13333702087402344 0.2447681427001953 768
800000 0.8662866286628663 0.8664955257439841 0.11675906181335449 0.13260221481323242 0.24936127662658691 768
810000 0.8691869186918691 0.8678412222179266 0.1052858829498291 0.13727903366088867 0.24256491661071777 768
820000 0.8686868686868687 0.8682640454523977 0.1127622127532959 0.14126086235046387 0.25402307510375977 768
830000 0.8511851185118512 0.8597245819821244 0.10712909698486328 0.1425769329071045 0.24970602989196777 768
840000 0.8401840184018402 0.8499543001919823 0.12998390197753906 0.14424395561218262 0.2742278575897217 768
850000 0.8438843884388438 0.846919344315413 0.11347293853759766 0.14590787887573242 0.2593808174133301 768
860000 0.8483848384838484 0.8476520913996307 0.12595105171203613 0.14725804328918457 0.2732090950012207 768
870000 0.8633863386338634 0.