# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn import metrics as mt
from sklearn.tree import DecisionTreeClassifier

# 2. Função para carregar os datasets

In [2]:
def Load_Data_Set(df_x_train, df_y_train):
    """Read a feature CSV and its companion label CSV.

    Both arguments are handed straight to ``pd.read_csv``; the feature source
    is read first, then the label source.

    Args:
        df_x_train: Location of the CSV holding the features.
        df_y_train: Location of the CSV holding the labels.

    Returns:
        A ``(features, labels)`` tuple of DataFrames, in that order.
    """
    features_frame, labels_frame = (
        pd.read_csv(source) for source in (df_x_train, df_y_train)
    )
    return features_frame, labels_frame

# 3. Criando e concatenando os dataframes

### 3.1 Dataset de treinamento

In [3]:
# Relative path of the training feature CSV (later passed to Load_Data_Set).
df_x_train = "../1_ensaio_classificacao/1_dados_treinamento/X_training.csv"

In [4]:
# Relative path of the training label CSV (later passed to Load_Data_Set).
df_y_train = "../1_ensaio_classificacao/1_dados_treinamento/y_training.csv"

In [5]:
# df1_train receives the feature frame, df2_train the label frame.
df1_train, df2_train = Load_Data_Set(df_x_train, df_y_train)

In [6]:
# Attach the labels to the feature frame as a new 'label' column (mutates df1_train).
# NOTE(review): df2_train is a whole DataFrame, not a Series - this assumes the
# label CSV holds exactly one column; confirm.
df1_train['label'] = df2_train

In [7]:
# Independent copy of the labelled training frame; this is the frame used by the training cells.
df_train = df1_train.copy()

### 3.2 Dataset de validação

In [8]:
# Relative path of the validation feature CSV (later passed to Load_Data_Set).
df_x_val = "../1_ensaio_classificacao/2_dados_validacao/X_validation.csv"

In [9]:
# Relative path of the validation label CSV (later passed to Load_Data_Set).
df_y_val = "../1_ensaio_classificacao/2_dados_validacao/y_validation.csv"

In [10]:
# df1_val receives the feature frame, df2_val the label frame.
df1_val, df2_val = Load_Data_Set(df_x_val, df_y_val)

In [11]:
# Attach the labels to the feature frame as a new 'label' column (mutates df1_val).
# NOTE(review): df2_val is a whole DataFrame, not a Series - this assumes the
# label CSV holds exactly one column; confirm.
df1_val['label'] = df2_val

In [12]:
# Independent copy of the labelled validation frame; this is the frame used by the validation cells.
df_val = df1_val.copy()

### 3.3 Dataset de teste

In [13]:
# Relative path of the test feature CSV (later passed to Load_Data_Set).
df_x_test = "../1_ensaio_classificacao/3_dados_teste/X_test.csv"

In [14]:
# Relative path of the test label CSV (later passed to Load_Data_Set).
df_y_test = "../1_ensaio_classificacao/3_dados_teste/y_test.csv"

In [15]:
# df1_test receives the feature frame, df2_test the label frame.
df1_test, df2_test = Load_Data_Set(df_x_test, df_y_test)

In [16]:
# Attach the labels to the feature frame as a new 'label' column (mutates df1_test).
# NOTE(review): df2_test is a whole DataFrame, not a Series - this assumes the
# label CSV holds exactly one column; confirm.
df1_test['label'] = df2_test

In [17]:
# Independent copy of the labelled test frame; this is the frame used by the test cells.
df_test = df1_test.copy()

# 4. Funções para treinar o modelo e avaliar as métricas

In [18]:
def Model_Training(df_train, max_depth, random_state=None):
    """Fit a decision tree on the training frame and predict on that same frame.

    Args:
        df_train: DataFrame containing every column named in ``features``
            below plus a 'label' column.
        max_depth: Maximum depth the tree may grow to; ``None`` lets it grow
            without a depth limit.
        random_state: Optional seed forwarded to ``DecisionTreeClassifier``.
            The default ``None`` keeps the previous unseeded behaviour. Pass
            an integer to make repeated fits identical, which matters when
            comparing depths whose scores differ only in the third decimal.

    Returns:
        Tuple ``(y_train, y_pred_train, decision_tree_classifier)``: the
        training labels as a flat array, the predictions made on the training
        features, and the fitted classifier.
    """
    # NOTE(review): 'id' is used as a model input; it looks like a row
    # identifier rather than a predictive attribute - confirm it belongs here.
    features = ['id', 'customer_type', 'age', 'class', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'on_board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'gender_Female', 'gender_Male', 'type_of_travel_business_travel','type_of_travel_personal_travel']

    label = ['label']

    # Split the training frame into model inputs and target.
    x_train = df_train.loc[:, features]

    # ravel() flattens the single-column label frame into a 1-D array.
    y_train = df_train.loc[:, label].values.ravel()

    decision_tree_classifier = DecisionTreeClassifier(
        max_depth=max_depth,
        random_state=random_state,
    )
    decision_tree_classifier.fit(x_train, y_train)

    # Predictions on the very data the tree was fitted on.
    y_pred_train = decision_tree_classifier.predict(x_train)

    return y_train, y_pred_train, decision_tree_classifier
   

In [19]:
def Previsoes_Dataframe_Val(df_val, decision_tree_classifier):
    """Predict on the validation frame with an already-fitted classifier.

    Args:
        df_val: DataFrame containing every feature column listed below plus a
            'label' column.
        decision_tree_classifier: Fitted estimator; only its ``predict``
            method is called.

    Returns:
        Tuple ``(y_val, y_pred_val)``: the validation labels as a flat array
        and whatever ``predict`` returns for the validation features.
    """
    feature_columns = [
        'id', 'customer_type', 'age', 'class', 'flight_distance',
        'inflight_wifi_service', 'departure_arrival_time_convenient',
        'ease_of_online_booking', 'gate_location', 'food_and_drink',
        'online_boarding', 'seat_comfort', 'inflight_entertainment',
        'on_board_service', 'leg_room_service', 'baggage_handling',
        'checkin_service', 'inflight_service', 'cleanliness',
        'departure_delay_in_minutes', 'arrival_delay_in_minutes',
        'gender_Female', 'gender_Male',
        'type_of_travel_business_travel', 'type_of_travel_personal_travel',
    ]
    label_columns = ['label']

    # Model inputs first, then the target flattened to one dimension.
    validation_inputs = df_val[feature_columns]
    y_val = df_val[label_columns].to_numpy().ravel()

    y_pred_val = decision_tree_classifier.predict(validation_inputs)
    return y_val, y_pred_val

In [20]:
def Previsoes_Dataframe_Test(df_test, decision_tree_classifier):
    """Predict on the test frame with an already-fitted classifier.

    Args:
        df_test: DataFrame containing every feature column listed below plus
            a 'label' column.
        decision_tree_classifier: Fitted estimator; only its ``predict``
            method is called.

    Returns:
        Tuple ``(y_test, y_pred_test)``: the test labels as a flat array and
        whatever ``predict`` returns for the test features.
    """
    input_names = [
        'id', 'customer_type', 'age', 'class', 'flight_distance',
        'inflight_wifi_service', 'departure_arrival_time_convenient',
        'ease_of_online_booking', 'gate_location', 'food_and_drink',
        'online_boarding', 'seat_comfort', 'inflight_entertainment',
        'on_board_service', 'leg_room_service', 'baggage_handling',
        'checkin_service', 'inflight_service', 'cleanliness',
        'departure_delay_in_minutes', 'arrival_delay_in_minutes',
        'gender_Female', 'gender_Male',
        'type_of_travel_business_travel', 'type_of_travel_personal_travel',
    ]
    target_names = ['label']

    # Model inputs first, then the target flattened to one dimension.
    test_inputs = df_test[input_names]
    y_test = df_test[target_names].to_numpy().ravel()

    y_pred_test = decision_tree_classifier.predict(test_inputs)
    return y_test, y_pred_test
       

In [21]:
def Model_Metrics(y, y_pred, max_depth):
    """Summarise classification quality as a one-column DataFrame.

    Precision, recall and F1 are computed with ``average='binary'`` and
    ``pos_label=1``, so the labels are expected to be binary with the
    positive class encoded as 1.

    Args:
        y: True labels.
        y_pred: Predicted labels, aligned with ``y``.
        max_depth: Depth setting of the evaluated model; used only to build
            the column name ``'max_depth = <value>'``.

    Returns:
        DataFrame indexed by 'accuracy_score', 'precision_score',
        'recall_score' and 'f1_score', with each value rounded to three
        decimals.
    """
    # The confusion matrix that used to be computed here was never used or
    # returned, so it has been dropped.
    scores = {
        'accuracy_score': mt.accuracy_score(y, y_pred),
        'precision_score': mt.precision_score(y, y_pred, average="binary", pos_label=1),
        'recall_score': mt.recall_score(y, y_pred, average='binary', pos_label=1),
        'f1_score': mt.f1_score(y, y_pred, average='binary', pos_label=1),
    }

    column_name = f"max_depth = {max_depth}"
    rounded = [np.round(value, 3) for value in scores.values()]

    df_metrics = pd.DataFrame(data={column_name: rounded}, index=list(scores))

    return df_metrics

# 5. Treinamento do modelo, validação e verificação de performance

### 5.1 Treinar o modelo com o valor default do parâmetro (max_depth = None) e verificar as métricas do modelo sobre os dados de treinamento

In [22]:
# Baseline fit: max_depth=None, i.e. no depth limit on the tree.
y_train, y_pred_train, decision_tree_classifier = Model_Training (df_train, None)

In [23]:
# Metrics of the baseline tree measured on the data it was fitted on.
df_default_train = Model_Metrics(y_train, y_pred_train, None)

In [24]:
df_default_train

Unnamed: 0,max_depth = None
accuracy_score,1.0
precision_score,1.0
recall_score,1.0
f1_score,1.0


### 5.2 Testar o algoritmo com o parâmetro default nos dados de validação e verificar sua performance

In [25]:
# Predict on the validation frame with the classifier fitted in the previous training cell.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, decision_tree_classifier)

In [26]:
# Validation metrics of the baseline (max_depth=None) tree.
df_default_val = Model_Metrics(y_val, y_pred_val, None)

In [28]:
df_default_val

Unnamed: 0,max_depth = None
accuracy_score,0.945
precision_score,0.934
recall_score,0.939
f1_score,0.936


### 5.3 Treinar o modelo nos dados de treinamento alterando os parâmetros e testar a performance desse modelo sobre os dados de validação

### a) max_depth = 10

In [29]:
# Depth-10 tree: fit on the training frame, then score on the validation frame.
y_train, y_pred_train, decision_tree_classifier = Model_Training(df_train=df_train, max_depth=10)
y_val, y_pred_val = Previsoes_Dataframe_Val(
    df_val=df_val,
    decision_tree_classifier=decision_tree_classifier,
)
df_max_depth_10 = Model_Metrics(y=y_val, y_pred=y_pred_val, max_depth=10)
df_max_depth_10

Unnamed: 0,max_depth = 10
accuracy_score,0.947
precision_score,0.954
recall_score,0.922
f1_score,0.938


### b) max_depth = 12

In [30]:
# Depth-12 tree: fit on the training frame, then score on the validation frame.
y_train, y_pred_train, decision_tree_classifier = Model_Training(df_train=df_train, max_depth=12)
y_val, y_pred_val = Previsoes_Dataframe_Val(
    df_val=df_val,
    decision_tree_classifier=decision_tree_classifier,
)
df_max_depth_12 = Model_Metrics(y=y_val, y_pred=y_pred_val, max_depth=12)
df_max_depth_12

Unnamed: 0,max_depth = 12
accuracy_score,0.952
precision_score,0.959
recall_score,0.929
f1_score,0.944


### c) max_depth = 14

In [31]:
# Depth-14 tree: fit on the training frame, then score on the validation frame.
y_train, y_pred_train, decision_tree_classifier = Model_Training(df_train=df_train, max_depth=14)
y_val, y_pred_val = Previsoes_Dataframe_Val(
    df_val=df_val,
    decision_tree_classifier=decision_tree_classifier,
)
df_max_depth_14 = Model_Metrics(y=y_val, y_pred=y_pred_val, max_depth=14)
df_max_depth_14

Unnamed: 0,max_depth = 14
accuracy_score,0.953
precision_score,0.957
recall_score,0.933
f1_score,0.945


### d) max_depth = 15

In [32]:
# Depth-15 tree: fit on the training frame, then score on the validation frame.
y_train, y_pred_train, decision_tree_classifier = Model_Training(df_train=df_train, max_depth=15)
y_val, y_pred_val = Previsoes_Dataframe_Val(
    df_val=df_val,
    decision_tree_classifier=decision_tree_classifier,
)
df_max_depth_15 = Model_Metrics(y=y_val, y_pred=y_pred_val, max_depth=15)
df_max_depth_15

Unnamed: 0,max_depth = 15
accuracy_score,0.952
precision_score,0.956
recall_score,0.931
f1_score,0.943


### e) max_depth = 18

In [33]:
# Depth-18 tree: fit on the training frame, then score on the validation frame.
y_train, y_pred_train, decision_tree_classifier = Model_Training(df_train=df_train, max_depth=18)
y_val, y_pred_val = Previsoes_Dataframe_Val(
    df_val=df_val,
    decision_tree_classifier=decision_tree_classifier,
)
df_max_depth_18 = Model_Metrics(y=y_val, y_pred=y_pred_val, max_depth=18)
df_max_depth_18

Unnamed: 0,max_depth = 18
accuracy_score,0.949
precision_score,0.946
recall_score,0.936
f1_score,0.941


### f) max_depth = 20

In [34]:
# Depth-20 tree: fit on the training frame, then score on the validation frame.
y_train, y_pred_train, decision_tree_classifier = Model_Training(df_train=df_train, max_depth=20)
y_val, y_pred_val = Previsoes_Dataframe_Val(
    df_val=df_val,
    decision_tree_classifier=decision_tree_classifier,
)
df_max_depth_20 = Model_Metrics(y=y_val, y_pred=y_pred_val, max_depth=20)
df_max_depth_20

Unnamed: 0,max_depth = 20
accuracy_score,0.948
precision_score,0.943
recall_score,0.937
f1_score,0.94


### g) max_depth = 25

In [35]:
# Depth-25 tree: fit on the training frame, then score on the validation frame.
y_train, y_pred_train, decision_tree_classifier = Model_Training(df_train=df_train, max_depth=25)
y_val, y_pred_val = Previsoes_Dataframe_Val(
    df_val=df_val,
    decision_tree_classifier=decision_tree_classifier,
)
df_max_depth_25 = Model_Metrics(y=y_val, y_pred=y_pred_val, max_depth=25)
df_max_depth_25

Unnamed: 0,max_depth = 25
accuracy_score,0.945
precision_score,0.935
recall_score,0.938
f1_score,0.936


### 5.4 Verificar o parâmetro que obteve os melhores resultados nos dados de validação

In [36]:
# Side-by-side table: one column of validation metrics per max_depth setting.
df_metrics_val = pd.concat(
    [
        df_default_val,
        df_max_depth_10,
        df_max_depth_12,
        df_max_depth_14,
        df_max_depth_15,
        df_max_depth_18,
        df_max_depth_20,
        df_max_depth_25,
    ],
    axis="columns",
)

In [37]:
df_metrics_val

Unnamed: 0,max_depth = None,max_depth = 10,max_depth = 12,max_depth = 14,max_depth = 15,max_depth = 18,max_depth = 20,max_depth = 25
accuracy_score,0.945,0.947,0.952,0.953,0.952,0.949,0.948,0.945
precision_score,0.934,0.954,0.959,0.957,0.956,0.946,0.943,0.935
recall_score,0.939,0.922,0.929,0.933,0.931,0.936,0.937,0.938
f1_score,0.936,0.938,0.944,0.945,0.943,0.941,0.94,0.936


#### Resposta: O parâmetro que obteve os melhores resultados nos dados de validação é max_depth = 14

# 6. Unificar os dados de treinamento e validação e retreinar o algoritmo, utilizando os melhores valores para os parâmetros

### 6.1 Dividir novamente os datasets de treinamento e validação

In [38]:
# Names of the model input columns and of the target column.
features = [
    'id', 'customer_type', 'age', 'class', 'flight_distance',
    'inflight_wifi_service', 'departure_arrival_time_convenient',
    'ease_of_online_booking', 'gate_location', 'food_and_drink',
    'online_boarding', 'seat_comfort', 'inflight_entertainment',
    'on_board_service', 'leg_room_service', 'baggage_handling',
    'checkin_service', 'inflight_service', 'cleanliness',
    'departure_delay_in_minutes', 'arrival_delay_in_minutes',
    'gender_Female', 'gender_Male',
    'type_of_travel_business_travel', 'type_of_travel_personal_travel',
]
label = ['label']

# Training split: input frame plus the target flattened to a 1-D array.
x_train = df_train[features]
y_train = df_train[label].to_numpy().ravel()

# Validation split, same layout as the training split.
x_val = df_val[features]
y_val = df_val[label].to_numpy().ravel()

### 6.2 Treinar o modelo novamente com o melhor parâmetro obtido

In [39]:
# Refit with the depth chosen on validation, using training and validation rows together.
model_last = DecisionTreeClassifier(max_depth = 14)
# Stack the feature frames with pd.concat rather than np.concatenate so the
# column names survive: the model is later asked to predict on a DataFrame,
# and scikit-learn warns when an estimator fitted without feature names
# receives named columns. The labels are already plain arrays.
model_last.fit(
    pd.concat([x_train, x_val], ignore_index=True),
    np.concatenate((y_train, y_val)),
)

DecisionTreeClassifier(max_depth=14)

### 6.3 Fazendo previsões nos dados de teste 

In [40]:
# Predict on the test frame with the model refitted on training + validation rows.
y_test, y_pred_test = Previsoes_Dataframe_Test(df_test, model_last)



### 6.4 Verificando as métricas do modelo com os dados de teste

In [41]:
# Test-set metrics; the 14 only labels the column, it does not refit anything.
df_metrics_test = Model_Metrics(y_test, y_pred_test, 14)

In [42]:
df_metrics_test

Unnamed: 0,max_depth = 14
accuracy_score,0.956
precision_score,0.956
recall_score,0.943
f1_score,0.95


# 7. Resguardando os objetos com as métricas de maior performance

### 7.1 Alterando o nome das colunas dos dataframes

#### a) dataframe teste com melhores métricas

In [43]:
# Relabel the single metrics column with the algorithm name instead of the depth setting.
df_metrics_test = df_metrics_test.rename(columns = {"max_depth = 14": "Decision Tree Classifier"})

In [44]:
df_metrics_test

Unnamed: 0,Decision Tree Classifier
accuracy_score,0.956
precision_score,0.956
recall_score,0.943
f1_score,0.95


#### b) dataframe validação com melhores métricas

In [45]:
# Relabel the single metrics column with the algorithm name instead of the depth setting.
df_max_depth_14 = df_max_depth_14.rename(columns = {"max_depth = 14": "Decision Tree Classifier"})

In [46]:
df_max_depth_14

Unnamed: 0,Decision Tree Classifier
accuracy_score,0.953
precision_score,0.957
recall_score,0.933
f1_score,0.945


#### c) dataframe treinamento com melhores métricas

In [47]:
# Relabel the single metrics column with the algorithm name instead of the depth setting.
# NOTE(review): this table holds the training metrics of the max_depth=None fit,
# whereas the validation and test tables saved alongside it come from
# max_depth=14 - confirm that mixing the two configurations is intended.
df_default_train = df_default_train.rename(columns={"max_depth = None": "Decision Tree Classifier"})

In [48]:
df_default_train

Unnamed: 0,Decision Tree Classifier
accuracy_score,1.0
precision_score,1.0
recall_score,1.0
f1_score,1.0


### 7.2 Salvar os objetos com as melhores métricas em um arquivo pickle

### a) dataframe com métricas dos dados de treinamento

In [49]:
# Serialise the training-metrics table to 'arquivo_decision_tree_train.pkl'
# (binary write mode); the `with` block closes the file on exit.
with open('arquivo_decision_tree_train.pkl', 'wb') as arquivo_decision_tree_train:
    pickle.dump(df_default_train, arquivo_decision_tree_train)  

In [50]:
# Redundant: the `with` block that opened this handle already closed it,
# so this close() call is a no-op.
arquivo_decision_tree_train.close()

### b) dataframe com métricas dos dados de validação

In [51]:
# Serialise the validation-metrics table to 'arquivo_decision_tree_val.pkl'
# (binary write mode); the `with` block closes the file on exit.
with open('arquivo_decision_tree_val.pkl', 'wb') as arquivo_decision_tree_val:
    pickle.dump(df_max_depth_14, arquivo_decision_tree_val) 

In [52]:
# Redundant: the `with` block that opened this handle already closed it,
# so this close() call is a no-op.
arquivo_decision_tree_val.close()

### c) dataframe com métricas dos dados de teste

In [53]:
# Serialise the test-metrics table to 'arquivo_decision_tree_test.pkl'
# (binary write mode); the `with` block closes the file on exit.
with open('arquivo_decision_tree_test.pkl', 'wb') as arquivo_decision_tree_test:
    pickle.dump(df_metrics_test, arquivo_decision_tree_test) 

In [54]:
# Redundant: the `with` block that opened this handle already closed it,
# so this close() call is a no-op.
arquivo_decision_tree_test.close()