# 1.Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics as mt
from sklearn.linear_model import LogisticRegression
import pickle

# 2. Funcao para carregar os datasets

In [2]:
def Load_Data_Set(df_x_train, df_y_train):
    df1 = pd.read_csv(df_x_train)
    df2 = pd.read_csv(df_y_train)
    return df1, df2

# 3. Criando e concatenando os dataframes

### 3.1 Dataset de treinamento

In [3]:
df_x_train = "../1_ensaio_classificacao/1_dados_treinamento/X_training.csv"

In [4]:
df_y_train = "../1_ensaio_classificacao/1_dados_treinamento/y_training.csv"

In [5]:
df1_train, df2_train = Load_Data_Set(df_x_train, df_y_train)

In [6]:
df1_train['label'] = df2_train

In [7]:
df_train = df1_train.copy()

### 3.2 Dataset de validacao

In [8]:
df_x_val = "../1_ensaio_classificacao/2_dados_validacao/X_validation.csv"

In [9]:
df_y_val = "../1_ensaio_classificacao/2_dados_validacao/y_validation.csv"

In [10]:
df1_val, df2_val = Load_Data_Set(df_x_val, df_y_val)

In [11]:
df1_val['label'] = df2_val

In [12]:
df_val = df1_val.copy()

### 3.3 Dataset de teste

In [13]:
df_x_test = "../1_ensaio_classificacao/3_dados_teste/X_test.csv"

In [14]:
df_y_test = "../1_ensaio_classificacao/3_dados_teste/y_test.csv"

In [15]:
df1_test, df2_test = Load_Data_Set(df_x_test, df_y_test)

In [16]:
df1_test['label'] = df2_test

In [17]:
df_test = df1_test.copy()

# 4. Funções para treinar o modelo e avaliar as métricas

In [18]:
#Criando uma função para Treinar o Modelo encima dos dados de treinamento

#Fazer as previsoes encima dos proprios dados de treinamento

#Entrada: dataframe de treinamento, parametros - C, solver, max_iter

#Saida: valores de y_train + valores de previsao feitos a partir dos dados de treinamento (y_pred_train) +
# algoritmo logistic regression treinado

def Model_Training(df_train,C, solver, max_iter): 
    
    features = ['id', 'customer_type', 'age', 'class', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'on_board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'gender_Female', 'gender_Male', 'type_of_travel_business_travel','type_of_travel_personal_travel']
    
    
    label = ['label']
    
    #Preparação dos dados de treino
    x_train = df_train.loc[:, features]

    #Esse metodo ravel transforma os valores em um array
    y_train = df_train.loc[:, label].values.ravel()      
    
    #Treinamento do algoritmo Logistic Regression
    #No caso do algoritmo Logistic Regression, precisamos ajustar os seguintes parametros
    
    #C = forca da regularizacao aplicada
    #solver = algoritmo a ser utilizado na otimizacao do problema
    #max_iter = numero maximo de iteracoes para convergir
    
    logistic_regression = LogisticRegression(C = C, solver = solver, max_iter = max_iter)
    
    logistic_regression.fit(x_train, y_train)

    #Fazendo previsoes emcima dos dados de treinamento
    y_pred_train = logistic_regression.predict(x_train)
    
    return y_train, y_pred_train, logistic_regression



In [19]:
def Previsoes_Dataframe_Test(df_test, logistic_regression):
    
    #Entrada: dataset de teste +  algoritmo Logistic Regression treinado
    
    #Saida: os dados de y do proprio dataset de teste (y_test) e as previsoes feitas encima dos dados de teste (y_pred_test)
    
    features = ['id', 'customer_type', 'age', 'class', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'on_board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'gender_Female', 'gender_Male', 'type_of_travel_business_travel','type_of_travel_personal_travel']
    
    
    label = ['label']    
       
    #Separando os dados
    x_test = df_test.loc[:, features]

    y_test = df_test.loc[:, label].values.ravel() 
    
    #Fazendo previsões    
    y_pred_test = logistic_regression.predict(x_test)
    
    return y_test, y_pred_test 
       

In [20]:
def Previsoes_Dataframe_Val(df_val, logistic_regression):
    
   #Entrada: dataset de validacao +  algoritmo Logistic Regression treinado
    
   #Saida: os dados de y do proprio dataset de validacao (y_val) e as previsoes feitas encima dos dados de validacao
   #(y_pred_val)
    

    features = ['id', 'customer_type', 'age', 'class', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'on_board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'gender_Female', 'gender_Male', 'type_of_travel_business_travel','type_of_travel_personal_travel']
    
    
    label = ['label']
    
           
    #Separando os dados
    x_val = df_val.loc[:, features]

    y_val = df_val.loc[:, label].values.ravel() 
    
    #Fazendo previsões
    y_pred_val = logistic_regression.predict(x_val)
    
    return y_val, y_pred_val
        

In [21]:
#Criando uma função para avaliar as metricas do modelo

#Entrada: Os proprios valores da label daquele dataset, as previsoes feitas a partir desses dados, os parametros C,
# solver, max_iter

#Saida: Dataframe com as principais metricas do modelo

def Model_Metrics(y, y_pred,C, solver, max_iter):
    #Confusion Matrix
    confusion_matrix = mt.confusion_matrix(y , y_pred )
        
    #Accuracy Score
    accuracy_score = np.round(mt.accuracy_score(y, y_pred ),2)
        
    #Precision Score
    precision_score = np.round(mt.precision_score( y, y_pred, average="binary", pos_label= 1),2)
        
    #Recall Score
    recall_score = np.round(mt.recall_score( y, y_pred, average='binary', pos_label=1 ),2)
   
    #F1 Score
    f1_score = np.round(mt.f1_score( y, y_pred, average='binary', pos_label=1 ),2)
    
    #Criando um dataframe com as metricas
    
    parameters = 'C = ' + str(C) + ' ;solver = ' + str(solver) + '; max_iter= ' + str(max_iter) 
    
    d = { parameters: [accuracy_score, precision_score, recall_score, f1_score]}  
    
    df_metrics =  pd.DataFrame(data = d, index = ['accuracy_score', 'precision_score', 'recall_score', 'f1_score'])
    
    return df_metrics

# 5. Treinamento do modelo, validação e verificação de performance

### 5.1 Treinar o modelo com os valores de parametro default: C = 1, solver = lbfgs, max_iter = 100 e verificação das métricas do modelo sobre os dados de treinamento

In [22]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 1, 'lbfgs', 100)
df_default_train = Model_Metrics(y_train, y_pred_train, 1, 'lbfgs', 100)
df_default_train

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C = 1 ;solver = lbfgs; max_iter= 100
accuracy_score,0.57
precision_score,0.0
recall_score,0.0
f1_score,0.0


### 5.2 Testar o algoritmo com o parametro default  nos dados de validação e verificar sua performance

In [23]:
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, logistic_regression)
df_val_default = Model_Metrics(y_val, y_pred_val, 1, 'lbfgs', 100)
df_val_default

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C = 1 ;solver = lbfgs; max_iter= 100
accuracy_score,0.57
precision_score,0.0
recall_score,0.0
f1_score,0.0


### 5.3 Treinar o modelo nos dados de treinamento alterando os parametros e testar a performance desse modelo sobre os dados de validação

### a)C = 1, solver = "lbfgs", max_iter=150

In [24]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 1, 'lbfgs', 150)
df_val1 = Model_Metrics(y_train, y_pred_train, 1, 'lbfgs', 150)
df_val1

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C = 1 ;solver = lbfgs; max_iter= 150
accuracy_score,0.57
precision_score,0.0
recall_score,0.0
f1_score,0.0


### b) C = 0.8, solver = "lbfgs", max_iter=100

In [25]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 0.8, 'lbfgs', 100)
df_val2 = Model_Metrics(y_train, y_pred_train, 0.8, 'lbfgs', 100)
df_val2

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C = 0.8 ;solver = lbfgs; max_iter= 100
accuracy_score,0.57
precision_score,0.0
recall_score,0.0
f1_score,0.0


### c) C = 1, solver = "liblinear", max_iter=100

In [26]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 1, 'liblinear', 100)
df_val3 = Model_Metrics(y_train, y_pred_train, 1, 'liblinear', 100)
df_val3

Unnamed: 0,C = 1 ;solver = liblinear; max_iter= 100
accuracy_score,0.79
precision_score,0.73
recall_score,0.83
f1_score,0.78


### d) C = 1, solver = "liblinear", max_iter=150

In [27]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 1, 'liblinear', 150)
df_val4 = Model_Metrics(y_train, y_pred_train, 1, 'liblinear', 150)
df_val4

Unnamed: 0,C = 1 ;solver = liblinear; max_iter= 150
accuracy_score,0.79
precision_score,0.73
recall_score,0.83
f1_score,0.78


### e) C = 0.7, solver = "liblinear", max_iter = 150

In [28]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 0.7, 'liblinear', 150)
df_val5 = Model_Metrics(y_train, y_pred_train, 0.7, 'liblinear', 150)
df_val5

Unnamed: 0,C = 0.7 ;solver = liblinear; max_iter= 150
accuracy_score,0.79
precision_score,0.73
recall_score,0.83
f1_score,0.78


### f)  C = 0.2, solver = "liblinear", max_iter = 100

In [29]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 0.2, 'liblinear', 100)
df_val6 = Model_Metrics(y_train, y_pred_train, 0.2, 'liblinear', 100)
df_val6

Unnamed: 0,C = 0.2 ;solver = liblinear; max_iter= 100
accuracy_score,0.84
precision_score,0.79
recall_score,0.84
f1_score,0.82


### g)  C = 0.2, solver = "liblinear", max_iter = 70

In [30]:
y_train, y_pred_train, logistic_regression = Model_Training (df_train, 0.2, 'liblinear', 70)
df_val7 = Model_Metrics(y_train, y_pred_train, 0.2, 'liblinear', 70)
df_val7

Unnamed: 0,C = 0.2 ;solver = liblinear; max_iter= 70
accuracy_score,0.84
precision_score,0.79
recall_score,0.84
f1_score,0.82


### 5.4 Verificar o parametro que obteve os melhores resultados nos dados de validação

In [31]:
df_metrics_val = pd.concat([df_val_default, df_val1, df_val2, df_val3, df_val4, df_val5, df_val6, df_val7], axis = 1)

In [32]:
df_metrics_val

Unnamed: 0,C = 1 ;solver = lbfgs; max_iter= 100,C = 1 ;solver = lbfgs; max_iter= 150,C = 0.8 ;solver = lbfgs; max_iter= 100,C = 1 ;solver = liblinear; max_iter= 100,C = 1 ;solver = liblinear; max_iter= 150,C = 0.7 ;solver = liblinear; max_iter= 150,C = 0.2 ;solver = liblinear; max_iter= 100,C = 0.2 ;solver = liblinear; max_iter= 70
accuracy_score,0.57,0.57,0.57,0.79,0.79,0.79,0.84,0.84
precision_score,0.0,0.0,0.0,0.73,0.73,0.73,0.79,0.79
recall_score,0.0,0.0,0.0,0.83,0.83,0.83,0.84,0.84
f1_score,0.0,0.0,0.0,0.78,0.78,0.78,0.82,0.82


#### Resposta: Os parâmetros que obteveram os melhores resultados nos dados de validação são os parâmetros:
#### C = 0.2, solver ='liblinear', max_iter = 100 que estão presentes no objeto df_val6

# 6. Unificar os dados de treinamento e validação e retreinar o algoritmo, utilizando os melhores valores para os parâmetros

### 6.1 Unificar novamente os datasets de treinamento e validação

In [33]:
features = ['id', 'customer_type', 'age', 'class', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'on_board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'gender_Female', 'gender_Male', 'type_of_travel_business_travel','type_of_travel_personal_travel']
    
label = ['label']    
    
#Separação dos dados de treino
x_train = df_train.loc[:, features]

#Esse metodo ravel transforma os valores em um array
y_train = df_train.loc[:, label].values.ravel()      

#Separação dos dados de validação
x_val = df_val.loc[:, features]

y_val = df_val.loc[:, label].values.ravel()    

### 6.2 Treinar o modelo novamente com o melhor parametro obtido

In [34]:
model_last = LogisticRegression(C = 0.2, solver ='liblinear', max_iter = 100)
model_last.fit( np.concatenate( (x_train, x_val) ), np.concatenate((y_train, y_val)) )

LogisticRegression(C=0.2, solver='liblinear')

### 6.3 Fazendo previsões nos dados de teste 

In [35]:
y_test, y_pred_test = Previsoes_Dataframe_Test(df_test, model_last)



### 6.4 Verificando as métricas do modelo com os dados de teste

In [36]:
df_metrics_test = Model_Metrics(y_test, y_pred_test, 0.2, 'liblinear', 100)

In [37]:
df_metrics_test

Unnamed: 0,C = 0.2 ;solver = liblinear; max_iter= 100
accuracy_score,0.64
precision_score,0.63
recall_score,0.44
f1_score,0.52


### 6.5 Dataframe comparando as métricas obtidas sob os dados de validação e sob os dados de teste

In [38]:
df_metrics_test = df_metrics_test.rename(columns = {"C = 0.2 ;solver = liblinear; max_iter= 100": "dados_teste"})
df_metrics_test

Unnamed: 0,dados_teste
accuracy_score,0.64
precision_score,0.63
recall_score,0.44
f1_score,0.52


In [39]:
df_val6 = df_val6.rename(columns = {"C = 0.2 ;solver = liblinear; max_iter= 100": "dados_validação"})
df_val6

Unnamed: 0,dados_validação
accuracy_score,0.84
precision_score,0.79
recall_score,0.84
f1_score,0.82


In [40]:
df_comparative = pd.concat([df_metrics_test, df_val6], axis = 1)
df_comparative

Unnamed: 0,dados_teste,dados_validação
accuracy_score,0.64,0.84
precision_score,0.63,0.79
recall_score,0.44,0.84
f1_score,0.52,0.82


#### Observação 1: As métricas obtidas nos dados de validação com os parametros  C = 0.2,solver = 'liblinear',max_iter = 100 não foram repetidas com os dados de teste. Podemos dizer que é um caso de overfitting, a saber o modelo apreendeu profundamente as relações nos dados de validação, mas não consegue generalizar para os dados de teste

# 6.6 Vamos refazer o processo e verificar a performance do modelo retreinado com os parametros default sob os dados de teste

### 6.7 Treinar o modelo novamente

In [41]:
model_last = LogisticRegression(C = 1,solver = 'lbfgs', max_iter= 100)
model_last.fit( np.concatenate( (x_train, x_val) ), np.concatenate((y_train, y_val)) )

LogisticRegression(C=1)

### 6.8 Fazendo as previsões nos dados de teste

In [42]:
y_test, y_pred_test = Previsoes_Dataframe_Test(df_test, model_last)



### 6.9 Verificando as métricas do modelo com os dados de teste

In [43]:
df_metrics_test = Model_Metrics(y_test, y_pred_test, 1, 'lbfgs', 100)
df_metrics_test

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C = 1 ;solver = lbfgs; max_iter= 100
accuracy_score,0.56
precision_score,0.0
recall_score,0.0
f1_score,0.0


# 7. Resguardando os objetos com as métricas de maior performance

### 7.1 Alterando o nome das colunas dos dataframes

#### a) dataframe teste com melhores métricas

In [44]:
df_metrics_test = df_metrics_test.rename(columns = {"C = 1 ;solver = lbfgs; max_iter= 100": "Logistic Regression"})

In [45]:
df_metrics_test

Unnamed: 0,Logistic Regression
accuracy_score,0.56
precision_score,0.0
recall_score,0.0
f1_score,0.0


#### b) dataframe validação com melhores métricas

In [46]:
df_val_default = df_val_default.rename(columns = {"C = 1 ;solver = lbfgs; max_iter= 100": "Logistic Regression"})

In [47]:
df_val_default

Unnamed: 0,Logistic Regression
accuracy_score,0.57
precision_score,0.0
recall_score,0.0
f1_score,0.0


#### c) dataframe treinamento com melhores métricas

In [48]:
df_default_train = df_default_train.rename(columns={"C = 1 ;solver = lbfgs; max_iter= 100": "Logistic Regression"})

In [49]:
df_default_train

Unnamed: 0,Logistic Regression
accuracy_score,0.57
precision_score,0.0
recall_score,0.0
f1_score,0.0


### 7.2 Salvar os objetos com as melhores metricas em um arquivo pickle

### a) dataframe com métricas dos dados de treinamento

In [50]:
with open('arquivo_logistic_regression_train.pkl', 'wb') as arquivo_logistic_regression_train:
    pickle.dump(df_default_train, arquivo_logistic_regression_train)  

In [51]:
arquivo_logistic_regression_train.close()

### b) dataframe com métricas dos dados de validação

In [52]:
with open('arquivo_logistic_regression_val.pkl', 'wb') as arquivo_logistic_regression_val:
    pickle.dump(df_val_default, arquivo_logistic_regression_val)  

In [53]:
arquivo_logistic_regression_val.close()

### c) dataframe com métricas dos dados de teste

In [54]:
with open('arquivo_logistic_regression_teste.pkl', 'wb') as arquivo_logistic_regression_teste:
    pickle.dump(df_metrics_test, arquivo_logistic_regression_teste)  

In [55]:
arquivo_logistic_regression_teste.close()