# 1.Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics as mt
from sklearn.neighbors import KNeighborsClassifier
import pickle

# 2. Função para carregar os datasets

In [2]:
def Load_Data_Set(df_x_train, df_y_train):
    """Read a features CSV and a labels CSV into two DataFrames.

    Despite the ``df_`` prefix, both arguments are handed straight to
    ``pd.read_csv``, so they are CSV locations rather than DataFrames.

    Args:
        df_x_train: Location of the features CSV.
        df_y_train: Location of the labels CSV.

    Returns:
        A ``(features, labels)`` tuple of DataFrames, in argument order.
    """
    features_frame, labels_frame = (
        pd.read_csv(source) for source in (df_x_train, df_y_train)
    )
    return features_frame, labels_frame

# 3. Criando e concatenando os dataframes

### 3.1 Dataset de treinamento

In [3]:
# Location of the training features CSV, relative to the notebook's working directory.
df_x_train = "../1_ensaio_classificacao/1_dados_treinamento/X_training.csv"

In [4]:
# Location of the training labels CSV, relative to the notebook's working directory.
df_y_train = "../1_ensaio_classificacao/1_dados_treinamento/y_training.csv"

In [5]:
# df1_train receives the features frame, df2_train the labels frame.
df1_train, df2_train = Load_Data_Set(df_x_train, df_y_train)

In [6]:
# Attach the labels to the features frame as a new 'label' column (mutates df1_train in place).
# NOTE(review): df2_train is a whole DataFrame, not a Series — presumably y_training.csv holds a
# single column in the same row order as X_training.csv; confirm.
df1_train['label'] = df2_train

In [7]:
# Independent copy used by the rest of the notebook as the training set (features + 'label').
df_train = df1_train.copy()

### 3.2 Dataset de validação

In [8]:
# Location of the validation features CSV, relative to the notebook's working directory.
df_x_val = "../1_ensaio_classificacao/2_dados_validacao/X_validation.csv"

In [9]:
# Location of the validation labels CSV, relative to the notebook's working directory.
df_y_val = "../1_ensaio_classificacao/2_dados_validacao/y_validation.csv"

In [10]:
# df1_val receives the features frame, df2_val the labels frame.
df1_val, df2_val = Load_Data_Set(df_x_val, df_y_val)

In [11]:
# Attach the labels to the features frame as a new 'label' column (mutates df1_val in place).
# NOTE(review): df2_val is a whole DataFrame, not a Series — presumably y_validation.csv holds a
# single column in the same row order as X_validation.csv; confirm.
df1_val['label'] = df2_val

In [12]:
# Independent copy used by the rest of the notebook as the validation set (features + 'label').
df_val = df1_val.copy()

### 3.3 Dataset de teste

In [13]:
# Location of the test features CSV, relative to the notebook's working directory.
df_x_test = "../1_ensaio_classificacao/3_dados_teste/X_test.csv"

In [14]:
# Location of the test labels CSV, relative to the notebook's working directory.
df_y_test = "../1_ensaio_classificacao/3_dados_teste/y_test.csv"

In [15]:
# df1_test receives the features frame, df2_test the labels frame.
df1_test, df2_test = Load_Data_Set(df_x_test, df_y_test)

In [16]:
# Attach the labels to the features frame as a new 'label' column (mutates df1_test in place).
# NOTE(review): df2_test is a whole DataFrame, not a Series — presumably y_test.csv holds a
# single column in the same row order as X_test.csv; confirm.
df1_test['label'] = df2_test

In [17]:
# Independent copy used by the rest of the notebook as the test set (features + 'label').
df_test = df1_test.copy()

# 4. Funções para treinar o modelo e avaliar as métricas

In [18]:
def Model_Training(df_train, neighbors):
    """Fit a KNN classifier on the training frame and predict on that same frame.

    Args:
        df_train: DataFrame holding the five feature columns selected below
            and a ``label`` column.
        neighbors: Value passed to ``KNeighborsClassifier`` as ``n_neighbors``.

    Returns:
        ``(y_train, y_pred_train, knn_classifier)``: the ``label`` column
        flattened to a 1-D array, the predictions made on the training
        features themselves, and the fitted classifier.
    """
    feature_columns = [
        'age',
        'flight_distance',
        'departure_arrival_time_convenient',
        'departure_delay_in_minutes',
        'arrival_delay_in_minutes',
    ]

    inputs = df_train[feature_columns]
    # ravel() flattens the single-column (n, 1) block into the 1-D vector fit() is given.
    targets = df_train[['label']].values.ravel()

    model = KNeighborsClassifier(n_neighbors=neighbors)
    model.fit(inputs, targets)

    # Predictions on the rows the model was fitted on.
    fitted_predictions = model.predict(inputs)

    return targets, fitted_predictions, model
   

In [19]:
def Previsoes_Dataframe_Test(df_test, knn_classifier):
    """Predict labels for the test frame with an already-fitted classifier.

    Args:
        df_test: DataFrame holding the five feature columns selected below
            and a ``label`` column.
        knn_classifier: Object exposing ``predict``; it is called once with
            the selected feature columns.

    Returns:
        ``(y_test, y_pred_test)``: the ``label`` column flattened to a 1-D
        array, and whatever ``knn_classifier.predict`` returns.
    """
    feature_columns = [
        'age',
        'flight_distance',
        'departure_arrival_time_convenient',
        'departure_delay_in_minutes',
        'arrival_delay_in_minutes',
    ]

    inputs = df_test[feature_columns]
    targets = df_test[['label']].values.ravel()

    return targets, knn_classifier.predict(inputs)
       

In [20]:
def Previsoes_Dataframe_Val(df_val, knn_classifier):
    """Predict labels for the validation frame with an already-fitted classifier.

    Args:
        df_val: DataFrame holding the five feature columns selected below
            and a ``label`` column.
        knn_classifier: Object exposing ``predict``; it is called once with
            the selected feature columns.

    Returns:
        ``(y_val, y_pred_val)``: the ``label`` column flattened to a 1-D
        array, and whatever ``knn_classifier.predict`` returns.
    """
    feature_columns = [
        'age',
        'flight_distance',
        'departure_arrival_time_convenient',
        'departure_delay_in_minutes',
        'arrival_delay_in_minutes',
    ]

    inputs = df_val[feature_columns]
    targets = df_val[['label']].values.ravel()

    return targets, knn_classifier.predict(inputs)
        

In [21]:
def Model_Metrics(y, y_pred, neighbors):
    """Summarise binary-classification quality as a one-column DataFrame.

    Accuracy, precision, recall and F1 are computed with ``sklearn.metrics``
    and rounded to three decimals. Precision, recall and F1 are called with
    ``average='binary'`` and ``pos_label=1``, so they score class ``1``.

    Args:
        y: True labels.
        y_pred: Predicted labels, aligned with ``y``.
        neighbors: Used only to build the column name
            ``'neighbors = <neighbors>'``; it has no effect on the scores.

    Returns:
        DataFrame indexed by ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score``, with a single column.
    """
    # No confusion matrix is built here: nothing in this function consumes or returns one.
    positive_class = {'average': 'binary', 'pos_label': 1}

    # Insertion order fixes the row order of the resulting frame.
    scores = {
        'accuracy_score': mt.accuracy_score(y, y_pred),
        'precision_score': mt.precision_score(y, y_pred, **positive_class),
        'recall_score': mt.recall_score(y, y_pred, **positive_class),
        'f1_score': mt.f1_score(y, y_pred, **positive_class),
    }

    column_name = 'neighbors = ' + str(neighbors)
    rounded = [np.round(value, 3) for value in scores.values()]

    return pd.DataFrame(data={column_name: rounded}, index=list(scores))

# 5. Treinamento do modelo, validação e verificação de performance

### 5.1 Treinar o modelo com os valores de parametro default: n_neighbors = 5 e verificação das métricas do modelo sobre os dados de treinamento

In [22]:
# Fit KNN with n_neighbors = 5 and predict on the training rows themselves.
y_train, y_pred_train, knn_classifier = Model_Training (df_train, 5)

In [23]:
# Training-set metrics for n_neighbors = 5 (the model is scored on the rows it was fitted on).
df_default_train = Model_Metrics(y_train, y_pred_train, 5)

In [24]:
df_default_train

Unnamed: 0,neighbors = 5
accuracy_score,0.761
precision_score,0.739
recall_score,0.696
f1_score,0.716


### 5.2 Testar o algoritmo com o parâmetro default nos dados de validação e verificar sua performance

In [25]:
# Validation predictions from the n_neighbors = 5 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, knn_classifier)

In [26]:
# Validation metrics for n_neighbors = 5; the 5 passed here only labels the output column.
df_val_default = Model_Metrics(y_val, y_pred_val, 5)

In [27]:
df_val_default

Unnamed: 0,neighbors = 5
accuracy_score,0.648
precision_score,0.6
recall_score,0.562
f1_score,0.581


### 5.3 Treinar o modelo nos dados de treinamento alterando os parametros e testar a performance desse modelo sobre os dados de validação

### a) neighbors = 3

In [28]:
# Refit on the training set with n_neighbors = 3; rebinds y_train, y_pred_train and knn_classifier.
y_train, y_pred_train , knn_classifier = Model_Training(df_train, 3)

In [29]:
# Validation predictions from the n_neighbors = 3 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val,knn_classifier) 

In [30]:
# Validation metrics for n_neighbors = 3; the 3 passed here only labels the output column.
df_val_neighbors3 = Model_Metrics(y_val, y_pred_val, 3)

In [31]:
df_val_neighbors3

Unnamed: 0,neighbors = 3
accuracy_score,0.635
precision_score,0.582
recall_score,0.561
f1_score,0.571


### b) neighbors = 4

In [32]:
# Refit on the training set with n_neighbors = 4; rebinds y_train, y_pred_train and knn_classifier.
y_train, y_pred_train , knn_classifier = Model_Training(df_train, 4)

In [33]:
# Validation predictions from the n_neighbors = 4 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val,knn_classifier) 

In [34]:
# Validation metrics for n_neighbors = 4; the 4 passed here only labels the output column.
df_val_neighbors4 = Model_Metrics(y_val, y_pred_val, 4)

In [35]:
df_val_neighbors4

Unnamed: 0,neighbors = 4
accuracy_score,0.645
precision_score,0.636
recall_score,0.424
f1_score,0.509


### c) neighbors = 6

In [36]:
# Refit on the training set with n_neighbors = 6; rebinds y_train, y_pred_train and knn_classifier.
y_train, y_pred_train , knn_classifier = Model_Training(df_train, 6)

In [37]:
# Validation predictions from the n_neighbors = 6 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val,knn_classifier) 

In [38]:
# Validation metrics for n_neighbors = 6; the 6 passed here only labels the output column.
df_val_neighbors6 = Model_Metrics(y_val, y_pred_val, 6)

In [39]:
df_val_neighbors6

Unnamed: 0,neighbors = 6
accuracy_score,0.654
precision_score,0.638
recall_score,0.468
f1_score,0.54


### d) neighbors = 7

In [40]:
# Refit on the training set with n_neighbors = 7; rebinds y_train, y_pred_train and knn_classifier.
y_train, y_pred_train , knn_classifier = Model_Training(df_train, 7)

In [41]:
# Validation predictions from the n_neighbors = 7 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val,knn_classifier) 

In [42]:
# Validation metrics for n_neighbors = 7; the 7 passed here only labels the output column.
df_val_neighbors7 = Model_Metrics(y_val, y_pred_val, 7)

In [43]:
df_val_neighbors7

Unnamed: 0,neighbors = 7
accuracy_score,0.657
precision_score,0.613
recall_score,0.569
f1_score,0.59


### e) neighbors = 9

In [44]:
# Refit on the training set with n_neighbors = 9; rebinds y_train, y_pred_train and knn_classifier.
y_train, y_pred_train , knn_classifier = Model_Training(df_train, 9)

In [45]:
# Validation predictions from the n_neighbors = 9 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val,knn_classifier) 

In [46]:
# Validation metrics for n_neighbors = 9; the 9 passed here only labels the output column.
df_val_neighbors9 = Model_Metrics(y_val, y_pred_val, 9)

In [47]:
df_val_neighbors9

Unnamed: 0,neighbors = 9
accuracy_score,0.664
precision_score,0.622
recall_score,0.572
f1_score,0.596


### f) neighbors = 10

In [48]:
# Refit on the training set with n_neighbors = 10; rebinds y_train, y_pred_train and knn_classifier.
y_train, y_pred_train , knn_classifier = Model_Training(df_train, 10)

In [49]:
# Validation predictions from the n_neighbors = 10 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val,knn_classifier) 

In [50]:
# Validation metrics for n_neighbors = 10; the 10 passed here only labels the output column.
df_val_neighbors10 = Model_Metrics(y_val, y_pred_val, 10)

In [51]:
df_val_neighbors10

Unnamed: 0,neighbors = 10
accuracy_score,0.667
precision_score,0.649
recall_score,0.504
f1_score,0.567


### g) neighbors = 15

In [52]:
# Refit on the training set with n_neighbors = 15; rebinds y_train, y_pred_train and knn_classifier.
y_train, y_pred_train , knn_classifier = Model_Training(df_train, 15)

In [53]:
# Validation predictions from the n_neighbors = 15 classifier currently bound to knn_classifier.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val,knn_classifier) 

In [54]:
# Validation metrics for n_neighbors = 15; the 15 passed here only labels the output column.
df_val_neighbors15 = Model_Metrics(y_val, y_pred_val, 15)

In [55]:
df_val_neighbors15

Unnamed: 0,neighbors = 15
accuracy_score,0.674
precision_score,0.637
recall_score,0.577
f1_score,0.606


### 5.4 Verificar o parametro que obteve os melhores resultados nos dados de validação

In [56]:
# Side-by-side comparison of the validation metrics: one column per n_neighbors value, listed in
# increasing order of k (df_val_default is the n_neighbors = 5 column).
df_metrics_val = pd.concat([df_val_neighbors3, df_val_neighbors4, df_val_default, df_val_neighbors6,
          df_val_neighbors7, df_val_neighbors9, df_val_neighbors10, df_val_neighbors15], axis = 1)

In [57]:
df_metrics_val

Unnamed: 0,neighbors = 3,neighbors = 4,neighbors = 5,neighbors = 6,neighbors = 7,neighbors = 9,neighbors = 10,neighbors = 15
accuracy_score,0.635,0.645,0.648,0.654,0.657,0.664,0.667,0.674
precision_score,0.582,0.636,0.6,0.638,0.613,0.622,0.649,0.637
recall_score,0.561,0.424,0.562,0.468,0.569,0.572,0.504,0.577
f1_score,0.571,0.509,0.581,0.54,0.59,0.596,0.567,0.606


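#### Resposta: O parâmetro que obteve os melhores resultados nos dados de validação é neighbors = 15 (maior accuracy, recall e F1 entre os valores testados; a maior precision foi obtida com neighbors = 10)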

# 6. Unificar os dados de treinamento e validação e retreinar o algoritmo, utilizando os melhores valores para os parâmetros

### 6.1 Dividir novamente os datasets de treinamento e validação

In [58]:
# Same feature and label columns that Model_Training and the Previsoes_* helpers select internally,
# repeated here because those helpers do not return the feature matrices.
features = ['age', 'flight_distance', 'departure_arrival_time_convenient', 'departure_delay_in_minutes',
           'arrival_delay_in_minutes']
    
label = ['label']    
    
# Training split: feature matrix
x_train = df_train.loc[:, features]

# ravel() flattens the single-column label block into a 1-D array
y_train = df_train.loc[:, label].values.ravel()      

# Validation split: feature matrix and flattened labels
x_val = df_val.loc[:, features]

y_val = df_val.loc[:, label].values.ravel()    

### 6.2 Treinar o modelo novamente com o melhor parametro obtido

In [59]:
# Refit on train + validation with the n_neighbors value chosen on the validation set (15).
# The feature frames are stacked with pd.concat instead of np.concatenate so the model is fitted
# on a DataFrame carrying the same column names it is later asked to predict on
# (Previsoes_Dataframe_Test passes a DataFrame). scikit-learn >= 1.0 warns when an estimator
# fitted on a bare array receives named columns at predict time. Row order and values are the
# same as with np.concatenate; the labels are plain arrays, so they are still concatenated with numpy.
model_last = KNeighborsClassifier(n_neighbors=15)
model_last.fit(pd.concat([x_train, x_val]), np.concatenate((y_train, y_val)))

KNeighborsClassifier(n_neighbors=15)

### 6.3 Fazendo previsões nos dados de teste 

In [60]:
# Predict on the test set with model_last, the classifier refit on train + validation.
y_test, y_pred_test = Previsoes_Dataframe_Test(df_test, model_last)



### 6.4 Verificando as métricas do modelo com os dados de teste

In [61]:
# Test-set metrics; the 15 only labels the output column, matching model_last's n_neighbors.
df_metrics_test = Model_Metrics(y_test, y_pred_test, 15)

In [62]:
df_metrics_test

Unnamed: 0,neighbors = 15
accuracy_score,0.673
precision_score,0.644
recall_score,0.57
f1_score,0.605


# 7. Resguardando os objetos com as métricas de maior performance

### 7.1 Alterando o nome das colunas dos dataframes

#### a) dataframe teste com melhores métricas

In [63]:
# Relabel the single metrics column with the model name instead of the hyperparameter value.
df_metrics_test = df_metrics_test.rename(columns = {"neighbors = 15": "KNN Classifier"})

In [64]:
df_metrics_test

Unnamed: 0,KNN Classifier
accuracy_score,0.673
precision_score,0.644
recall_score,0.57
f1_score,0.605


#### b) dataframe validação com melhores métricas

In [65]:
# Relabel the single metrics column with the model name instead of the hyperparameter value.
df_val_neighbors15 = df_val_neighbors15.rename(columns = {"neighbors = 15": "KNN Classifier"})

In [66]:
df_val_neighbors15

Unnamed: 0,KNN Classifier
accuracy_score,0.674
precision_score,0.637
recall_score,0.577
f1_score,0.606


#### c) dataframe treinamento com melhores métricas

In [67]:
# Relabel the single metrics column with the model name instead of the hyperparameter value.
# NOTE(review): df_default_train holds the training metrics of the n_neighbors = 5 model, while the
# validation and test frames saved alongside it come from n_neighbors = 15 — confirm this mix is
# intended before comparing the three saved frames.
df_default_train = df_default_train.rename(columns={"neighbors = 5": "KNN Classifier"})

In [68]:
df_default_train

Unnamed: 0,KNN Classifier
accuracy_score,0.761
precision_score,0.739
recall_score,0.696
f1_score,0.716


### 7.2 Salvar os objetos com as melhores metricas em um arquivo pickle

### a) dataframe com métricas dos dados de treinamento

In [69]:
# Persist the training-metrics frame with pickle, as a path relative to the working directory.
# The handle name arquivo_knn_train is referenced again by the following cell.
with open('arquivo_knn_train.pkl', 'wb') as arquivo_knn_train:
    pickle.dump(df_default_train, arquivo_knn_train)  

In [70]:
# Redundant: the `with` block in the previous cell already closed this handle on exit, and
# calling close() on an already-closed file is a no-op.
arquivo_knn_train.close()

### b) dataframe com métricas dos dados de validação

In [71]:
# Persist the validation-metrics frame (n_neighbors = 15) with pickle, as a path relative to the
# working directory. The handle name arquivo_knn_val is referenced again by the following cell.
with open('arquivo_knn_val.pkl', 'wb') as arquivo_knn_val:
    pickle.dump(df_val_neighbors15, arquivo_knn_val)

In [72]:
# Redundant: the `with` block in the previous cell already closed this handle on exit, and
# calling close() on an already-closed file is a no-op.
arquivo_knn_val.close()

### c) dataframe com métricas dos dados de teste

In [73]:
# Persist the test-metrics frame with pickle, as a path relative to the working directory.
# The handle name arquivo_knn_teste is referenced again by the following cell.
with open('arquivo_knn_teste.pkl', 'wb') as arquivo_knn_teste:
       pickle.dump(df_metrics_test, arquivo_knn_teste)

In [74]:
# Redundant: the `with` block in the previous cell already closed this handle on exit, and
# calling close() on an already-closed file is a no-op.
arquivo_knn_teste.close()