# 1. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics as mt
from sklearn import linear_model as lm
import pickle

# 2. Função para carregar os datasets

In [2]:
def Load_Data_Set(df_x_train, df_y_train):
    """Read a features CSV and a target CSV into two DataFrames.

    Despite their names, both arguments are file paths (anything
    ``pd.read_csv`` accepts), not DataFrames.

    Args:
        df_x_train: path of the CSV holding the feature columns.
        df_y_train: path of the CSV holding the target column.

    Returns:
        Tuple ``(features_frame, target_frame)``, in that order.
    """
    features_frame, target_frame = (
        pd.read_csv(csv_path) for csv_path in (df_x_train, df_y_train)
    )
    return features_frame, target_frame

# 3. Criando e concatenando os dataframes

### 3.1 Dataset de treinamento

In [3]:
# Relative path (a string, not a DataFrame) to the training features CSV.
df_x_train = "../2_ensaio_regressao/1_dados_treinamento/X_training.csv"

In [4]:
# Relative path (a string, not a DataFrame) to the training target CSV.
df_y_train = "../2_ensaio_regressao/1_dados_treinamento/y_training.csv"

In [5]:
# Load training features (df1_train) and training target (df2_train) from the two CSV paths.
df1_train, df2_train = Load_Data_Set(df_x_train, df_y_train)

In [6]:
# Attach the target to the feature frame as a 'label' column (mutates df1_train in place).
# NOTE(review): assumes df2_train holds exactly one column, row-aligned with df1_train — confirm.
df1_train['label'] = df2_train

In [7]:
# Work on a separate copy so df1_train keeps the frame as loaded (plus the 'label' column).
df_train = df1_train.copy()

### 3.2 Dataset de validação

In [8]:
# Relative path (a string, not a DataFrame) to the validation features CSV.
df_x_val = "../2_ensaio_regressao/2_dados_validacao/X_validation.csv"

In [9]:
# Relative path (a string, not a DataFrame) to the validation target CSV.
df_y_val = "../2_ensaio_regressao/2_dados_validacao/y_val.csv"

In [10]:
# Load validation features (df1_val) and validation target (df2_val) from the two CSV paths.
df1_val, df2_val = Load_Data_Set(df_x_val, df_y_val)

In [11]:
# Attach the target to the feature frame as a 'label' column (mutates df1_val in place).
# NOTE(review): assumes df2_val holds exactly one column, row-aligned with df1_val — confirm.
df1_val['label'] = df2_val

In [12]:
# Work on a separate copy so df1_val keeps the frame as loaded (plus the 'label' column).
df_val = df1_val.copy()

### 3.3 Dataset de teste

In [13]:
# Relative path (a string, not a DataFrame) to the test features CSV.
df_x_test = "../2_ensaio_regressao/3_dados_teste/X_test.csv"

In [14]:
# Relative path (a string, not a DataFrame) to the test target CSV.
df_y_test = "../2_ensaio_regressao/3_dados_teste/y_test.csv"

In [15]:
# Load test features (df1_test) and test target (df2_test) from the two CSV paths.
df1_test, df2_test = Load_Data_Set(df_x_test, df_y_test)

In [16]:
# Attach the target to the feature frame as a 'label' column (mutates df1_test in place).
# NOTE(review): assumes df2_test holds exactly one column, row-aligned with df1_test — confirm.
df1_test['label'] = df2_test

In [17]:
# Work on a separate copy so df1_test keeps the frame as loaded (plus the 'label' column).
df_test = df1_test.copy()

# 4. Funções para treinar o modelo e avaliar as métricas

In [18]:
def Model_Training(df_train, alpha, max_iter):
    """Fit a Ridge regressor on the training frame and predict on that same data.

    Args:
        df_train: training DataFrame containing the feature columns listed
            below plus a 'label' column with the target.
        alpha: value forwarded to ``lm.Ridge`` as ``alpha``.
        max_iter: value forwarded to ``lm.Ridge`` as ``max_iter``.

    Returns:
        Tuple ``(y_train, y_pred_train, ridge)``: the flattened training
        targets, the predictions made on the training features, and the
        fitted Ridge estimator.
    """
    feature_cols = [
        'song_duration_ms', 'acousticness', 'danceability', 'energy',
        'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode',
        'speechiness', 'tempo', 'time_signature', 'audio_valence',
    ]
    target_cols = ['label']

    # Feature matrix used both for fitting and for the in-sample predictions.
    inputs = df_train[feature_cols]

    # ravel() flattens the single-column (n, 1) block into a 1-D array.
    targets = df_train[target_cols].values.ravel()

    # Train the Ridge model (not Lasso) with the requested hyperparameters.
    ridge = lm.Ridge(alpha=alpha, max_iter=max_iter)
    ridge.fit(inputs, targets)

    # In-sample predictions on the very data the model was fitted on.
    in_sample_pred = ridge.predict(inputs)

    return targets, in_sample_pred, ridge
   


In [19]:
def Previsoes_Dataframe_Test(df_test, ridge):
    """Predict on the test frame with an already-fitted model.

    Args:
        df_test: test DataFrame containing the feature columns listed below
            plus a 'label' column with the target.
        ridge: fitted estimator; only its ``predict`` method is used.

    Returns:
        Tuple ``(y_test, y_pred_test)``: the flattened targets taken from
        ``df_test`` and the model's predictions for its feature columns.
    """
    feature_cols = [
        'song_duration_ms', 'acousticness', 'danceability', 'energy',
        'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode',
        'speechiness', 'tempo', 'time_signature', 'audio_valence',
    ]
    target_cols = ['label']

    # Split the frame into model inputs and the flattened 1-D target.
    inputs = df_test[feature_cols]
    targets = df_test[target_cols].values.ravel()

    return targets, ridge.predict(inputs)
       

In [20]:
def Previsoes_Dataframe_Val(df_val, ridge):
    """Predict on the validation frame with an already-fitted model.

    Args:
        df_val: validation DataFrame containing the feature columns listed
            below plus a 'label' column with the target.
        ridge: fitted estimator; only its ``predict`` method is used.

    Returns:
        Tuple ``(y_val, y_pred_val)``: the flattened targets taken from
        ``df_val`` and the model's predictions for its feature columns.
    """
    feature_cols = [
        'song_duration_ms', 'acousticness', 'danceability', 'energy',
        'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode',
        'speechiness', 'tempo', 'time_signature', 'audio_valence',
    ]
    target_cols = ['label']

    # Split the frame into model inputs and the flattened 1-D target.
    inputs = df_val[feature_cols]
    targets = df_val[target_cols].values.ravel()

    return targets, ridge.predict(inputs)

In [21]:
def Model_Metrics(y, y_pred, alpha, max_iter):
    """Build a one-column DataFrame with the regression metrics of a run.

    Args:
        y: true target values of the dataset being evaluated.
        y_pred: predictions made for those same rows.
        alpha: alpha used to train the model; only used to label the column.
        max_iter: max_iter used to train the model; only used to label the column.

    Returns:
        DataFrame indexed by 'r2_score', 'mse', 'rmse', 'mae', 'mape' with a
        single column named ``'alpha = <alpha> ;max_iter = <max_iter>'``.
        Every value is rounded to 4 decimals.
    """
    # Keep the unrounded MSE so the RMSE is derived from the exact value and
    # rounded only once; taking the square root of an already-rounded MSE
    # compounds the rounding error.
    mse_exact = mt.mean_squared_error(y, y_pred)

    scores = [
        mt.r2_score(y, y_pred),
        mse_exact,
        np.sqrt(mse_exact),
        mt.mean_absolute_error(y, y_pred),
        # Stored exactly as scikit-learn returns it (a ratio, not multiplied by 100).
        mt.mean_absolute_percentage_error(y, y_pred),
    ]

    # Column label identifying the hyperparameters; later cells rename by this exact text.
    parameters = 'alpha = ' + str(alpha) + ' ;max_iter = ' + str(max_iter)

    df_metrics = pd.DataFrame(
        data={parameters: [np.round(score, 4) for score in scores]},
        index=['r2_score', 'mse', 'rmse', 'mae', 'mape'],
    )

    return df_metrics

# 5. Treinamento do modelo, validação e verificação de performance

### 5.1 Treinar o modelo com os valores de parâmetro default (alpha = 1, max_iter = None) e verificar as métricas do modelo sobre os dados de treinamento

In [22]:
# Baseline run: Ridge with alpha = 1 and max_iter = None, scored on the training data itself.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=1, max_iter=None)
df_default_train = Model_Metrics(y_train, y_pred_train, alpha=1, max_iter=None)
df_default_train

Unnamed: 0,alpha = 1 ;max_iter = None
r2_score,0.0461
mse,455.9964
rmse,21.3541
mae,16.9983
mape,8.6534


### 5.2 Testar o algoritmo com o parâmetro default nos dados de validação e verificar sua performance

In [23]:
# Score the model currently held in `ridge` on the validation data.
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_default_val = Model_Metrics(y_val, y_pred_val, alpha=1, max_iter=None)
df_default_val

Unnamed: 0,alpha = 1 ;max_iter = None
r2_score,0.0399
mse,458.4455
rmse,21.4113
mae,17.0395
mape,8.6824


### 5.3 Treinar o modelo nos dados de treinamento alterando os parametros e testar a performance desse modelo sobre os dados de validação

### a) alpha = 5, max_iter = 1000

In [24]:
# Experiment a): refit with alpha = 5, max_iter = 1000 and score on validation.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=5, max_iter=1000)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val1 = Model_Metrics(y_val, y_pred_val, alpha=5, max_iter=1000)
df_val1

Unnamed: 0,alpha = 5 ;max_iter = 1000
r2_score,0.0399
mse,458.4415
rmse,21.4112
mae,17.0385
mape,8.6819


### b) alpha = 100, max_iter = 1000

In [25]:
# Experiment b): refit with alpha = 100, max_iter = 1000 and score on validation.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=100, max_iter=1000)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val2 = Model_Metrics(y_val, y_pred_val, alpha=100, max_iter=1000)
df_val2

Unnamed: 0,alpha = 100 ;max_iter = 1000
r2_score,0.0389
mse,458.9456
rmse,21.423
mae,17.0346
mape,8.6749


### c) alpha = 100, max_iter = 2000

In [26]:
# Experiment c): refit with alpha = 100, max_iter = 2000 and score on validation.
# NOTE(review): the recorded output matches the alpha = 100, max_iter = 1000 run
# digit for digit, so max_iter appears to have no effect here — confirm.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=100, max_iter=2000)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val3 = Model_Metrics(y_val, y_pred_val, alpha=100, max_iter=2000)
df_val3

Unnamed: 0,alpha = 100 ;max_iter = 2000
r2_score,0.0389
mse,458.9456
rmse,21.423
mae,17.0346
mape,8.6749


### d) alpha = 500, max_iter = 100

In [27]:
# Experiment d): refit with alpha = 500, max_iter = 100 and score on validation.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=500, max_iter=100)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val4 = Model_Metrics(y_val, y_pred_val, alpha=500, max_iter=100)
df_val4

Unnamed: 0,alpha = 500 ;max_iter = 100
r2_score,0.0312
mse,462.62
rmse,21.5086
mae,17.0797
mape,8.673


### e) alpha = 3, max_iter = 20

In [28]:
# Experiment e): refit with alpha = 3, max_iter = 20 and score on validation.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=3, max_iter=20)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val5 = Model_Metrics(y_val, y_pred_val, alpha=3, max_iter=20)
df_val5

Unnamed: 0,alpha = 3 ;max_iter = 20
r2_score,0.0399
mse,458.4431
rmse,21.4113
mae,17.039
mape,8.6822


### f) alpha = 1, max_iter = 20

In [29]:
# Experiment f): refit with alpha = 1, max_iter = 20 and score on validation.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=1, max_iter=20)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val6 = Model_Metrics(y_val, y_pred_val, alpha=1, max_iter=20)
df_val6

Unnamed: 0,alpha = 1 ;max_iter = 20
r2_score,0.0399
mse,458.4455
rmse,21.4113
mae,17.0395
mape,8.6824


### g) alpha = 0.4, max_iter = 20

In [30]:
# Experiment g): refit with alpha = 0.4, max_iter = 20 and score on validation.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=0.4, max_iter=20)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val7 = Model_Metrics(y_val, y_pred_val, alpha=0.4, max_iter=20)
df_val7

Unnamed: 0,alpha = 0.4 ;max_iter = 20
r2_score,0.0399
mse,458.4464
rmse,21.4114
mae,17.0396
mape,8.6825


### h) alpha = 0.2, max_iter = 15

In [31]:
# Experiment h): refit with alpha = 0.2, max_iter = 15 and score on validation.
y_train, y_pred_train, ridge = Model_Training(df_train, alpha=0.2, max_iter=15)
y_val, y_pred_val = Previsoes_Dataframe_Val(df_val, ridge=ridge)
df_val8 = Model_Metrics(y_val, y_pred_val, alpha=0.2, max_iter=15)
df_val8

Unnamed: 0,alpha = 0.2 ;max_iter = 15
r2_score,0.0399
mse,458.4467
rmse,21.4114
mae,17.0397
mape,8.6825


### 5.4 Verificar o parâmetro que obteve os melhores resultados nos dados de validação

In [32]:
# Place every validation run side by side: one column per hyperparameter
# setting, one row per metric.
df_metrics_val = pd.concat(
    [
        df_default_val,
        df_val1,
        df_val2,
        df_val3,
        df_val4,
        df_val5,
        df_val6,
        df_val7,
        df_val8,
    ],
    axis="columns",
)
df_metrics_val

Unnamed: 0,alpha = 1 ;max_iter = None,alpha = 5 ;max_iter = 1000,alpha = 100 ;max_iter = 1000,alpha = 100 ;max_iter = 2000,alpha = 500 ;max_iter = 100,alpha = 3 ;max_iter = 20,alpha = 1 ;max_iter = 20,alpha = 0.4 ;max_iter = 20,alpha = 0.2 ;max_iter = 15
r2_score,0.0399,0.0399,0.0389,0.0389,0.0312,0.0399,0.0399,0.0399,0.0399
mse,458.4455,458.4415,458.9456,458.9456,462.62,458.4431,458.4455,458.4464,458.4467
rmse,21.4113,21.4112,21.423,21.423,21.5086,21.4113,21.4113,21.4114,21.4114
mae,17.0395,17.0385,17.0346,17.0346,17.0797,17.039,17.0395,17.0396,17.0397
mape,8.6824,8.6819,8.6749,8.6749,8.673,8.6822,8.6824,8.6825,8.6825


# 6. Unificar os dados de treinamento e validação e retreinar o algoritmo, utilizando os melhores valores para os parâmetros

### 6.1 Dividir novamente os datasets de treinamento e validação

In [33]:
# Column layout shared by the training and validation frames.
features = [
    'song_duration_ms', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode',
    'speechiness', 'tempo', 'time_signature', 'audio_valence',
]
label = ['label']

# Training split: feature frame plus the target flattened to a 1-D array.
x_train = df_train[features]
y_train = df_train[label].values.ravel()

# Validation split, built the same way.
x_val = df_val[features]
y_val = df_val[label].values.ravel()

### 6.2 Treinar o modelo novamente com o melhor parâmetro obtido

In [34]:
# Final model: refit on training + validation data with alpha = 1, max_iter = None.
# NOTE(review): in the validation table alpha = 5 shows a marginally lower mse
# (458.4415) than alpha = 1 (458.4455) — confirm alpha = 1 is the intended choice.
model_last = lm.Ridge(alpha = 1, max_iter = None)

# Stack the feature frames with pd.concat (not np.concatenate) so the column
# names survive. The model is later asked to predict on a DataFrame, and
# scikit-learn warns when feature names are present at predict time but were
# absent at fit time.
model_last.fit(
    pd.concat([x_train, x_val], ignore_index=True),
    np.concatenate((y_train, y_val)),
)

Ridge(alpha=1)

### 6.3 Fazendo previsões nos dados de teste 

In [35]:
# Predict on the held-out test frame with the model refit on training + validation data.
y_test, y_pred_test = Previsoes_Dataframe_Test(df_test, model_last)



### 6.4 Verificando as métricas do modelo com os dados de teste

In [36]:
# Metrics of the final model on the test data, labelled with its hyperparameters.
df_metrics_test = Model_Metrics(y_test, y_pred_test, alpha=1, max_iter=None)
df_metrics_test

Unnamed: 0,alpha = 1 ;max_iter = None
r2_score,0.0512
mse,461.9878
rmse,21.4939
mae,17.144
mape,8.532


# 7. Resguardando os objetos com as métricas de maior performance

### 7.1 Alterando o nome das colunas dos dataframes

#### a) dataframe teste com melhores métricas

In [37]:
# Swap the hyperparameter column label for the model's display name.
df_metrics_test = df_metrics_test.rename(
    columns={
        "alpha = 1 ;max_iter = None": "Linear Regression Ridge",
    }
)
df_metrics_test

Unnamed: 0,Linear Regression Ridge
r2_score,0.0512
mse,461.9878
rmse,21.4939
mae,17.144
mape,8.532


#### b) dataframe validação com melhores métricas

In [38]:
# Swap the hyperparameter column label for the model's display name.
df_default_val = df_default_val.rename(
    columns={
        "alpha = 1 ;max_iter = None": "Linear Regression Ridge",
    }
)
df_default_val

Unnamed: 0,Linear Regression Ridge
r2_score,0.0399
mse,458.4455
rmse,21.4113
mae,17.0395
mape,8.6824


#### c) dataframe treinamento com melhores métricas

In [39]:
# Swap the hyperparameter column label for the model's display name.
df_default_train = df_default_train.rename(
    columns={
        "alpha = 1 ;max_iter = None": "Linear Regression Ridge",
    }
)
df_default_train

Unnamed: 0,Linear Regression Ridge
r2_score,0.0461
mse,455.9964
rmse,21.3541
mae,16.9983
mape,8.6534


# 8. Salvar os objetos com as melhores métricas em um arquivo pickle

### a) dataframe com métricas dos dados de treinamento

In [40]:
# Serialize the training-metrics DataFrame to 'arquivo_ridge_train.pkl' in the
# current working directory; the `with` block closes the file on exit.
with open('arquivo_ridge_train.pkl', 'wb') as arquivo_ridge_train:
    pickle.dump(df_default_train,arquivo_ridge_train)

In [41]:
# Redundant: the `with` block that opened this handle already closed it on exit,
# so this second close() is a harmless no-op.
arquivo_ridge_train.close()

### b) dataframe com métricas dos dados de validação

In [42]:
# Serialize the validation-metrics DataFrame to 'arquivo_ridge_val.pkl' in the
# current working directory; the `with` block closes the file on exit.
with open('arquivo_ridge_val.pkl', 'wb') as arquivo_ridge_val:
    pickle.dump(df_default_val,arquivo_ridge_val)

In [43]:
# Redundant: the `with` block that opened this handle already closed it on exit,
# so this second close() is a harmless no-op.
arquivo_ridge_val.close()

### c) dataframe com métricas dos dados de teste

In [44]:
# Serialize the test-metrics DataFrame to 'arquivo_ridge_test.pkl' in the
# current working directory; the `with` block closes the file on exit.
with open('arquivo_ridge_test.pkl', 'wb') as arquivo_ridge_test:
    pickle.dump(df_metrics_test,arquivo_ridge_test)

In [45]:
# Redundant: the `with` block that opened this handle already closed it on exit,
# so this second close() is a harmless no-op.
arquivo_ridge_test.close()