# Trabalho Final - Machine Learning - Bootcamp ENAP 2020

## Teste com recorte de dados (coluna AirlineCompany com valor DL) e uso dos algoritmos LinearRegression, ElasticNet e Lasso

## Apresentação

Construção de modelo de regressão linear que pode ser utilizado para previsão de gastos com passagens aéreas nos diversos órgãos da administração pública.

## Equipe

Cristhiano Mello - Justiça Federal do Paraná

Edson Dario Silva de França - Departamento Penitenciário Nacional (DEPEN/MJSP)

Eluzaí Souza dos Santos - Tribunal Regional Federal da 1ª Região

Rafael Ventura da Silva - Polícia Rodoviária Federal

## Importação de bibliotecas

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

# importa o modelo de regressão linear
from sklearn.linear_model import LinearRegression

# Normalização das variáveis
from sklearn.preprocessing import StandardScaler

# Variáveis categóricas
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import ElasticNet

# Análise exploratória dos dados

### Carregamento dos dados

In [None]:
# Mount Google Drive in the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the input CSV on the mounted Drive.
path = '/content/drive/MyDrive/Cursos/BootCamp/ML/Passagens/Dados/Filtered-DL.csv'
# Alternative location, kept for reference:
# path = '/content/drive/MyDrive/dados/Filtered-DL.csv'
# Load the whole file into a DataFrame.
df_pass = pd.read_csv(path)

In [None]:
# Display the loaded DataFrame.
df_pass

Unnamed: 0,Quarter,Origin,Dest,Miles,ContiguousUSA,NumTicketsOrdered,AirlineCompany,PricePerTicket
0,1,JFK,DEN,1626.0,2,1.0,DL,52.0
1,1,JFK,DEN,1626.0,2,6.0,DL,78.0
2,1,JFK,DEN,1626.0,2,2.0,DL,83.0
3,1,JFK,DEN,1626.0,2,13.0,DL,88.0
4,1,JFK,DEN,1626.0,2,1.0,DL,92.0
...,...,...,...,...,...,...,...,...
1771434,4,CVG,BOS,752.0,2,1.0,DL,275.0
1771435,4,BOS,CVG,752.0,2,1.0,DL,279.0
1771436,4,BOS,CVG,752.0,2,1.0,DL,279.5
1771437,4,BOS,CVG,752.0,2,1.0,DL,279.5


In [None]:
# Disabled column drop, kept for reference.
# df_pass.drop(columns=['MktCoupons','OriginWac', 'DestWac','ContiguousUSA'], inplace=True)

In [None]:
# Disabled: drop the leading columns (note that the call below lists three of them).
# df_pass.drop(columns=['Unnamed: 0','ItinID', 'MktID'], inplace=True) # drop() can be used as a filter, for rows and columns.

# del df_pass['MktID'] # Faster, but works for columns only.

## Preparação dos dados

In [None]:
# Categorical columns: these are one-hot encoded by the preprocessing step.
feature_cols = ['Quarter', 'Origin', 'Dest', 'ContiguousUSA', 'AirlineCompany']
# Numeric columns to standardize. They are selected by name (previously by the
# positions 3 and 5 of X) so the selection does not silently shift to other
# columns if the column order of the input file changes.
feature_cols_padroniza = ['Miles', 'NumTicketsOrdered']

In [None]:
# Target: the ticket price column.
y = df_pass['PricePerTicket']
# Features: every column except the last one.
n_features = df_pass.shape[1] - 1
X = df_pass.iloc[:, :n_features]

In [None]:
# Show the dimensions of the feature matrix and of the target, one per line.
print(X.shape, y.shape, sep='\n')

(1771439, 7)
(1771439,)


In [None]:
# Preprocessing: one-hot encode the columns in feature_cols (categories not
# seen during fit are ignored at transform time) and standardize the columns
# in feature_cols_padroniza; any other column is passed through unchanged.
preprocessor_cat = ColumnTransformer(
    [
        ('category', OneHotEncoder(handle_unknown='ignore'), feature_cols),
        ('integer', StandardScaler(), feature_cols_padroniza),
    ],
    remainder='passthrough',
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Display the training features.
X_train

Unnamed: 0,Quarter,Origin,Dest,Miles,ContiguousUSA,NumTicketsOrdered,AirlineCompany
1256955,3,ORD,ATL,606.0,2,3.0,DL
635810,2,MSP,PHX,1276.0,2,1.0,DL
1125104,3,PIT,ATL,526.0,2,1.0,DL
1428930,4,MSP,MCO,1310.0,2,1.0,DL
516493,2,JFK,MIA,1089.0,2,1.0,DL
...,...,...,...,...,...,...,...
259178,1,MCI,ATL,692.0,2,1.0,DL
1414414,4,MSP,DCA,931.0,2,1.0,DL
131932,1,ATL,ORD,606.0,2,1.0,DL
671155,2,DEN,ATL,1199.0,2,4.0,DL


In [None]:
# Fit the preprocessor on the training features and replace them with the
# transformed matrix.
X_train = preprocessor_cat.fit_transform(X_train)

In [None]:
# Display the transformed training matrix.
X_train

<1417151x322 sparse matrix of type '<class 'numpy.float64'>'
	with 9920057 stored elements in Compressed Sparse Row format>

In [None]:
# Transform the test features with the preprocessor already fitted on the
# training data (no refit).
X_test = preprocessor_cat.transform(X_test)

In [None]:
# Dimensions of the training data
X_train.shape

(1417151, 322)

In [None]:
# Dimensions of the test data
X_test.shape

(354288, 322)

## Treino do modelo

In [None]:
# Instantiate the linear regression model with its default settings.
lr = LinearRegression()

In [None]:
# Train the model on the transformed training data. `modelo` is bound to the
# value returned by fit() (in scikit-learn, the fitted estimator itself).
modelo = lr.fit(X_train, y_train)

## Testar o modelo (inferência)

In [None]:
from sklearn import metrics
# Predict ticket prices for the test set with the linear regression.
y_predict = lr.predict(X_test)
# lr.score(X_train, y_train)

## Avaliar qualidade do modelo

In [None]:
metrics.r2_score(y_test, y_predict) # R² on the test set: higher is better.

0.1848827660290917

In [None]:
metrics.mean_squared_error(y_test, y_predict) # Mean squared error: lower is better (when comparing models).

17059.721983409363

In [None]:
metrics.mean_absolute_error(y_test, y_predict) # Mean absolute error: lower is better.

98.10098170066462

## ElasticNet

In [None]:
# ElasticNet with l1_ratio=0, i.e. only the L2 part of the penalty is active;
# every other hyperparameter keeps its default.
# NOTE(review): the recorded output of this cell looks like the tail of a
# solver warning — confirm whether the fit actually converged.
en = ElasticNet(l1_ratio=0)
en.fit(X_train, y_train)

  max_iter, tol, rng, random, positive)


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Predict ticket prices for the test set with the ElasticNet model.
y_predict_en = en.predict(X_test)

In [None]:
metrics.r2_score(y_test, y_predict_en) # R² of the ElasticNet model; below 0.5 the fit is considered poor.

0.1051983784126812

In [None]:
metrics.mean_squared_error(y_test, y_predict_en) # Mean squared error of the ElasticNet model.

18727.44957215362

In [None]:
# Mean absolute error of the ElasticNet predictions (lower is better).
# Scores y_predict_en so this metric matches the R² and MSE cells of the
# ElasticNet section instead of re-reporting the LinearRegression error.
metrics.mean_absolute_error(y_test, y_predict_en)

In [None]:
lr.intercept_ # Intercept of the linear regression: the predicted price when every transformed feature is zero.

241.2832119555996

In [None]:
# Coefficients learned by the linear regression.
lr.coef_

array([ 4.40396300e+00, -1.50357436e+00, -4.80435505e+00,  1.90396682e+00,
       -2.05833355e+01, -1.15322657e+01,  4.59814087e+01, -2.33728326e-01,
       -1.71714646e+01,  5.95199984e+01,  6.79567360e+00, -8.72142979e+00,
       -1.60160257e+01, -3.62131443e+01,  1.80677745e+01, -1.26301011e+01,
        9.88463417e+00,  2.30422813e+01,  2.67306880e+01,  4.20163604e+01,
        5.91621609e+00, -8.81705998e+00, -1.44382619e+01,  7.48223341e+00,
       -6.08996518e+00, -7.38688440e+00, -3.44991830e+01,  3.72681140e+01,
       -8.72160442e+00, -5.01749068e+01,  1.12480973e+01, -5.56954769e+01,
        2.23385478e+01, -7.10516031e+00, -3.95639723e+01,  4.53142915e+00,
       -3.22638249e+01, -4.25994062e+01, -7.04514539e+01,  1.26599810e+01,
       -1.22016188e+01, -4.20875199e+01, -8.93007713e+00,  2.38616849e+01,
       -3.56373635e+01, -4.16040999e+01,  1.96791054e+02,  6.02448354e+00,
        6.07780703e+01, -1.81953598e+00,  8.77873856e+01, -2.60509612e+00,
       -3.70264390e+00,  

In [None]:
# Make a prediction (placeholder: no code in this cell yet).


## Lasso Model

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Fit a Lasso regression with default hyperparameters on the training data.
# `lasso_fit` is bound to the value returned by fit().
tst_lasso = Lasso()
lasso_fit = tst_lasso.fit(X_train, y_train)

In [None]:
# Predict ticket prices for the test set with the Lasso model.
y_predict_lasso = lasso_fit.predict(X_test)

In [None]:
# R² of the Lasso model on the test set (higher is better).
metrics.r2_score(y_test, y_predict_lasso)

0.151589788568815

In [None]:
# Mean squared error of the Lasso model (lower is better).
metrics.mean_squared_error(y_test, y_predict_lasso)

17756.516157058875

In [None]:
# Mean absolute error of the Lasso model (lower is better).
metrics.mean_absolute_error(y_test, y_predict_lasso)

100.38125616943486