# Trabalho Final - Machine Learning - Bootcamp ENAP 2020

## Teste com recorte de dados (coluna Dest com valor LAX) e uso dos algoritmos de regressão linear, ElasticNet e Lasso.

## Apresentação

Construção de modelo de regressão linear que pode ser utilizado para previsão de gastos com passagens aéreas nos diversos órgãos da administração pública.

## Equipe

Cristhiano Mello - Justiça Federal do Paraná

Edson Dario Silva de França - Departamento Penitenciário Nacional (DEPEN/MJSP)

Eluzaí Souza dos Santos - Tribunal Regional Federal da 1ª Região

Rafael Ventura da Silva - Polícia Rodoviária Federal

## Importação de bibliotecas

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

# importa o modelo de regressão linear
from sklearn.linear_model import LinearRegression

# Normalização das variáveis
from sklearn.preprocessing import StandardScaler

# Variáveis categóricas
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

# Análise exploratória dos dados

### Carregamento dos dados

In [None]:
# Mount Google Drive in the Colab runtime so files stored there can be read
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the dataset CSV on the mounted Drive (the file name indicates
# the rows were pre-filtered to Dest == LAX).
path = '/content/drive/MyDrive/Cursos/BootCamp/ML/Passagens/Dados/Filtered-Dest-LAX.csv'
# Load the dataset into a DataFrame.
df_pass = pd.read_csv(path)

In [None]:
# Display the loaded DataFrame.
df_pass

Unnamed: 0,Quarter,Origin,Dest,Miles,ContiguousUSA,NumTicketsOrdered,AirlineCompany,PricePerTicket
0,1,PHL,LAX,2402.0,2,1.0,AA,672.87
1,1,PHL,LAX,2402.0,2,1.0,AA,367.68
2,1,PHL,LAX,2402.0,2,1.0,AA,417.94
3,1,PHL,LAX,2402.0,2,1.0,AA,247.10
4,1,PHL,LAX,2402.0,2,1.0,AA,276.35
...,...,...,...,...,...,...,...,...
413852,4,SMF,LAX,373.0,2,1.0,WN,218.40
413853,4,ATL,LAX,1947.0,2,1.0,WN,176.23
413854,4,LIT,LAX,1494.0,2,1.0,WN,223.00
413855,4,LIT,LAX,1494.0,2,1.0,WN,376.96


In [None]:
# df_pass.drop(columns=['MktCoupons','OriginWac', 'DestWac','ContiguousUSA'], inplace=True)

In [None]:
# Excluir as duas primeiras colunas
# df_pass.drop(columns=['Unnamed: 0','ItinID', 'MktID'], inplace=True) # Pode ser usado como filtro, para linhas e colunas.

# del df_pass['MktID'] # Mais rápido. Somente para coluna. 

## Preparação dos dados

In [None]:
# Categorical features: these columns are one-hot encoded by the
# preprocessing step.
feature_cols = ['Quarter', 'Origin', 'Dest', 'ContiguousUSA', 'AirlineCompany']
# Numeric features to standardize. They are selected by name (previously by
# the positions 3 and 5 of the feature matrix) so the selection does not
# silently point at other columns if the column order of the CSV changes.
feature_cols_padroniza = ['Miles', 'NumTicketsOrdered']

In [None]:
# Target: the ticket price the models are trained to predict.
y = df_pass['PricePerTicket']
# Features: every column except the target. The target is dropped by name
# (instead of slicing off the last column by position) so it cannot leak into
# X if the column order of the CSV ever changes.
X = df_pass.drop(columns='PricePerTicket')

In [None]:
# Inspect the dimensions of the feature matrix and of the target, one per line.
print(X.shape, y.shape, sep='\n')

(413857, 7)
(413857,)


In [None]:
# Preprocessing pipeline applied column-wise:
#   - 'category': one-hot encodes the categorical columns; categories not seen
#     during fit are ignored at transform time instead of raising.
#   - 'integer': standardizes the numeric columns.
# Any column not listed in either group is passed through unchanged.
preprocessor_cat = ColumnTransformer(
    [
        ('category', OneHotEncoder(handle_unknown='ignore'), feature_cols),
        ('integer', StandardScaler(), feature_cols_padroniza),
    ],
    remainder='passthrough',
)

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Show the training features as the cell output.
X_train

Unnamed: 0,Quarter,Origin,Dest,Miles,ContiguousUSA,NumTicketsOrdered,AirlineCompany
33720,1,MCO,LAX,2218.0,2,6.0,B6
74124,1,OGG,LAX,2486.0,1,2.0,HA
105644,2,BWI,LAX,2329.0,2,2.0,NK
147111,2,MSY,LAX,1670.0,2,7.0,WN
175034,2,OGG,LAX,2486.0,1,1.0,AA
...,...,...,...,...,...,...,...
259178,3,PDX,LAX,834.0,2,1.0,DL
365838,4,PIT,LAX,2136.0,2,1.0,NK
131932,2,PDX,LAX,834.0,2,1.0,AS
146867,2,MKE,LAX,1756.0,2,1.0,WN


In [None]:
# Fit the preprocessor on the training features only and replace X_train with
# the transformed matrix.
# NOTE(review): X_train is overwritten here, so re-running this cell without
# re-running the train/test split feeds already-transformed data back in.
X_train = preprocessor_cat.fit_transform(X_train)

In [None]:
# Display the transformed training matrix.
X_train

<331085x143 sparse matrix of type '<class 'numpy.float64'>'
	with 2317595 stored elements in Compressed Sparse Row format>

In [None]:
# Apply the preprocessor that was fitted on the training data to the test
# features (transform only, no refitting).
X_test = preprocessor_cat.transform(X_test)

In [None]:
# Dimensions of the transformed training data
X_train.shape

(331085, 143)

In [None]:
# Dimensions of the transformed test data
X_test.shape

(82772, 143)

## Treino do modelo

In [None]:
# Instantiate the linear regression estimator with default settings
lr = LinearRegression()

In [None]:
# Train the model on the transformed training data. scikit-learn's fit()
# returns the estimator itself, so `modelo` refers to the same object as `lr`.
modelo = lr.fit(X_train, y_train)

## Testar o modelo (inferência)

In [None]:
# Predict ticket prices for the held-out test features with the fitted linear
# regression. `metrics` is already imported in the notebook's import cell, so
# it is not imported again here.
y_predict = lr.predict(X_test)

## Avaliar qualidade do modelo

In [None]:
# Coefficient of determination (R²) of the linear regression on the test set
metrics.r2_score(y_test, y_predict)

0.2771908131772751

In [None]:
metrics.mean_squared_error(y_test, y_predict) # Mean squared error of the linear regression on the test set

20335.659584907975

In [None]:
metrics.mean_absolute_error(y_test, y_predict) # Mean absolute error: average size of the prediction error, ignoring its sign

102.11579109112927

## ElasticNet

In [None]:
# The original cell used ElasticNet(l1_ratio=0), i.e. a pure L2 (ridge)
# penalty. scikit-learn documents very small l1_ratio values as unreliable for
# its coordinate-descent solver, and the recorded run of that cell emitted a
# solver warning. Ridge fits the same L2-penalized model with a dedicated
# solver.
from sklearn.linear_model import Ridge

# With l1_ratio=0 ElasticNet minimizes
#     1 / (2 * n_samples) * ||y - Xw||^2 + (alpha / 2) * ||w||^2
# while Ridge minimizes
#     ||y - Xw||^2 + alpha * ||w||^2
# so the equivalent Ridge strength is n_samples * alpha, with alpha = 1.0
# being the ElasticNet default the original cell relied on.
en = Ridge(alpha=1.0 * X_train.shape[0])
en.fit(X_train, y_train)


  max_iter, tol, rng, random, positive)


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Predict ticket prices for the test features with the fitted `en` model.
y_predict_en = en.predict(X_test)

In [None]:
metrics.r2_score(y_test, y_predict_en) # R² of the `en` model on the test set; below 0.5 => the model is not performing well.

0.14522794375705306

In [None]:
metrics.mean_squared_error(y_test, y_predict_en) # Mean squared error of the `en` model on the test set

24048.32959422741

In [None]:
lr.intercept_ # Intercept of the linear regression (lr): the predicted ticket price when every transformed feature is zero.

229.57064987936027

In [None]:
# Fitted coefficients of the linear regression (lr), one per column of the
# transformed feature matrix.
lr.coef_

array([-4.23060796e+00,  1.22569927e+00, -2.39119080e+00,  5.39609948e+00,
       -9.45374008e+00, -2.65721312e+01, -6.14845603e+01, -4.93222383e+01,
       -9.30437381e+00,  3.19690584e+01,  3.60283173e+01, -1.91239268e+01,
       -1.75024809e+01,  1.55486875e+01, -1.64437422e+01,  1.57271947e+01,
        2.23544462e+01,  3.05741828e+00, -3.58493430e+01, -6.33107898e+01,
       -2.97618817e+01, -3.04357132e+01,  3.02202524e+01, -9.21366147e+00,
       -1.23219589e+01,  1.04270758e+02, -1.84503931e+01,  2.36862270e+01,
        1.69365876e+02, -1.05500223e+00, -1.15266157e+01, -2.25704652e+00,
       -2.77741269e+01,  1.09803002e+01, -8.22724329e-01,  6.13099304e+01,
        7.62885032e+01,  1.44723968e+02, -8.51093523e+00,  2.07199090e+01,
        2.34917650e+01, -1.19899570e+01, -1.45030332e+01, -5.90324040e+00,
       -1.41998163e+01, -2.24177828e+01, -2.51411400e+00, -5.81499899e+01,
       -2.52144744e+01,  2.84139467e+01,  6.46023984e+00, -6.20950857e+00,
        2.52231828e+01,  

In [None]:
# Fazer predição


## Lasso Model

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Fit a Lasso regression with default settings on the training data. fit()
# returns the estimator itself, so `tst_lasso` and `lasso_fit` are two names
# for the same fitted model.
lasso_fit = tst_lasso = Lasso().fit(X_train, y_train)

In [None]:
# Predict ticket prices for the test features with the fitted Lasso model.
y_predict_lasso = lasso_fit.predict(X_test)

In [None]:
# Coefficient of determination (R²) of the Lasso model on the test set
metrics.r2_score(y_test, y_predict_lasso)

0.24874561545386265

In [None]:
# Mean squared error of the Lasso model on the test set
metrics.mean_squared_error(y_test, y_predict_lasso)

21135.94252025836

In [None]:
# Mean absolute error of the Lasso model on the test set
metrics.mean_absolute_error(y_test, y_predict_lasso)

104.33469280846637