# Trabalho Final - Machine Learning - Bootcamp ENAP 2020

## Teste com recorte de dados (coluna Origin com valor LAX) e uso do algoritmo Lasso

## Apresentação

Construção de modelo de regressão linear que pode ser utilizado para previsão de gastos com passagens aéreas nos diversos órgãos da administração pública.

## Equipe

Cristhiano Mello - Justiça Federal do Paraná

Edson Dario Silva de França - Departamento Penitenciário Nacional (DEPEN/MJSP)

Eluzaí Souza dos Santos - Tribunal Regional Federal da 1ª Região

Rafael Ventura da Silva - Polícia Rodoviária Federal

## Importação de bibliotecas

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

# importa o modelo de regressão linear
from sklearn.linear_model import LinearRegression

# Normalização das variáveis
from sklearn.preprocessing import StandardScaler

# Variáveis categóricas
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

# Análise exploratória dos dados

### Carregamento dos dados

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset can be read
# from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the pre-filtered ticket CSV on the mounted Drive
# (per the notebook title, the subset whose Origin column is LAX).
path = '/content/drive/MyDrive/Cursos/BootCamp/ML/Passagens/Dados/Filtered-LAX.csv'
# Read the whole file into a DataFrame.
df_pass = pd.read_csv(path)

In [None]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
df_pass

Unnamed: 0,Quarter,Origin,Dest,Miles,ContiguousUSA,NumTicketsOrdered,AirlineCompany,PricePerTicket
0,1,LAX,ORD,1744.0,2,1.0,AA,200.65
1,1,LAX,PHL,2402.0,2,2.0,AA,67.00
2,1,LAX,PHL,2402.0,2,1.0,AA,67.50
3,1,LAX,PHL,2402.0,2,3.0,AA,104.00
4,1,LAX,PHL,2402.0,2,2.0,AA,109.00
...,...,...,...,...,...,...,...,...
415292,4,LAX,LGA,2469.0,2,1.0,WN,162.07
415293,4,LAX,LGA,2469.0,2,1.0,WN,282.76
415294,4,LAX,BNA,1797.0,2,1.0,WN,199.81
415295,4,LAX,SMF,373.0,2,1.0,WN,184.85


In [None]:
# df_pass.drop(columns=['MktCoupons','OriginWac', 'DestWac','ContiguousUSA'], inplace=True)

In [None]:
# Drop the leading identifier columns (currently disabled).
# df_pass.drop(columns=['Unnamed: 0','ItinID', 'MktID'], inplace=True) # drop() can be used as a filter for both rows and columns.

# del df_pass['MktID'] # Faster, but only works for a single column.

## Preparação dos dados

In [None]:
# Categorical feature names; these columns are one-hot encoded.
feature_cols = ['Quarter', 'Origin', 'Dest', 'ContiguousUSA', 'AirlineCompany']
# Numeric features to standardize. They are selected by name rather than by
# position so the choice stays correct if columns are reordered, added or
# dropped before preprocessing.
feature_cols_padroniza = ['Miles', 'NumTicketsOrdered']

In [None]:
# Features: every column except the target. The target is dropped by name so
# the selection does not depend on it being the last column of the CSV.
X = df_pass.drop(columns=['PricePerTicket'])
# Target: the ticket price the models learn to predict.
y = df_pass['PricePerTicket']

In [None]:
# Sanity-check the dimensions of the feature matrix and the target vector.
for dataset in (X, y):
    print(dataset.shape)

(415297, 7)
(415297,)


In [None]:
# Column-wise preprocessing: one-hot encode the categorical columns
# (categories unseen at fit time are ignored at transform time) and
# standardize the numeric ones. Columns not listed in either step are
# passed through unchanged.
categorical_step = ('category', OneHotEncoder(handle_unknown='ignore'), feature_cols)
numeric_step = ('integer', StandardScaler(), feature_cols_padroniza)
preprocessor_cat = ColumnTransformer(
    transformers=[categorical_step, numeric_step],
    remainder='passthrough',
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,Quarter,Origin,Dest,Miles,ContiguousUSA,NumTicketsOrdered,AirlineCompany
82780,1,LAX,FLL,2343.0,2,2.0,VX
115853,2,LAX,EWR,2454.0,2,1.0,UA
312964,4,LAX,KOA,2504.0,1,3.0,DL
293553,3,LAX,ATL,1947.0,2,13.0,NK
14130,1,LAX,TPA,2158.0,2,2.0,DL
...,...,...,...,...,...,...,...
259178,3,LAX,CMH,1995.0,2,1.0,DL
365838,4,LAX,EWR,2454.0,2,1.0,UA
131932,2,LAX,LAS,236.0,2,6.0,AS
146867,2,LAX,IAD,2288.0,2,1.0,UA


In [None]:
# Fit the preprocessing on the training split only and replace X_train with
# its transformed version.
X_train = preprocessor_cat.fit_transform(X_train)

In [None]:
# Inspect the transformed training matrix.
X_train

<332237x140 sparse matrix of type '<class 'numpy.float64'>'
	with 2325659 stored elements in Compressed Sparse Row format>

In [None]:
# Apply the already-fitted preprocessing to the test split (transform only,
# no refit).
X_test = preprocessor_cat.transform(X_test)

In [None]:
# Dimensions of the transformed training data.
X_train.shape

(332237, 140)

In [None]:
# Dimensions of the transformed test data.
X_test.shape

(83060, 140)

## Treino do modelo

In [None]:
# Instantiate a linear regression estimator with default settings.
lr = LinearRegression()

In [None]:
# Train the model on the preprocessed training split. fit() returns the
# estimator, so `modelo` is expected to be the same object as `lr`.
modelo = lr.fit(X_train, y_train)

## Testar o modelo (inferência)

In [None]:
# `metrics` is already imported in the notebook's first cell; this re-import
# is redundant but harmless.
from sklearn import metrics
# Predict ticket prices for the held-out test split.
y_predict = lr.predict(X_test)
# lr.score(X_train, y_train)

## Avaliar qualidade do modelo

In [None]:
# Coefficient of determination (R^2) of the linear regression on the test split.
metrics.r2_score(y_test, y_predict)

0.27498962715442254

In [None]:
metrics.mean_squared_error(y_test, y_predict) # Mean squared error

20496.483900771058

In [None]:
metrics.mean_absolute_error(y_test, y_predict) # Mean absolute error (error magnitude, sign ignored)

101.86989271094859

## ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet
# NOTE(review): l1_ratio=0 removes the L1 term, leaving a pure L2 penalty at
# the default alpha. The truncated message recorded under this cell looks like
# a solver warning -- confirm the fit converged, or consider an L2-only
# estimator such as sklearn's Ridge.
en = ElasticNet(l1_ratio=0)
en.fit(X_train, y_train)

  max_iter, tol, rng, random, positive)


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Predict test-split prices with the fitted ElasticNet model.
y_predict_en = en.predict(X_test)

In [None]:
metrics.r2_score(y_test, y_predict_en) # Rule of thumb used here: below 0.5 => the model is not performing well.

0.14611608582531666

In [None]:
metrics.mean_squared_error(y_test, y_predict_en) # Mean squared error

24139.81724332723

In [None]:
# Intercept of the linear regression: the predicted price when every
# transformed feature is 0. The run recorded below this cell printed about
# 232.91, not the $234.59 quoted in the original note.
lr.intercept_

232.91364011349955

In [None]:
# Fitted linear-regression coefficients (presumably one per column of the
# transformed feature matrix).
lr.coef_

array([-3.17589142e+00,  1.49861268e+00, -2.83915431e+00,  4.51643306e+00,
       -9.84837004e-10, -1.34665751e+01, -3.25317804e+01, -1.10791082e+01,
        8.77939066e+01, -8.08725909e+00,  3.37943497e+00,  3.45101860e+01,
       -2.49859062e+01, -9.29560710e+00,  5.32742462e-01, -1.49373812e+01,
        1.85011568e+01,  1.89431682e+01, -8.56974229e+00, -3.26293658e+01,
       -5.59582230e+01, -3.06733645e+01, -3.50036211e+01,  2.97192320e+01,
       -1.49529758e+01, -1.08360460e+01,  9.94154154e+01, -2.00460047e+01,
        1.94191575e+01, -1.05820043e+00, -1.92952599e+01,  7.25544183e-01,
       -3.57721511e+01, -6.22130587e+00, -2.03479313e+00,  6.16150945e+01,
       -4.46475626e+00,  1.32777317e+02, -1.36401721e+01,  1.39019070e+01,
        2.70644663e+01, -1.26394040e+01, -1.61493515e+01, -1.02958296e+00,
       -1.55997055e+01, -3.79031450e+01, -4.86233021e+00,  2.85986071e+01,
        1.08403297e+01,  2.21194727e+01,  8.98020141e+00, -5.10880769e+01,
        2.66292790e+01,  

In [None]:
# Make a prediction (placeholder: this cell has no code yet)


## Lasso Model

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression with default hyperparameters, trained on the same
# preprocessed training split. fit() returns the estimator, so `lasso_fit`
# is expected to be the same object as `tst_lasso`.
tst_lasso = Lasso()
lasso_fit = tst_lasso.fit(X_train, y_train)

In [None]:
# Predict test-split prices with the fitted Lasso model.
y_predict_lasso = lasso_fit.predict(X_test)

In [None]:
# Coefficient of determination (R^2) of the Lasso model on the test split.
metrics.r2_score(y_test, y_predict_lasso)

0.24946654150622738

In [None]:
# Mean squared error of the Lasso model on the test split.
metrics.mean_squared_error(y_test, y_predict_lasso)

21218.037044946082

In [None]:
# Mean absolute error of the Lasso model on the test split.
metrics.mean_absolute_error(y_test, y_predict_lasso)

103.99530350102508