In [None]:
import matplotlib.pyplot as plt
import pandas as pd

import database

# Load every row of `voos_sp_sj` whose scheduled departure is on or after
# 2017-12-30, ordered by scheduled departure.
consulta_voos = '''
select * from voos_sp_sj
where dt_partida_prevista >= '2017-12-30 00:00:00'
order by dt_partida_prevista'''

voos = pd.read_sql( consulta_voos, con = database.engine )

In [None]:

# Daily weather summary per collection point / aerodrome.
#
# * `mappings` keeps only the aerodrome <-> collection-point rows whose aerodrome
#   appears as a flight origin in `voos_sp_sj`.
# * `resumo_clima` aggregates the `clima` readings per calendar day (total
#   precipitation, mean air temperature, mean maximum wind gust).
# * Each daily aggregate is stamped with the *following* day (`+ interval 1 day`),
#   so a later join on the flight date pairs a flight with the previous day's
#   weather. NOTE(review): presumably intentional (no look-ahead) -- confirm.
# * `id_aerodromo` is part of the GROUP BY: it is selected without an aggregate,
#   so leaving it out made the result depend on each `ponto_id` mapping to exactly
#   one aerodrome (and is rejected outright by strict GROUP BY checking when that
#   dependency cannot be proven).
clima = pd.read_sql( """

with
	mappings as (
	select *
		from
			aero_ponto_coleta
		where
			exists(
				select
					1
					from
						voos_sp_sj voos
					where
						voos.id_aerodromo_origem = aero_ponto_coleta.id_aerodromo
				)


	)
  , resumo_clima as (
                    select
	                    ponto_id
	                  , date( dt_clima ) + interval 1 day as dt_clima
	                  , id_aerodromo
	                  , sum( precipitacao_total )         as precipitacao_total
	                  , avg( temperatura_ar )             as temperatura_ar
	                  , avg( vento_rajada_maxima )        as vento_rajada_maxima
	                    from
		                    clima
		                    inner join mappings using ( ponto_id )
	                    group by
		                    ponto_id
		                  , id_aerodromo
		                  , date( dt_clima )
                    )

select * from resumo_clima
                    """, database.engine )

In [None]:
# Arrival delay (in minutes) from which a flight is labelled as delayed.
LIMITE_ATRASO_MINUTOS = 20

# Make the weather date a real datetime so it can be compared with the flight date.
clima[ "dt_clima" ] = pd.to_datetime( clima[ "dt_clima" ] )

# Reference day of each flight: scheduled departure truncated to midnight.
voos[ "dt_referencia" ] = voos[ "dt_partida_prevista" ].dt.floor( 'D' )

# Signed arrival delay in minutes. `total_seconds()` is required here: `.dt.seconds`
# is only the seconds *component* (0..86399) of the timedelta, so an early arrival
# (negative delta, stored as "-1 day + 23h50m") would read as ~1430 minutes and be
# labelled delayed, and delays of a day or more would wrap around.
atraso_chegada_minutos = (voos[ "dt_chegada_real" ] - voos[ "dt_chegada_prevista" ]).dt.total_seconds() / 60

# Rows with a missing arrival timestamp yield NaN here and therefore compare False.
voos[ "atrasado" ] = atraso_chegada_minutos >= LIMITE_ATRASO_MINUTOS

In [None]:
# Attach the weather summary to each flight. The join is a left join, so every
# flight row is kept even when no weather row matches its origin aerodrome / day.
chaves_voo = [ "id_aerodromo_origem", "dt_referencia" ]
chaves_clima = [ "id_aerodromo", "dt_clima" ]

df = pd.merge( voos, clima, how = "left", left_on = chaves_voo, right_on = chaves_clima )
df

In [None]:
# Keep only the label, the identifiers, the reference date and the weather measures.
colunas_modelo = [
	"atrasado",
	"id_aerodromo_origem",
	"id_aerodromo_destino",
	"id_empresa",
	"dt_referencia",
	"precipitacao_total",
	"temperatura_ar",
	"vento_rajada_maxima",
]
df = df.loc[ :, colunas_modelo ]
df

In [None]:
# Forward-fill missing values within each origin aerodrome, in chronological order.
#
# The fill is applied to every column except the grouping key and the result is
# written back column-wise. Group-wise fill methods can return a frame *without*
# the grouping column (pandas-version dependent), which would silently remove
# `id_aerodromo_origem` from `df`; assigning back keeps the key and column order.
# `GroupBy.ffill()` also replaces the deprecated `fillna( method = "ffill" )`.
# Assumes the row index is unique (needed for the aligned assignment) -- TODO confirm.
chave_grupo = "id_aerodromo_origem"

df = df.sort_values( "dt_referencia" )
colunas_preenchiveis = [ coluna for coluna in df.columns if coluna != chave_grupo ]
df[ colunas_preenchiveis ] = df.groupby( chave_grupo )[ colunas_preenchiveis ].ffill()

In [None]:
# Calendar features derived from the flight reference date.
df[ "dia_semana" ] = df[ "dt_referencia" ].dt.dayofweek

# ISO week of the year. `Series.dt.weekofyear` is deprecated and no longer exists
# in pandas 2.x; `dt.isocalendar().week` is its replacement. It returns a nullable
# unsigned dtype, so cast back to a plain integer column.
# Assumes `dt_referencia` has no missing values -- TODO confirm.
df[ "semana_ano" ] = df[ "dt_referencia" ].dt.isocalendar().week.astype( int )

# Any weather value still missing is replaced by that column's overall mean.
colunas_clima = [ "precipitacao_total", "temperatura_ar", "vento_rajada_maxima" ]
df[ colunas_clima ] = df[ colunas_clima ].fillna( df[ colunas_clima ].mean() )
df

In [None]:

from sklearn.model_selection import train_test_split

# Feature groups: identifiers / calendar fields are treated as categorical,
# weather measures as numeric.
categoricas = [ "id_empresa",
                "id_aerodromo_origem",
                "id_aerodromo_destino",
                "dia_semana",
                "semana_ano" ]
numericas = [ "precipitacao_total", "temperatura_ar", "vento_rajada_maxima" ]
train_columns = categoricas + numericas

# Target is kept as a one-column frame; features follow `train_columns` order.
df_y = df[ [ "atrasado" ] ]
df_x = df[ train_columns ]

# 80/20 split, stratified on the label. The seed is fixed so that re-running the
# notebook yields the same partition (and therefore comparable metrics).
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(
		df_x,
		df_y,
		train_size = 0.8,
		stratify = df_y,
		random_state = 1 )

In [None]:
from category_encoders import TargetEncoder

# Target-encode the categorical columns. The encoder is fitted on the training
# split only and then applied, unchanged, to both splits.
# NOTE(review): `handle_unknown = 'value'` presumably maps categories unseen during
# fit to a fallback value -- confirm against the category_encoders documentation.
encoder = TargetEncoder( cols = categoricas, handle_unknown = 'value' )
encoder.fit( df_train_x, df_train_y )

df_train_x, df_test_x = (
	encoder.transform( particao ) for particao in ( df_train_x, df_test_x )
)

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# scikit-learn estimators expect a 1-D target; passing the one-column frame
# directly triggers a DataConversionWarning on every fit.
alvo_treino = df_train_y.values.ravel()

# Shallow random forest (seeded for reproducibility).
forest: RandomForestClassifier = RandomForestClassifier( max_depth = 4, random_state = 1 )
forest.fit( df_train_x, alvo_treino )

# Logistic regression baseline with default settings.
reg: LogisticRegression = LogisticRegression()
reg.fit( df_train_x, alvo_treino )

# Forest probability of the second class (column 1 of `predict_proba`) for each split.
df_predict_train = forest.predict_proba( df_train_x )[ :, 1 ]
df_predict_test = forest.predict_proba( df_test_x )[ :, 1 ]

In [None]:

from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score

# Report the same four metrics for the training and the testing split.
# Class predictions use a 0.5 cut-off on the forest probability; ROC AUC is
# computed on the raw probability.
avaliacoes = (
	( "training", df_train_y, df_predict_train ),
	( "testing", df_test_y, df_predict_test ),
)

for conjunto, y_real, probabilidade in avaliacoes:
	classe_prevista = probabilidade > 0.5
	print( f"{conjunto} accuracy_score: ", accuracy_score( y_real, classe_prevista ) )
	print( f"{conjunto} precision_score: ", precision_score( y_real, classe_prevista ) )
	print( f"{conjunto} f1_score: ", f1_score( y_real, classe_prevista ) )
	print( f"{conjunto} roc_auc_score: ", roc_auc_score( y_real, probabilidade ) )


In [None]:
from sklearn.calibration import calibration_curve

# Calibration of the forest on the training split.
# `df_predict_train` already holds probabilities from `predict_proba`, so it must
# not be min-max rescaled: `normalize = True` stretched the values to [0, 1] and
# distorted the curve (the argument is also deprecated in recent scikit-learn).
# Return order is (fraction of positives per bin, mean predicted probability per bin).
cal_x, cal_y = calibration_curve( df_train_y, df_predict_train )
print( cal_x, cal_y )

In [None]:
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix
import numpy as np

# Large canvas for the ROC curves; the figure handle was misspelled `dig`.
fig, ax = plt.subplots( figsize = (20, 20) )

# Diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot( [ 0, 1 ], [ 0, 1 ] )


class Proxy:
	"""Thin wrapper around a fitted classifier that reshapes its probabilities.

	The wrapper copies `_estimator_type` and `classes_` from the wrapped model and
	exposes a `predict_proba` that returns the element-wise square root of the
	wrapped model's probabilities.
	"""

	def __init__( self, model ):
		"""Store `model` and mirror its `_estimator_type` and `classes_` attributes."""
		self.model = model
		for atributo in ( "_estimator_type", "classes_" ):
			setattr( self, atributo, getattr( model, atributo ) )

	def predict_proba( self, x ):
		"""Return the square root of `model.predict_proba( x )`, element by element.

		Note that the transformed rows are not re-normalised.
		"""
		probabilidades = self.model.predict_proba( x )
		return np.sqrt( probabilidades )


# Draw the training-set ROC curve of both models on the shared axes.
# (A dangling `plot_` fragment used to follow these calls and raised a NameError
# whenever the cell was executed; it has been removed.)
for modelo in ( forest, reg ):
	plot_roc_curve( modelo, df_train_x, df_train_y, ax = ax )

In [None]:
# Training-set precision-recall curves of both models on one set of axes.
# The argument-less `ax.plot()` call was a no-op and is dropped; the figure handle
# was misspelled `dig`.
fig, ax = plt.subplots()
for modelo in ( forest, reg ):
	plot_precision_recall_curve( modelo, df_train_x, df_train_y, ax = ax )

In [None]:
# Confusion matrix of the forest on the training split, drawn with the "Blues" colormap.
plot_confusion_matrix(
	forest,
	X = df_train_x,
	y_true = df_train_y,
	cmap = "Blues",
)