In [14]:
import pandas as pd
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, make_scorer, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import shuffle
import numpy as np
import pickle

## Load data 

SCL_flights_full contains the columns from SCL_flights_data_extra and SCL_flights_data(original), so it is the one useful to create a model.

In [4]:
df = pd.read_csv('./../datasets/SCL_flights_full.csv')

# shuffle data with the same random_state as to-expose.ipynb for an approx comparission
df = shuffle(df, random_state=111)

print(f"data shape: {df.shape}")
print(f"Info: {df.info()}")

data shape: (68206, 22)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 68206 entries, 13375 to 10196
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Fecha-I         68206 non-null  object 
 1   Vlo-I           68206 non-null  object 
 2   Ori-I           68206 non-null  object 
 3   Des-I           68206 non-null  object 
 4   Emp-I           68206 non-null  object 
 5   Fecha-O         68206 non-null  object 
 6   Vlo-O           68205 non-null  object 
 7   Ori-O           68206 non-null  object 
 8   Des-O           68206 non-null  object 
 9   Emp-O           68206 non-null  object 
 10  DIA             68206 non-null  int64  
 11  MES             68206 non-null  int64  
 12  AÑO             68206 non-null  int64  
 13  DIANOM          68206 non-null  object 
 14  TIPOVUELO       68206 non-null  object 
 15  OPERA           68206 non-null  object 
 16  SIGLAORI        68206 non-null  object 
 17  SIG

  df = pd.read_csv('./../datasets/SCL_flights_full.csv')


## Prepare data

The features to use, based on the assumption that they may influece on whether a flight gets delayed or not, are: 
- **DIA** (numerical) - Airport can be busy
- **MES** (numerical) - Seasons factors
- **temporada_alta** (numerical) - High congestion at the airport
- **DIANOM** Nombre del día (categorical) - Day of the week
- **TIPOVUELO** vuelo nacional o internacional (categorical) - different procedures
- **OPERA** Nombre de la aerolinea (categorical) - different operational procedures
- **SIGLAORI** Ciudad de origen (categorical) - airport status
- **SIGLADES** Ciudad de destino (categorical) - airport status 
- **periodo_dia** (categorical) - some periods can be busier

In [5]:
# Create one-hot encoded vectors
features = pd.get_dummies(df, columns=['DIANOM', 'TIPOVUELO', 'OPERA', 'SIGLAORI', 'SIGLADES', 'periodo_dia'])

# Drop irrelevant columns
features.drop(['Fecha-I', 
         'Fecha-O', 
         'Vlo-I', 
         'Vlo-O', 
         'AÑO', 
         'dif_min', 
         'Ori-O', 
         'Des-O', 
         'Emp-O', 
         'Ori-I', 
         'Des-I', 
         'Emp-I'], axis=1, inplace=True)

# Keep 20% of data for testing
training_set, testing_set = train_test_split(features, test_size=0.2, random_state=42)

# Define input and expected output data
X = training_set.drop('atraso_15', axis=1)
y = training_set['atraso_15']

# Fix class imbalance, these new samples are generated by interpolating between the existing 
# samples in the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.33, random_state=42)

print(f"Training dist: \n{y_train.value_counts('%')}")
print(f"Testing dist: \n{y_val.value_counts('%')}")

Training dist: 
1    0.501764
0    0.498236
Name: atraso_15, dtype: float64
Testing dist: 
0    0.503581
1    0.496419
Name: atraso_15, dtype: float64


## Create model

The models in to-expose.ipynb were not using enough features to predict a flight delay, besides that, they had very poor performance since the recall for class 1 (delayed flights) was at only 0.03 which indicates that the model is not good identifying delays (it correctly identified only 3% of them). This is due to unbalanced data. 

So, in this new model, both of those problems were addressed in the above cell, more features and balanced data. This significantly improved the model and increased the class 1 recall to around 20%.

To improve it more, a grid search was done in the following cell using the Recall score as metric; This because it is better to say that a flight is going to be delayed and be wrong than saying that a flight is going to be on-time and be wrong...

In [None]:
model = xgb.XGBClassifier(eval_metric='logloss', random_state=1)

# Define the hyperparameters
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6, 9],
    'n_estimators': [50, 100, 200],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.3]
}

# Use recall score as the evaluation metric for GridSearchCV
recall_scorer = make_scorer(recall_score, greater_is_better=True)

# Dearch 
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=recall_scorer, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best combination of hyperparameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

In [21]:
print("Best parameters found: ", best_params)

Best parameters found:  {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 200, 'subsample': 1.0}


In [6]:
best_params = {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 200, 'subsample': 1.0}
model = xgb.XGBClassifier(eval_metric='logloss', random_state=1, **best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.73      0.78     14764
           1       0.76      0.86      0.80     14554

    accuracy                           0.79     29318
   macro avg       0.80      0.79      0.79     29318
weighted avg       0.80      0.79      0.79     29318



## Test model 

Validate the results with real data (without generated samples) 

In [7]:
# Define input and expected output data
X = testing_set.drop('atraso_15', axis=1)
y = testing_set['atraso_15']

y_pred = model.predict(X)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.72      0.79     11171
           1       0.30      0.54      0.38      2471

    accuracy                           0.69     13642
   macro avg       0.59      0.63      0.59     13642
weighted avg       0.77      0.69      0.72     13642



Observe that the recall for class 1 (delayed flights), altough is relatively low, it is an improvement compared to the previous models (to_expose.ipynb). It also indicates that there is still room for improvements, feature engineering might help. For example, SIGLAORI may not be needed since in all the dataset, it has always the value of "Santiago"

## Save model

In [23]:
import pickle

with open('mynewmodel.pkl', 'wb') as file:
    pickle.dump(model, file)

## Inference 

In [26]:
# list of values we should pass as input for inference
for val in X.columns:
    print(val)

DIA
MES
temporada_alta
DIANOM_Domingo
DIANOM_Jueves
DIANOM_Lunes
DIANOM_Martes
DIANOM_Miercoles
DIANOM_Sabado
DIANOM_Viernes
TIPOVUELO_I
TIPOVUELO_N
OPERA_Aerolineas Argentinas
OPERA_Aeromexico
OPERA_Air Canada
OPERA_Air France
OPERA_Alitalia
OPERA_American Airlines
OPERA_Austral
OPERA_Avianca
OPERA_British Airways
OPERA_Copa Air
OPERA_Delta Air
OPERA_Gol Trans
OPERA_Grupo LATAM
OPERA_Iberia
OPERA_JetSmart SPA
OPERA_K.L.M.
OPERA_Lacsa
OPERA_Latin American Wings
OPERA_Oceanair Linhas Aereas
OPERA_Plus Ultra Lineas Aereas
OPERA_Qantas Airways
OPERA_Sky Airline
OPERA_United Airlines
SIGLAORI_Santiago
SIGLADES_Antofagasta
SIGLADES_Arica
SIGLADES_Asuncion
SIGLADES_Atlanta
SIGLADES_Auckland N.Z.
SIGLADES_Balmaceda
SIGLADES_Bariloche
SIGLADES_Bogota
SIGLADES_Buenos Aires
SIGLADES_Calama
SIGLADES_Cancun
SIGLADES_Castro (Chiloe)
SIGLADES_Cataratas Iguacu
SIGLADES_Ciudad de Mexico
SIGLADES_Ciudad de Panama
SIGLADES_Cochabamba
SIGLADES_Concepcion
SIGLADES_Copiapo
SIGLADES_Cordoba
SIGLADES_Curitib

In [8]:
# For simplicity, create a single dict header
data_headers_dict = {
    "DIA" : 0,
    "MES" : 0,
    "temporada_alta" : 0,
    "DIANOM_Domingo" : 0,
    "DIANOM_Jueves" : 0,
    "DIANOM_Lunes" : 0,
    "DIANOM_Martes" : 0,
    "DIANOM_Miercoles" : 0,
    "DIANOM_Sabado" : 0,
    "DIANOM_Viernes" : 0,
    "TIPOVUELO_I" : 0,
    "TIPOVUELO_N" : 0,
    "OPERA_Aerolineas Argentinas" : 0,
    "OPERA_Aeromexico" : 0,
    "OPERA_Air Canada" : 0,
    "OPERA_Air France" : 0,
    "OPERA_Alitalia" : 0,
    "OPERA_American Airlines" : 0,
    "OPERA_Austral" : 0,
    "OPERA_Avianca" : 0,
    "OPERA_British Airways" : 0,
    "OPERA_Copa Air" : 0,
    "OPERA_Delta Air" : 0,
    "OPERA_Gol Trans" : 0,
    "OPERA_Grupo LATAM" : 0,
    "OPERA_Iberia" : 0,
    "OPERA_JetSmart SPA" : 0,
    "OPERA_K.L.M." : 0,
    "OPERA_Lacsa" : 0,
    "OPERA_Latin American Wings" : 0,
    "OPERA_Oceanair Linhas Aereas" : 0,
    "OPERA_Plus Ultra Lineas Aereas" : 0,
    "OPERA_Qantas Airways" : 0,
    "OPERA_Sky Airline" : 0,
    "OPERA_United Airlines" : 0,
    "SIGLAORI_Santiago" : 0,
    "SIGLADES_Antofagasta" : 0,
    "SIGLADES_Arica" : 0,
    "SIGLADES_Asuncion" : 0,
    "SIGLADES_Atlanta" : 0,
    "SIGLADES_Auckland N.Z." : 0,
    "SIGLADES_Balmaceda" : 0,
    "SIGLADES_Bariloche" : 0,
    "SIGLADES_Bogota" : 0,
    "SIGLADES_Buenos Aires" : 0,
    "SIGLADES_Calama" : 0,
    "SIGLADES_Cancun" : 0,
    "SIGLADES_Castro (Chiloe)" : 0,
    "SIGLADES_Cataratas Iguacu" : 0,
    "SIGLADES_Ciudad de Mexico" : 0,
    "SIGLADES_Ciudad de Panama" : 0,
    "SIGLADES_Cochabamba" : 0,
    "SIGLADES_Concepcion" : 0,
    "SIGLADES_Copiapo" : 0,
    "SIGLADES_Cordoba" : 0,
    "SIGLADES_Curitiba, Bra." : 0,
    "SIGLADES_Dallas" : 0,
    "SIGLADES_Florianapolis" : 0,
    "SIGLADES_Guayaquil" : 0,
    "SIGLADES_Houston" : 0,
    "SIGLADES_Iquique" : 0,
    "SIGLADES_Isla de Pascua" : 0,
    "SIGLADES_La Paz" : 0,
    "SIGLADES_La Serena" : 0,
    "SIGLADES_Lima" : 0,
    "SIGLADES_Londres" : 0,
    "SIGLADES_Los Angeles" : 0,
    "SIGLADES_Madrid" : 0,
    "SIGLADES_Melbourne" : 0,
    "SIGLADES_Mendoza" : 0,
    "SIGLADES_Miami" : 0,
    "SIGLADES_Montevideo" : 0,
    "SIGLADES_Neuquen" : 0,
    "SIGLADES_Nueva York" : 0,
    "SIGLADES_Orlando" : 0,
    "SIGLADES_Osorno" : 0,
    "SIGLADES_Paris" : 0,
    "SIGLADES_Pisco, Peru" : 0,
    "SIGLADES_Puerto Montt" : 0,
    "SIGLADES_Puerto Natales" : 0,
    "SIGLADES_Puerto Stanley" : 0,
    "SIGLADES_Punta Arenas" : 0,
    "SIGLADES_Punta Cana" : 0,
    "SIGLADES_Punta del Este" : 0,
    "SIGLADES_Quito" : 0,
    "SIGLADES_Rio de Janeiro" : 0,
    "SIGLADES_Roma" : 0,
    "SIGLADES_Rosario" : 0,
    "SIGLADES_San Juan, Arg." : 0,
    "SIGLADES_Santa Cruz" : 0,
    "SIGLADES_Sao Paulo" : 0,
    "SIGLADES_Sydney" : 0,
    "SIGLADES_Temuco" : 0,
    "SIGLADES_Toronto" : 0,
    "SIGLADES_Tucuman" : 0,
    "SIGLADES_Ushuia" : 0,
    "SIGLADES_Valdivia" : 0,
    "SIGLADES_Washington" : 0,
    "periodo_dia_mañana" : 0,
    "periodo_dia_noche" : 0,
    "periodo_dia_tarde" : 0
}

In [11]:
with open("./../models/model.pkl", 'rb') as file:
    trained_model = pickle.load(file)

### Test a flight on-time

In [12]:
# create new data one-hot vector 
new_data = data_headers_dict.copy()

# update dict values
new_data['DIA'] = 15
new_data['MES'] = 3
new_data['temporada_alta'] = 1

cat_vars_names = [
    "DIANOM_" + "Lunes",
    "TIPOVUELO_" + "I",
    "OPERA_" + "Delta Air",
    "SIGLAORI_" + "Santiago",
    "SIGLADES_" + "Ciudad de Mexico",
    "periodo_dia_" + "noche"
]

for name in cat_vars_names:
    if name in new_data:
        new_data[name] = 1
    else: 
        print(name)

print(len(new_data))

101


In [15]:
# covert to numpy
x = np.array(list(new_data.values())).reshape(1, -1)
# predict 
prediction = model.predict(x)
pred = "delayed" if prediction else "on-time"
print(pred)

on-time


### Test a delayed flight

In [16]:
# create new data one-hot vector 
new_data = data_headers_dict.copy()

# update dict values
new_data['DIA'] = 8
new_data['MES'] = 1
new_data['temporada_alta'] = 1

cat_vars_names = [
    "DIANOM_" + "Domingo",
    "TIPOVUELO_" + "I",
    "OPERA_" + "Qantas Airways",
    "SIGLAORI_" + "Santiago",
    "SIGLADES_" + "Sydney",
    "periodo_dia_" + "tarde"
]

for name in cat_vars_names:
    if name in new_data:
        new_data[name] = 1
    else: 
        print(name)

print(len(new_data))

101


In [17]:
# covert to numpy
x = np.array(list(new_data.values())).reshape(1, -1)
# predict 
prediction = model.predict(x)
pred = "delayed" if prediction else "on-time"
print(pred)

delayed
