# Notebook to Predict Delays in Flights for SCL Airport

In [None]:
"""
Load necessary packages
"""

import warnings
from datetime import datetime  # , timedelta

import pandas as pd

# import numpy as np
# import seaborn as sns
# import sklearn


# Suppress every warning for the rest of the session. This keeps the cell
# output clean, but it also hides deprecation notices raised by libraries.
warnings.filterwarnings("ignore")

## 0. Load dataset
Create DataFrame, check data and format

Column details for file `dataset_SCL.csv`:

- **Fecha-I** : Date and time of the programmed flight.
- **Vlo-I** : Programmed flight number.
- **Ori-I** : City code of programmed origin.
- **Des-I** : City code of programmed destination.
- **Emp-I** : Airline code of programmed flight.
- **Fecha-O** : Date and time of operated flight.
- **Vlo-O** : Operated flight number.
- **Ori-O** : City code of operated origin.
- **Des-O** : City code of operated destination.
- **Emp-O** : Airline code of operated flight.
- **DIA** : Day number of operated flight.
- **MES** : Month number of operated flight.
- **AÑO** : Year of operated flight.
- **DIANOM** : Day of the week of operated flight.
- **TIPOVUELO** : Type of flight, I = International, N = National.
- **OPERA** : Name of the operated airline.
- **SIGLAORI** : Origin city name.
- **SIGLADES** : Destination city name.

In [None]:
# Load the dataset from the CSV file into a pandas DataFrame

dataset = pd.read_csv("input/dataset_SCL.csv")
# Preview the first rows of the loaded DataFrame
dataset.head()

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple

dataset.shape

In [None]:
# Count the missing values (NaN) in each column

dataset.isna().sum()

There is only one missing value in the whole dataset: an empty entry in the column `Vlo-O` (operated flight number).

To avoid removing the whole row of data, we will analyze how often `Vlo-O` is equal to `Vlo-I`.

In [None]:
# Share of rows whose programmed flight number ('Vlo-I') matches the
# operated flight number ('Vlo-O')

same_number = dataset["Vlo-I"].eq(dataset["Vlo-O"])
fraction = same_number.sum() / len(dataset)

print(f"'Vlo-I' is equal to 'Vlo-O' {fraction:.1%} of the time")

There is a very high probability (over 97%) that 'Vlo-O' can be correctly estimated from 'Vlo-I', so we decide to fill this value instead of removing the row.

In [None]:
# Fill the missing values of 'Vlo-O' with the value of 'Vlo-I' from the same row.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection dataset["Vlo-O"]: an in-place
# method on a column selection is a chained assignment, which pandas
# deprecates and which does not update `dataset` under Copy-on-Write.
dataset["Vlo-O"] = dataset["Vlo-O"].fillna(dataset["Vlo-I"])
# Count the NaNs remaining in the whole DataFrame
dataset.isna().sum().sum()

In [None]:
# Summary statistics of the DataFrame, used to check the min-max values
# for Day, Month and Year

dataset.describe()

In [None]:
# Show the unique values of each categorical column to better understand the data

columns_to_inspect = (
    "Ori-I",
    "Des-I",
    "Emp-I",
    "Ori-O",
    "Des-O",
    "Emp-O",
    "DIANOM",
    "TIPOVUELO",
    "OPERA",
    "SIGLAORI",
    "SIGLADES",
)

for column in columns_to_inspect:
    print(f"Unique values for column '{column}':", set(dataset[column]), "\n")

In [None]:
# Convert the programmed ('Fecha-I') and operated ('Fecha-O') timestamps from
# strings into datetime values so time differences can be calculated.
# pd.to_datetime parses the whole column at once with the same explicit format
# as before, and turns missing entries into NaT instead of raising.
dataset["Fecha-I"] = pd.to_datetime(dataset["Fecha-I"], format="%Y-%m-%d %H:%M:%S")
dataset["Fecha-O"] = pd.to_datetime(dataset["Fecha-O"], format="%Y-%m-%d %H:%M:%S")

In [None]:
# Preview the first rows of the DataFrame
dataset.head()