# Notebook to Predict Delays in Flights for SCL Airport

In [None]:
import warnings
from datetime import datetime  # , timedelta

import pandas as pd
import seaborn as sns

# Suppress all Python warnings from this point on so they do not clutter cell outputs.
# NOTE(review): this blanket filter also hides pandas/seaborn deprecation warnings;
# consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")


# import numpy as np  (disabled; re-enable if a later cell needs it)

# Apply seaborn's "darkgrid" theme to every plot drawn after this cell
sns.set_theme(style="darkgrid")
# import sklearn  (disabled; re-enable if a later cell needs it)

## 0. Load dataset
Create DataFrame, check data and format

Column details for file `dataset_SCL.csv`:

- **Fecha-I** : Date and time of the programmed flight.
- **Vlo-I** : Programmed flight number.
- **Ori-I** : City code of programmed origin.
- **Des-I** : City code of programmed destination.
- **Emp-I** : Airline code of programmed flight.
- **Fecha-O** : Date and time of operated flight.
- **Vlo-O** : Operated flight number.
- **Ori-O** : City code of operated origin.
- **Des-O** : City code of operated destination.
- **Emp-O** : Airline code of operated flight.
- **DIA** : Day number of operated flight.
- **MES** : Month number of operated flight.
- **AÑO** : Year of operated flight.
- **DIANOM** : Day of the week of operated flight.
- **TIPOVUELO** : Type of flight, I = International, N = National.
- **OPERA** : Name of the operated airline.
- **SIGLAORI** : Origin city name.
- **SIGLADES** : Destination city name.

In [None]:
# Load the flight records from the CSV file into a pandas DataFrame
# (the path is relative to the directory the notebook is run from)

dataset = pd.read_csv("input/dataset_SCL.csv")
# Preview the first rows
dataset.head()

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple

dataset.shape

In [None]:
# Count the missing values (NaN's) in each column

dataset.isna().sum()

There's only 1 missing value in the whole dataset: an empty entry in the column `Vlo-O` (operated flight number).

To avoid removing the whole row of data, we will check how often `Vlo-O` is equal to `Vlo-I`.

In [None]:
# Share of rows whose programmed flight number ('Vlo-I') matches the operated one ('Vlo-O')

same_flight_number = dataset["Vlo-I"] == dataset["Vlo-O"]
# Matching rows divided by the total number of rows
fraction = same_flight_number.sum() / dataset.shape[0]

print(f"'Vlo-I' is equal to 'Vlo-O' {fraction:.1%} of the time")

There is a very high probability (over 97%) that 'Vlo-O' can be correctly estimated from 'Vlo-I', so we decide to fill this value instead of removing the row.

In [None]:
# Fill the missing values of 'Vlo-O' with the programmed flight number from 'Vlo-I'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on dataset["Vlo-O"]: that chained in-place call acts on
# a temporary object under pandas Copy-on-Write and would leave `dataset` unchanged.
dataset["Vlo-O"] = dataset["Vlo-O"].fillna(dataset["Vlo-I"])
# Total number of NaN's remaining in the whole DataFrame
dataset.isna().sum().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns,
# used here to check the min-max values for Day, Month and Year

dataset.describe()

In [None]:
# Print the distinct values of each categorical column to better understand the data

categorical_columns = [
    "Ori-I",
    "Des-I",
    "Emp-I",
    "Ori-O",
    "Des-O",
    "Emp-O",
    "DIANOM",
    "TIPOVUELO",
    "OPERA",
    "SIGLAORI",
    "SIGLADES",
]
for column_name in categorical_columns:
    # set() collapses the column to its distinct values; "\n" adds a blank line after each
    print(f"Unique values for column '{column_name}':", set(dataset[column_name]), "\n")

In [None]:
# Parse the programmed ('Fecha-I') and operated ('Fecha-O') timestamps from strings
# into datetime values so that time differences can be calculated.
# pd.to_datetime is vectorized and, unlike datetime.strptime, also accepts values
# that are already datetimes, so re-running this cell does not raise.
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
for date_column in ("Fecha-I", "Fecha-O"):
    dataset[date_column] = pd.to_datetime(dataset[date_column], format=DATETIME_FORMAT)

In [None]:
# Preview the first rows again after the date columns were converted
dataset.head()

## 1. Analyze Distribution of Data

In [None]:
# Distribution of Flights vs Day of the Week

g = sns.catplot(
    x="DIANOM",
    data=dataset,
    kind="count",
    # x-axis categories in calendar order, Monday -> Sunday
    order=["Lunes", "Martes", "Miercoles", "Jueves", "Viernes", "Sabado", "Domingo"],
)
g.set_axis_labels("\nDay of the Week\n", "\nTotal Flights\n", size=16)
# `figure` is used for both calls; `FacetGrid.fig` is a deprecated alias of it.
# y > 1 places the title above the plotting area.
g.figure.suptitle("\nNumber of Flights per Day of the Week", size=20, y=1.15)
g.figure.set_size_inches(15, 5)

In [None]:
# Distribution of Flights vs Month

# order=range(1, 13) keeps the months 1..12 in calendar order on the x axis
g = sns.catplot(x="MES", data=dataset, kind="count", order=range(1, 13))
g.set_axis_labels("\nMonth\n", "\nTotal Flights\n", size=16)
# `figure` is used for both calls; `FacetGrid.fig` is a deprecated alias of it.
# y > 1 places the title above the plotting area.
g.figure.suptitle("\nNumber of Flights per Month", size=20, y=1.15)
g.figure.set_size_inches(15, 5)

In [None]:
# Distribution of Flights vs Type of Flight

g = sns.catplot(x="TIPOVUELO", data=dataset, kind="count")
g.set_axis_labels(
    "\nType of Flight (International or National)\n", "\nTotal Flights\n", size=16
)
# `figure` is used for both calls; `FacetGrid.fig` is a deprecated alias of it.
# y > 1 places the title above the plotting area.
g.figure.suptitle("\nNumber of Flights per Type of Flight", size=20, y=1.15)
g.figure.set_size_inches(15, 5)

In [None]:
# Share of National vs International flights

# Boolean mask: True for rows flagged as National ("N")
is_national = dataset["TIPOVUELO"] == "N"
national_pct = is_national.sum() / len(dataset)

# The remaining share is reported as International
print(f"{national_pct:.1%} of Flights are National")
print(f"{1-national_pct:.1%} of Flights are International")

In [None]:
# Distribution of Flights per Operating Airline

g = sns.catplot(
    x="OPERA",
    data=dataset,
    kind="count",
    # value_counts() sorts by frequency, so bars run from most to least flights
    order=dataset["OPERA"].value_counts().index,
)
g.set_axis_labels("\nOperating Airline\n", "\nTotal Flights\n", size=16)
# `figure` is used for both calls; `FacetGrid.fig` is a deprecated alias of it.
# y > 1 places the title above the plotting area.
g.figure.suptitle("\nNumber of Flights per Operating Airline", size=20, y=1.15)
g.figure.set_size_inches(15, 5)
# rotate the x labels so the airline names do not overlap
g.set_xticklabels(rotation=90)
# show the y-axis tick labels as plain numbers without decimals
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(lambda x, p: f"{x:.0f}")

In [None]:
# Share of flights operated by 'Grupo LATAM' and by 'Sky Airline'
airline = dataset["OPERA"]
# Summing a boolean mask counts the rows where it is True
total_latam = (airline == "Grupo LATAM").sum()
total_sky = (airline == "Sky Airline").sum()

print(f"'Grupo LATAM' operates {total_latam/len(dataset):.1%} of the flights.")
print(f"'Sky Airline' operates {total_sky/len(dataset):.1%} of the flights.")
print(
    f"Together they account for {(total_sky+total_latam)/len(dataset):.1%} of flights."
)

In [None]:
# Distribution of Flights per City of Destination

g = sns.catplot(
    x="SIGLADES",
    data=dataset,
    kind="count",
    # value_counts() sorts by frequency, so bars run from most to least flights
    order=dataset["SIGLADES"].value_counts().index,
)
g.set_axis_labels("\nCity of Destination\n", "\nTotal Flights\n", size=16)
# `figure` is used for both calls; `FacetGrid.fig` is a deprecated alias of it.
# y > 1 places the title above the plotting area.
g.figure.suptitle("\nNumber of Flights per City of Destination", size=20, y=1.15)
g.figure.set_size_inches(15, 5)
# rotate the x labels so the city names do not overlap
g.set_xticklabels(rotation=90)
# show the y-axis tick labels as plain numbers without decimals
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(lambda x, p: f"{x:.0f}")

### Main Conclusions and Interpretation:

1. Overall, Saturday (**Sabado**) is the day of the week with the fewest flights. The remaining days are similar to one another, with slightly higher counts on Monday (**Lunes**), Thursday (**Jueves**) and Friday (**Viernes**).
2. The number of flights varies by month of the year: there are low-demand months (April/4, May/5 and June/6) and high-demand months (July/7, October/10, November/11, December/12 and January/1).
3. Slightly more than half (54.2%) of the flights are of '**National**' Type, while the rest (45.8%) are '**International**'.
4. Most flights are operated by '**Grupo LATAM**' (60%), followed by '**Sky Airline**' (21%). Together they account for 80.9% of total flights.
5. As seen previously (*not plotted*), the only City of Origin is '**Santiago**' de Chile.
6. The 5 most frequent destinations overall are (in descending order): '*Buenos Aires*', '*Antofagasta*', '*Lima*', '*Calama*' and '*Puerto Montt*'.	
7. The 5 most frequent **International** destinations are (in descending order): '*Buenos Aires*', '*Lima*', '*Sao Paulo*', '*Ciudad de Panama*' and '*Mendoza*'.
8. The 5 most frequent **National** destinations are (in descending order): '*Antofagasta*', '*Calama*', '*Puerto Montt*', '*Concepción*' and '*Iquique*'.