# Layover computation script

Script to retrieve data from the Aviationstack API for a specified date range and calculate layover statistics for the selected airport during the given time period.

### Function definitions

In [1]:
import pickle
import ast
import requests
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from config import API_KEY, BASE_URL
from inputs import airport_code, start_date, end_date, use_cached, min_layover_time, max_layover_time

def get_flight_data_for_date(airport_code, flight_type, date):
    """Fetch flight data (arrivals or departures) for one airport and one day, following pagination.

    Args:
        airport_code: Airport IATA code, sent as ``arr_iata`` or ``dep_iata``.
        flight_type: ``'arrived'`` to filter on the arrival airport, or
            ``'departed'`` to filter on the departure airport.
        date: Object with ``strftime``; formatted as ``%Y-%m-%d`` and sent as
            ``flight_date``.

    Returns:
        list: The concatenated ``data`` entries of every page fetched. If a
        request returns a non-200 status, the error is printed and the records
        collected so far are returned.

    Raises:
        ValueError: If ``flight_type`` is neither ``'arrived'`` nor ``'departed'``.
    """
    # Validate once, before any request is made, instead of on every page.
    if flight_type == 'arrived':
        airport_param = 'arr_iata'
    elif flight_type == 'departed':
        airport_param = 'dep_iata'
    else:
        raise ValueError("Invalid flight_type. Use 'arrived' or 'departed'.")

    endpoint = f'{BASE_URL}flights'
    flight_date = date.strftime('%Y-%m-%d')
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run
    all_data = []
    limit = 100
    offset = 0

    while True:
        params = {
            'access_key': API_KEY,
            airport_param: airport_code,
            'flight_date': flight_date,
            'limit': limit,
            'offset': offset,
        }

        # Request one page of flight data
        response = requests.get(endpoint, params=params, timeout=request_timeout)

        if response.status_code != 200:
            # Error bodies are not guaranteed to be JSON; fall back to the raw text.
            try:
                error_body = response.json()
            except ValueError:
                error_body = response.text
            print(f"Error: {response.status_code}, {error_body}")
            break

        response_data = response.json()
        # Treat a missing or null 'data' field as an empty page.
        data = response_data.get('data') or []
        pagination = response_data.get('pagination') or {}
        print(pagination)

        # Add the data from this page
        all_data.extend(data)

        # If pagination info is missing, we assume we are done
        if not ('total' in pagination and 'count' in pagination and 'offset' in pagination):
            break

        count = pagination['count']
        # A page reporting no records cannot advance the offset; stop rather
        # than request the same page forever.
        if count <= 0:
            break

        offset = pagination['offset'] + count

        # Stop if we have collected all pages
        if offset >= pagination['total']:
            break

    return all_data

def save_data_to_disk(df, airport_code, start_date, end_date, flight_type):
    """Write a flight DataFrame to a CSV file in the current working directory.

    The file is named ``<airport>_<flight_type>_<start>_to_<end>.csv`` with
    both dates formatted as ``YYYY-MM-DD``. The DataFrame index is not written.
    """
    first_day = start_date.strftime("%Y-%m-%d")
    last_day = end_date.strftime("%Y-%m-%d")
    filename = f'{airport_code}_{flight_type}_{first_day}_to_{last_day}.csv'

    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

def load_data_from_disk(airport_code, start_date, end_date, flight_type):
    """Read a previously saved flight CSV from the current working directory.

    The expected file name is ``<airport>_<flight_type>_<start>_to_<end>.csv``
    with both dates formatted as ``YYYY-MM-DD``.

    Returns:
        The CSV contents as a DataFrame, or ``None`` when no such path exists.
    """
    first_day = start_date.strftime("%Y-%m-%d")
    last_day = end_date.strftime("%Y-%m-%d")
    filename = f'{airport_code}_{flight_type}_{first_day}_to_{last_day}.csv'

    # Guard clause: nothing cached under this name.
    if not os.path.exists(filename):
        return None

    cached = pd.read_csv(filename)
    print(f"Loaded cached data from {filename}")
    return cached

def _parse_cached_dict(value):
    """Rebuild a dict from its CSV string form; non-string cells (e.g. NaN) pass through unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


def _add_derived_columns(frame, leg, time_column):
    """Add the estimated-time, ``iata`` and ``icao`` columns used for de-duplication and sorting.

    ``leg`` is the column holding the per-flight dict to read ``'estimated'``
    from (``'arrival'`` or ``'departure'``); ``time_column`` is the name of the
    datetime column to create.
    """
    # A fetch that returned no records yields a frame with no columns at all;
    # create the ones read below so later lookups do not raise KeyError.
    for column in ('flight_date', leg, 'flight'):
        if column not in frame.columns:
            frame[column] = None

    frame[time_column] = pd.to_datetime(
        frame[leg].apply(lambda x: x.get('estimated') if isinstance(x, dict) else None)
    )
    frame['iata'] = frame['flight'].apply(lambda x: x.get('iata') if isinstance(x, dict) else None)
    frame['icao'] = frame['flight'].apply(lambda x: x.get('icao') if isinstance(x, dict) else None)
    return frame


def collect_flights_for_range(airport_code, start_date, end_date, use_cached=True):
    """Collect flights for a given date range by fetching data one day at a time.

    Args:
        airport_code: Airport IATA code to query.
        start_date: First day of the range (inclusive); must support
            ``strftime`` and ``+ timedelta``.
        end_date: Last day of the range (inclusive).
        use_cached: When true, return the CSV files written by a previous run
            if both the arrivals and the departures file exist.

    Returns:
        tuple: ``(arrivals_df, departures_df)``. Each frame carries the derived
        columns ``estimated_arrival_time`` / ``estimated_departure_time``,
        ``iata`` and ``icao``. Freshly fetched frames are de-duplicated and
        sorted by ``flight_date`` and the estimated time.
    """
    # Try to load cached data from disk if available
    if use_cached:
        arrivals_df = load_data_from_disk(airport_code, start_date, end_date, 'arrivals')
        departures_df = load_data_from_disk(airport_code, start_date, end_date, 'departures')
        if arrivals_df is not None and departures_df is not None:
            # CSV stores the nested dicts as their string repr; parse them back.
            arrivals_df['arrival'] = arrivals_df['arrival'].apply(_parse_cached_dict)
            arrivals_df['estimated_arrival_time'] = pd.to_datetime(arrivals_df['estimated_arrival_time'])

            departures_df['departure'] = departures_df['departure'].apply(_parse_cached_dict)
            departures_df['estimated_departure_time'] = pd.to_datetime(departures_df['estimated_departure_time'])
            return arrivals_df, departures_df

    # Initialize empty lists to collect data for all dates
    all_arrivals = []
    all_departures = []

    # Loop through the date range and collect data for each day
    current_date = start_date
    while current_date <= end_date:
        print(f"Fetching flight data for {current_date.strftime('%Y-%m-%d')}")

        # Fetch arrivals and departures for the current date
        arrivals_data = get_flight_data_for_date(airport_code, 'arrived', current_date)
        departures_data = get_flight_data_for_date(airport_code, 'departed', current_date)

        if arrivals_data:
            all_arrivals.extend(arrivals_data)
        if departures_data:
            all_departures.extend(departures_data)

        # Move to the next day
        current_date += timedelta(days=1)

    # Convert to DataFrames and add the derived columns (safe for empty fetches)
    arrivals_df = _add_derived_columns(pd.DataFrame(all_arrivals), 'arrival', 'estimated_arrival_time')
    departures_df = _add_derived_columns(pd.DataFrame(all_departures), 'departure', 'estimated_departure_time')

    print("len arrivals_df before drop:", len(arrivals_df))
    print("len departures_df before drop:", len(departures_df))
    # Drop duplicates based on 'icao', 'iata', and 'estimated_arrival_time' or 'estimated_departure_time'
    arrivals_df = arrivals_df.drop_duplicates(subset=['icao', 'iata', 'estimated_arrival_time'])
    departures_df = departures_df.drop_duplicates(subset=['icao', 'iata', 'estimated_departure_time'])
    print("len arrivals_df after drop:", len(arrivals_df))
    print("len departures_df after drop:", len(departures_df))

    # Sort arrivals and departures by flight_date and estimated times
    arrivals_df = arrivals_df.sort_values(by=['flight_date', 'estimated_arrival_time'])
    departures_df = departures_df.sort_values(by=['flight_date', 'estimated_departure_time'])

    # Save the data to disk. An empty result is not cached, so that a failed
    # fetch is not served back as "cached data" on the next run.
    if arrivals_df.empty or departures_df.empty:
        print("No arrivals and/or departures were fetched; skipping the on-disk cache.")
    else:
        save_data_to_disk(arrivals_df, airport_code, start_date, end_date, 'arrivals')
        save_data_to_disk(departures_df, airport_code, start_date, end_date, 'departures')

    return arrivals_df, departures_df

def calculate_layover(arrivals, departures, min_layover_time, max_layover_time):
    """Compute layover times between arrivals and all valid departures within a time window.

    For every arrival with a non-null ``estimated_arrival_time``, each
    departure whose ``estimated_departure_time`` lies in
    ``[arrival + min_layover_time, arrival + max_layover_time]`` and whose
    ``departure['iata']`` equals the arrival's ``arrival['iata']`` contributes
    one layover.

    Args:
        arrivals: DataFrame with ``estimated_arrival_time`` and ``arrival``
            (dict with an ``'iata'`` key) columns.
        departures: DataFrame with ``estimated_departure_time`` and
            ``departure`` (dict with an ``'iata'`` key) columns.
        min_layover_time: Lower bound of the window, added to the arrival time.
        max_layover_time: Upper bound of the window, added to the arrival time.

    Returns:
        list[float]: Layover durations in minutes, in arrival order and, within
        one arrival, in the row order of ``departures``.
    """
    layovers = []

    # These do not depend on the arrival, so compute them once instead of
    # re-deriving the departure airport for every arrival row.
    departure_times = departures['estimated_departure_time']
    departure_airports = departures['departure'].apply(
        lambda x: x.get('iata') if isinstance(x, dict) else None
    )

    for print_count, (_, arrival) in enumerate(arrivals.iterrows()):
        arrival_time = arrival['estimated_arrival_time']
        # Progress output every 1000 arrivals
        if print_count % 1000 == 0:
            print(print_count)
            print(arrival_time)

        if pd.isnull(arrival_time):
            continue

        # Tolerate a missing or non-dict 'arrival' cell (e.g. NaN read back
        # from CSV): such a row simply matches no departure.
        arrival_info = arrival.get('arrival')
        arrival_airport = arrival_info.get('iata') if isinstance(arrival_info, dict) else None

        # Define the time window for valid departures
        min_departure_time = arrival_time + min_layover_time
        max_departure_time = arrival_time + max_layover_time

        # Filter departures based on the time window and airport
        matched_times = departure_times[
            (departure_times >= min_departure_time) &
            (departure_times <= max_departure_time) &
            (departure_airports == arrival_airport)
        ]
        if matched_times.empty:
            continue

        # Layover time for each valid departure, converted to minutes
        minutes = (matched_times - arrival_time).dt.total_seconds() / 60
        layovers.extend(minutes.tolist())

    return layovers

def main():
    """Load or fetch the flights, compute the layovers, print them and plot a histogram.

    Reads its inputs (airport, date range, cache flag, layover window) from
    module-level names. The histogram is only drawn when at least one layover
    was found.
    """
    arrivals, departures = collect_flights_for_range(airport_code, start_date, end_date, use_cached)
    layovers = calculate_layover(arrivals, departures, min_layover_time, max_layover_time)

    period_start = start_date.strftime('%Y-%m-%d')
    period_end = end_date.strftime('%Y-%m-%d')

    # Print the raw list of layover durations
    print(f"Layovers for {airport_code} from {period_start} to {period_end}: {layovers}")

    # Nothing to plot without layovers
    if not layovers:
        return

    plt.figure(figsize=(10, 6))
    plt.hist(layovers, bins=10, edgecolor='black', alpha=0.7)
    plt.title(f'Layover Time Distribution for {airport_code} ({period_start} to {period_end})')
    plt.xlabel('Layover Time (minutes)')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

# if __name__ == '__main__':
#     main()


### Setting input parameters

In [2]:
# These assignments replace the values of the same names imported from `inputs`
# in the first cell; min_layover_time and max_layover_time are not reassigned here.
airport_code = 'LHR'  # Example airport code
start_date = datetime(2024, 6, 1)  # Example start date
end_date = datetime(2024, 9, 1)   # Example end date
use_cached = True  # Set to False if you want to fetch new data

### Gathering data from the Aviationstack API

In [3]:
# Collect all flight data for the date range
arrivals_df, departures_df = collect_flights_for_range(airport_code, start_date, end_date, use_cached)

Loaded cached data from LHR_arrivals_2024-06-01_to_2024-09-01.csv
Loaded cached data from LHR_departures_2024-06-01_to_2024-09-01.csv


In [4]:
arrivals_df.head()

Unnamed: 0,flight_date,flight_status,departure,arrival,airline,flight,aircraft,live,estimated_arrival_time,iata,icao
0,2024-06-01,landed,"{'airport': 'King Khaled International', 'time...","{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'name': 'Iberia', 'iata': 'IB', 'icao': 'IBE'}","{'number': '7426', 'iata': 'IB7426', 'icao': '...",,,2024-06-01 05:35:00+00:00,IB7426,IBE7426
1,2024-06-01,landed,"{'airport': 'King Khaled International', 'time...","{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'name': 'British Airways', 'iata': 'BA', 'ica...","{'number': '262', 'iata': 'BA262', 'icao': 'BA...","{'registration': 'G-VIIA', 'iata': 'B772', 'ic...",,2024-06-01 05:35:00+00:00,BA262,BAW262
2,2024-06-01,scheduled,"{'airport': 'Doha International', 'timezone': ...","{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'name': 'Qatar Airways', 'iata': 'QR', 'icao'...","{'number': None, 'iata': 'QR', 'icao': 'QTR', ...",,,2024-06-01 05:42:00+00:00,QR,QTR
3,2024-06-01,landed,"{'airport': 'Bahrain International', 'timezone...","{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'name': 'Iberia', 'iata': 'IB', 'icao': 'IBE'}","{'number': '7360', 'iata': 'IB7360', 'icao': '...",,,2024-06-01 06:15:00+00:00,IB7360,IBE7360
4,2024-06-01,landed,"{'airport': 'Bahrain International', 'timezone...","{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'name': 'American Airlines', 'iata': 'AA', 'i...","{'number': '7103', 'iata': 'AA7103', 'icao': '...",,,2024-06-01 06:15:00+00:00,AA7103,AAL7103


In [5]:
departures_df.head()

Unnamed: 0,flight_date,flight_status,departure,arrival,airline,flight,aircraft,live,estimated_departure_time,iata,icao
0,2024-06-01,scheduled,"{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'airport': 'Barajas', 'timezone': 'Europe/Mad...","{'name': 'Iberia', 'iata': 'IB', 'icao': 'IBE'}","{'number': '7458', 'iata': 'IB7458', 'icao': '...",,,2024-06-01 06:00:00+00:00,IB7458,IBE7458
1,2024-06-01,landed,"{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'airport': 'Zurich', 'timezone': 'Europe/Zuri...","{'name': 'Air Canada', 'iata': 'AC', 'icao': '...","{'number': '6756', 'iata': 'AC6756', 'icao': '...",,,2024-06-01 06:00:00+00:00,AC6756,ACA6756
2,2024-06-01,scheduled,"{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'airport': 'Barajas', 'timezone': 'Europe/Mad...","{'name': 'American Airlines', 'iata': 'AA', 'i...","{'number': '6806', 'iata': 'AA6806', 'icao': '...",,,2024-06-01 06:00:00+00:00,AA6806,AAL6806
3,2024-06-01,scheduled,"{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'airport': 'Barajas', 'timezone': 'Europe/Mad...","{'name': 'British Airways', 'iata': 'BA', 'ica...","{'number': '456', 'iata': 'BA456', 'icao': 'BA...","{'registration': 'G-EUYP', 'iata': 'A320', 'ic...","{'updated': '2024-06-02T17:47:21+00:00', 'lati...",2024-06-01 06:00:00+00:00,BA456,BAW456
4,2024-06-01,landed,"{'airport': 'Heathrow', 'timezone': 'Europe/Lo...","{'airport': 'Zurich', 'timezone': 'Europe/Zuri...","{'name': 'SWISS', 'iata': 'LX', 'icao': 'SWR'}","{'number': '345', 'iata': 'LX345', 'icao': 'SW...",,,2024-06-01 06:00:00+00:00,LX345,SWR345


In [6]:
# Add a 'month' column to both frames: the second '-'-separated field of flight_date
for flights_frame in (arrivals_df, departures_df):
    flights_frame['month'] = flights_frame['flight_date'].apply(lambda date_text: date_text.split('-')[1])

In [7]:
arrivals_df['month'].value_counts()

month
08    82463
07    81936
06    65152
09     2609
Name: count, dtype: int64

In [8]:
departures_df['month'].value_counts()

month
08    82714
07    82090
06    65691
09     2641
Name: count, dtype: int64

In [9]:
# Keep only the arrivals whose flight_date month is '07' (July); departures_df is not filtered here.
arrivals_df = arrivals_df[arrivals_df['month'].isin(['07'])]

In [10]:
arrivals_df['month'].value_counts()

month
07    81936
Name: count, dtype: int64

### Min & max layover time in seconds

In [11]:
min_layover_time

datetime.timedelta(seconds=900)

In [12]:
max_layover_time

datetime.timedelta(seconds=29700)

### Computing layovers

Layovers are defined as the time difference between an arrival and any subsequent departure within a specified window, determined by the minimum and maximum layover times. They are computed for the arrivals at the considered airport that remain after the month filter above (July 2024), matched against the departures loaded for the 2024/06/01–2024/09/01 range.

In [None]:
# Calculate layover times for the date range
# Calculate layover times (in minutes) for every arrival/departure pair that falls inside the layover window
layovers = calculate_layover(arrivals_df, departures_df, min_layover_time, max_layover_time)

# Print the raw list of layover times (every value, not summary statistics)
print(f"Layovers for {airport_code} from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}: {layovers}")

0
2024-07-01 05:35:00+00:00
1000
2024-07-01 15:35:00+00:00
2000
2024-07-02 06:50:00+00:00
3000
2024-07-02 09:45:00+00:00
4000
2024-07-02 18:00:00+00:00
5000
2024-07-03 10:40:00+00:00
6000
2024-07-03 18:45:00+00:00
7000
2024-07-04 11:15:00+00:00
8000
2024-07-04 13:30:00+00:00
9000
2024-07-04 21:14:00+00:00
10000
2024-07-05 07:20:00+00:00
11000
2024-07-05 16:05:00+00:00
12000
2024-07-06 07:05:00+00:00
13000
2024-07-06 09:55:00+00:00
14000
2024-07-06 19:45:00+00:00
15000
2024-07-07 13:50:00+00:00
16000
2024-07-07 15:20:00+00:00
17000
2024-07-08 06:40:00+00:00
18000
2024-07-08 09:20:00+00:00
19000
2024-07-08 17:40:00+00:00
20000
2024-07-09 09:50:00+00:00
21000
2024-07-09 12:15:00+00:00
22000
2024-07-09 19:50:00+00:00
23000
2024-07-10 13:50:00+00:00
24000
2024-07-10 14:45:00+00:00
25000
2024-07-11 05:25:00+00:00
26000
2024-07-11 08:20:00+00:00
27000
2024-07-11 16:45:00+00:00
28000
2024-07-12 08:20:00+00:00
29000
2024-07-12 11:00:00+00:00
30000
2024-07-12 18:55:00+00:00
31000
2024-07-13 11:3

In [1]:
# Save the list to a binary file
# Save the list to a binary file.
# total_seconds() is used instead of .seconds because .seconds leaves out the
# days component of a timedelta, which would mislabel windows of a day or more.
min_layover_seconds = int(min_layover_time.total_seconds())
max_layover_seconds = int(max_layover_time.total_seconds())
with open(f'layovers {airport_code} {min_layover_seconds} {max_layover_seconds}.pkl', 'wb') as file:
    pickle.dump(layovers, file)

NameError: name 'airport_code' is not defined

In [None]:
def round_to_nearest_quarter(num):
    """Round ``num`` to the nearest multiple of 15 using ``np.round``."""
    quarter = 15
    quarter_count = np.round(num / quarter)
    return quarter_count * quarter
rounded_layovers = [round_to_nearest_quarter(num) for num in layovers]

### Visualizing layovers distribution

In [None]:
# Counting the occurrences of each layover time
from collections import Counter

# Counting the occurrences of each layover time in a single pass
layover_counts = dict(Counter(rounded_layovers))
# Drop the 15- and 495-minute bins. NOTE(review): these equal the min/max
# layover bounds shown above (900 s and 29700 s); presumably the edge bins are
# removed because rounding leaves them only partly filled — confirm.
# pop() with a default is used so a missing bin does not raise KeyError.
layover_counts.pop(15, None)
layover_counts.pop(495, None)

# # Calculating the median layover time
# layover_times = list(layover_counts.keys())
# layover_frequencies = list(layover_counts.values())
# median_layover = np.median(np.repeat(layover_times, layover_frequencies))

# Creating the bar plot
plt.figure(figsize=(8,6))
plt.bar(list(layover_counts.keys()), list(layover_counts.values()), width=10, zorder=2)
plt.xlabel('Layover Time (minutes)')
plt.ylabel('Number of Flights')
plt.title(f'Distribution of Layover Times for {airport_code} (data from {start_date.strftime("%Y-%m-%d")} to {end_date.strftime("%Y-%m-%d")})')
# Setting the x-ticks for every bar
plt.xticks(ticks=list(layover_counts.keys()), rotation=45)

# # Adding the median as a red vertical dashed line
# plt.axvline(median_layover, color='red', linestyle='--', label=f'Median: {median_layover:.0f} minutes', zorder=3)

# # Displaying the value of the median on the plot
# plt.text(median_layover + 5, max(layover_counts.values()) * 0.9, f'{median_layover:.0f}', color='red')

# Adding a grid in the background
plt.grid(True, zorder=1)

# Saving the figure BEFORE showing it: once plt.show() returns, the figure may
# already be closed, and savefig would then write an empty image.
file_path = f'Distribution of Layover Times for {airport_code} (data from {start_date.strftime("%Y-%m-%d")} to {end_date.strftime("%Y-%m-%d")}).png'
plt.savefig(file_path)

# Displaying the plot
plt.show()

In [None]:
# Normalizing the counts based on the total count (values are percentages of the total)
total_count = sum(layover_counts.values())
normalized_counts = {layover: count / total_count * 100 for layover, count in layover_counts.items()}

# # Calculating the median layover time
# layover_times = list(layover_counts.keys())
# layover_frequencies = list(layover_counts.values())
# median_layover = np.median(np.repeat(layover_times, layover_frequencies))

# Creating the bar plot for normalized counts
plt.figure(figsize=(8,6))
plt.bar(list(normalized_counts.keys()), list(normalized_counts.values()), width=10, zorder=2)
plt.xlabel('Layover Time (minutes)')
plt.ylabel('Proportion of Total Flights (%)')
plt.title(f'Proportion of Layover Times for {airport_code} airport (data from {start_date.strftime("%Y-%m-%d")} to {end_date.strftime("%Y-%m-%d")})')
# Setting the x-ticks for every bar and inclining them
plt.xticks(ticks=list(normalized_counts.keys()), rotation=45)

# Adding a grid in the background
plt.grid(True, zorder=1)

# # Adding the median as a red vertical dashed line
# plt.axvline(median_layover, color='red', linestyle='--', label=f'Median: {median_layover:.0f} minutes', zorder=3)

# # Displaying the value of the median on the plot
# plt.text(median_layover + 5, max(normalized_counts.values()) * 0.9, f'{median_layover:.0f}', color='red')

# Saving the figure BEFORE showing it: once plt.show() returns, the figure may
# already be closed, and savefig would then write an empty image.
file_path = f'Proportion of Layover Times for {airport_code} airport (data from {start_date.strftime("%Y-%m-%d")} to {end_date.strftime("%Y-%m-%d")}).png'
plt.savefig(file_path)

# Displaying the plot
plt.show()