In [None]:
import os
import pandas as pd
import numpy as np

os.chdir("..")
from utils import featuresfunctions

****
# EDDM
****

In [7]:
file_path = "C:\\Users\\kruu\\store\\"
data_EDDM = pd.read_parquet(os.path.join(file_path + "\\data_EDDM\\landing_df_EDDM.parquet"))

In [None]:
#Typecode selection: only the ones that land more than 400 times a year

typecode_list = data_EDDM.typecode.value_counts()[data_EDDM.typecode.value_counts() > 400].index.tolist()
data_EDDM_reduced = data_EDDM.query(f"typecode in {typecode_list}")

print(f"typecode proportion: {len(typecode_list) / data_EDDM.typecode.nunique()}")
print(f"Flight proportion: {len(data_EDDM_reduced) / len(data_EDDM)} ")

In [None]:
#Number of flight that are in flight simultaneously

start_matrix = data_EDDM_reduced['start'].values[:, np.newaxis]
stop_matrix = data_EDDM_reduced['stop'].values[:, np.newaxis]

# Find overlaps: (start1 <= stop2) & (stop1 >= start2)
overlap_matrix = (start_matrix <= stop_matrix.T) & (stop_matrix >= start_matrix.T)

# Count the number of overlaps for each row
data_EDDM_reduced['nb_aircraft'] = overlap_matrix.sum(axis=1)

In [None]:
# aircraft bodytype identification
data_EDDM_reduced[["body_type"]] =  data_EDDM_reduced.typecode.apply(lambda x: pd.Series(featuresfunctions.body_type(x)))

In [None]:
# Seasonal features creation

data_EDDM_reduced["month"] = pd.DatetimeIndex(data_EDDM_reduced.start).month.astype('category')
data_EDDM_reduced["hour"] = pd.DatetimeIndex(data_EDDM_reduced.start).hour.astype('category')
data_EDDM_reduced["day"] = pd.DatetimeIndex(data_EDDM_reduced.start).day.astype('category')
data_EDDM_reduced["weekday"] = pd.DatetimeIndex(data_EDDM_reduced.start).weekday.astype('category')
    
data_EDDM_reduced['season'] = data_EDDM_reduced['start'].apply(featuresfunctions.get_season)

data_EDDM_reduced['rush_hour'] = data_EDDM_reduced["start"].apply(featuresfunctions.is_rush_hour_EDDM)

In [None]:
# Calculation of the target variable: ratio between the flown distance and the nominal distance of the STAR

star_len_eddm = {
    "NAPS1B": 61.9,
    "LAND1B": 74.8,
    "ROKI1A": 52.3,
    "BETO1A": 61.3,
}

data_EDDM_reduced["nominal_distance"] = data_EDDM_reduced.star.apply(lambda x: star_len_eddm[x])
data_EDDM_reduced["nominal_distance_prop"] = data_EDDM_reduced["distance"] / data_EDDM_reduced["nominal_distance"] 


In [None]:
# Meteo Data
import tqdm

file_path_save = "C:\\Users\\kruu\\store\\data_EDDM"

batch_size = 1000  # batch size depends on METAR server capacity
num_batches = len(data_EDDM_reduced) // batch_size + 1

for i in tqdm.tqdm(range(num_batches)):
    if os.path.exists(os.path.join(file_path_save + f"landing_df_EDDM_with_meteo_{i}_of_{num_batches-1}.parquet")):
        print("file already exists")
    else:
        batch = data_EDDM_reduced.iloc[i * batch_size:(i + 1) * batch_size]
        batch[["avg_wind_dir", "avg_wind_speed", "avg_vis", "avg_temp", "avg_press"]] = batch.apply(
            lambda row: pd.Series(featuresfunctions.get_meteo_data(row, "EDDM")), axis=1
        )
        batch.to_parquet(os.path.join(file_path_save + f"landing_df_EDDM_with_meteo_{i}_of_{num_batches-1}.parquet"))
        


****
# LIRF
****

In [17]:
file_path = "C:\\Users\\kruu\\store\\"
data_LIRF = pd.read_parquet(os.path.join(file_path + "\\data_LIRF\\landing_df_LIRF.parquet"))

In [None]:
#Typecode selection: only the ones that land more than 400 times a year

typecode_list = data_LIRF.typecode.value_counts()[data_LIRF.typecode.value_counts() > 400].index.tolist()
data_LIRF_reduced = data_LIRF.query(f"typecode in {typecode_list}")

print(f"typecode proportion: {len(typecode_list) / data_LIRF.typecode.nunique()}")
print(f"Flight proportion: {len(data_LIRF_reduced) / len(data_LIRF)} ")

In [None]:
#Number of flight that are in flight simultaneously

start_matrix = data_LIRF_reduced['start'].values[:, np.newaxis]
stop_matrix = data_LIRF_reduced['stop'].values[:, np.newaxis]

# Find overlaps: (start1 <= stop2) & (stop1 >= start2)
overlap_matrix = (start_matrix <= stop_matrix.T) & (stop_matrix >= start_matrix.T)

# Count the number of overlaps for each row
data_LIRF_reduced['nb_aircraft'] = overlap_matrix.sum(axis=1)

In [None]:
# aircraft bodytype identification
data_LIRF_reduced[["body_type"]] =  data_LIRF_reduced.typecode.apply(lambda x: pd.Series(featuresfunctions.body_type(x)))

In [None]:
# Seasonal features creation

data_LIRF_reduced["month"] = pd.DatetimeIndex(data_LIRF_reduced.start).month.astype('category')
data_LIRF_reduced["hour"] = pd.DatetimeIndex(data_LIRF_reduced.start).hour.astype('category')
data_LIRF_reduced["day"] = pd.DatetimeIndex(data_LIRF_reduced.start).day.astype('category')
data_LIRF_reduced["weekday"] = pd.DatetimeIndex(data_LIRF_reduced.start).weekday.astype('category')
    
data_LIRF_reduced['season'] = data_LIRF_reduced['start'].apply(featuresfunctions.get_season)

data_LIRF_reduced['rush_hour'] = data_LIRF_reduced["start"].apply(featuresfunctions.is_rush_hour_LIRF)

In [None]:
# Calculation of the target variable: ratio between the flown distance and the nominal distance of the STAR

star_len_lirf = {
    "ELKA2A": 132.7,
    "VALM2C": 92.7,
    "RITE2A": 94.8,
    "LAT2C": 74.3,
}

data_LIRF_reduced["nominal_distance"] = data_LIRF_reduced.star.apply(lambda x: star_len_lirf[x])
data_LIRF_reduced["nominal_distance_prop"] = data_LIRF_reduced["distance"] / data_LIRF_reduced["nominal_distance"] 

In [None]:
# Meteo Data
import tqdm

# file_path_save = "C:\\Users\\kruu\\store\\data_LIRF\\"
file_path_save = "C:\\Users\\kruu\\store\\test_LIRF\\"


batch_size = 100  # batch size depends on METAR server capacity
num_batches = len(data_LIRF_reduced) // batch_size + 1

for i in range(num_batches):
    if os.path.exists(os.path.join(file_path_save + f"landing_df_LIRF_with_meteo_{i}_of_{num_batches-1}.parquet")):
        print("file already exists")
    else:
        batch = data_LIRF_reduced.iloc[i * batch_size:(i + 1) * batch_size]
        batch[["avg_wind_dir", "avg_wind_speed", "avg_vis", "avg_temp", "avg_press"]] = batch.apply(
            lambda row: pd.Series(featuresfunctions.get_meteo_data(row, "LIRF")), axis=1
        )
        batch.to_parquet(os.path.join(file_path_save + f"landing_df_LIRF_with_meteo_{i}_of_{num_batches-1}.parquet"))


****
# LSGG
****


In [34]:
file_path = "C:\\Users\\kruu\\store\\"
data_LSGG = pd.read_parquet(os.path.join(file_path + "\\data_LSGG\\landing_df_LSGG.parquet"))

In [35]:
#Typecode selection: only the ones that land more than 400 times a year

typecode_list = data_LSGG.typecode.value_counts()[data_LSGG.typecode.value_counts() > 400].index.tolist()
data_LSGG_reduced = data_LSGG.query(f"typecode in {typecode_list}")

print(f"typecode proportion: {len(typecode_list) / data_LSGG.typecode.nunique()}")
print(f"Flight proportion: {len(data_LSGG_reduced) / len(data_LSGG)} ")

typecode proportion: 0.058823529411764705
Flight proportion: 0.6835600236546422 


In [36]:
#Number of flight that are in flight simultaneously

start_matrix = data_LSGG_reduced['start'].values[:, np.newaxis]
stop_matrix = data_LSGG_reduced['stop'].values[:, np.newaxis]

# Find overlaps: (start1 <= stop2) & (stop1 >= start2)
overlap_matrix = (start_matrix <= stop_matrix.T) & (stop_matrix >= start_matrix.T)

# Count the number of overlaps for each row
data_LSGG_reduced['nb_aircraft'] = overlap_matrix.sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_LSGG_reduced['nb_aircraft'] = overlap_matrix.sum(axis=1)


In [37]:
# aircraft bodytype identification
data_LSGG_reduced[["body_type"]] =  data_LSGG_reduced.typecode.apply(lambda x: pd.Series(featuresfunctions.body_type(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_LSGG_reduced[["body_type"]] =  data_LSGG_reduced.typecode.apply(lambda x: pd.Series(featuresfunctions.body_type(x)))


In [38]:
# Seasonal features creation

data_LSGG_reduced["month"] = pd.DatetimeIndex(data_LSGG_reduced.start).month.astype('category')
data_LSGG_reduced["hour"] = pd.DatetimeIndex(data_LSGG_reduced.start).hour.astype('category')
data_LSGG_reduced["day"] = pd.DatetimeIndex(data_LSGG_reduced.start).day.astype('category')
data_LSGG_reduced["weekday"] = pd.DatetimeIndex(data_LSGG_reduced.start).weekday.astype('category')
    
data_LSGG_reduced['season'] = data_LSGG_reduced['start'].apply(featuresfunctions.get_season)

data_LSGG_reduced['rush_hour'] = data_LSGG_reduced["start"].apply(featuresfunctions.is_rush_hour_LSGG)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_LSGG_reduced["month"] = pd.DatetimeIndex(data_LSGG_reduced.start).month.astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_LSGG_reduced["hour"] = pd.DatetimeIndex(data_LSGG_reduced.start).hour.astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_LSGG_reduc

In [39]:
# Calculation of the target variable: ratio between the flown distance and the nominal distance of the STAR

star_len_lsgg = {
    'BELU3N': 92.8,
    'KINE2N': 103.8,
    'AKIT3R': 110.3,
    'LUSA2N': 87.9
}

data_LSGG_reduced["nominal_distance"] = data_LSGG_reduced.star.apply(lambda x: star_len_lsgg[x])
data_LSGG_reduced["nominal_distance_prop"] = data_LSGG_reduced["distance"] / data_LSGG_reduced["nominal_distance"] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_LSGG_reduced["nominal_distance"] = data_LSGG_reduced.star.apply(lambda x: star_len_lsgg[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_LSGG_reduced["nominal_distance_prop"] = data_LSGG_reduced["distance"] / data_LSGG_reduced["nominal_distance"]


In [43]:
# Meteo Data
import tqdm

file_path_save = "C:\\Users\\kruu\\store\\data_LSGG\\"

batch_size = 1000  # batch size depends on METAR server capacity
num_batches = len(data_LSGG_reduced) // batch_size + 1

for i in tqdm.tqdm(range(num_batches)):
    if os.path.exists(os.path.join(file_path_save + f"landing_df_LSGG_with_meteo_{i}_of_{num_batches-1}.parquet")):
        print("file already exists")
    else:
        batch = data_LSGG_reduced.iloc[i * batch_size:(i + 1) * batch_size]
        batch[["avg_wind_dir", "avg_wind_speed", "avg_vis", "avg_temp", "avg_press"]] = batch.apply(
            lambda row: pd.Series(featuresfunctions.get_meteo_data(row, "LSGG")), axis=1
        )
        batch.to_parquet(os.path.join(file_path_save + f"landing_df_LSGG_with_meteo_{i}_of_{num_batches-1}.parquet"))

  0%|          | 0/12 [00:00<?, ?it/s]

file already exists
file already exists
file already exists
file already exists
file already exists
file already exists
file already exists
file already exists
file already exists


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch[["avg_wind_dir", "avg_wind_speed", "avg_vis", "avg_temp", "avg_press"]] = batch.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch[["avg_wind_dir", "avg_wind_speed", "avg_vis", "avg_temp", "avg_press"]] = batch.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch[["avg_wind_di