In [55]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold, SelectFdr, f_regression

# For calculating feature wise distribution similarity
from scipy.spatial.distance import jensenshannon as js_dist

# Suppressing warnings
import warnings
warnings.filterwarnings('ignore')

# Global Parameters

In [86]:
TARGET = 'LoadFactor'

# For loading data
REALIZED_DATA_PATH = 'dataset_full.xls'
FUTURE_DATA_PATH = 'future_data.xls'

# For saving data
REALIZED_PATH = 'data/realized_preprocessed_data.csv'
FUTURE_PATH = 'data/future_preprocessed_data.csv'

# Data Preprocessing of Case 1
### By August Semrau and William Marstrand
This notebook consists of data preprocessing, analysis and modelling is found in seperate notebooks

## Data

In [57]:
df_realized = pd.read_excel(REALIZED_DATA_PATH)
df_realized

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,0.408451
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,0.189189
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,0.570423
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,0.333333
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,0.204301
...,...,...,...,...,...,...,...,...,...
39444,2022-02-28 18:45:00,VW,986,YYZ,319,J,CA,144,0.847222
39445,2022-02-28 19:25:00,LJ,667,YDQ,319,J,CA,156,0.871795
39446,2022-02-28 20:00:00,VW,3406,MYV,E90,J,US,98,0.857143
39447,2022-02-28 19:30:00,LJ,663,RUT,32N,J,US,186,0.682796


In [58]:
df_future = pd.read_excel(FUTURE_DATA_PATH)
df_future

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity
0,2022-03-01 05:45:00,UK,1315,YXX,319,J,CA,131
1,2022-03-01 07:15:00,PW,950,YLW,319,J,CA,143
2,2022-03-01 06:45:00,VW,770,YYZ,320,J,CA,220
3,2022-03-01 06:40:00,DO,2568,YXU,32N,J,CA,180
4,2022-03-01 07:30:00,AY,984,YYG,320,J,CA,174
...,...,...,...,...,...,...,...,...
4808,2022-03-31 21:05:00,SV,1988,YYJ,73H,J,CA,186
4809,2022-03-31 20:55:00,MD,1242,YBR,321,J,CA,230
4810,2022-03-31 21:50:00,LJ,506,YUL,320,J,CA,186
4811,2022-03-31 20:30:00,LJ,772,YEG,320,J,CA,186


First we combine the two datasets to make sure they are preprocessed to have the same features

In [59]:
data = pd.concat([df_realized, df_future])
data

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,0.408451
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,0.189189
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,0.570423
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,0.333333
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,0.204301
...,...,...,...,...,...,...,...,...,...
4808,2022-03-31 21:05:00,SV,1988,YYJ,73H,J,CA,186,
4809,2022-03-31 20:55:00,MD,1242,YBR,321,J,CA,230,
4810,2022-03-31 21:50:00,LJ,506,YUL,320,J,CA,186,
4811,2022-03-31 20:30:00,LJ,772,YEG,320,J,CA,186,


We convert the flight number to an object type, since it is really a categorical variable, but Pandas thinks it's numerical

In [60]:
## Convert FlightNumber to object, it is not a numerical value
data.FlightNumber = data.FlightNumber.astype(object)
## Investigate data for dtypes and stuff
print('\ndtypes of the datasets columns:')
data.dtypes


dtypes of the datasets columns:


ScheduleTime    datetime64[ns]
Airline                 object
FlightNumber            object
Destination             object
AircraftType            object
FlightType              object
Sector                  object
SeatCapacity             int64
LoadFactor             float64
dtype: object

In [61]:
INPUT_FEATURES = data.columns.drop(TARGET)
INPUT_FEATURES

Index(['ScheduleTime', 'Airline', 'FlightNumber', 'Destination',
       'AircraftType', 'FlightType', 'Sector', 'SeatCapacity'],
      dtype='object')

## Data Cleaning
We clean the data based on the input variables. **We do not look at the target variable LoadFactor**.
<br>
We want to first make sure that the data itself is sane before we go any further and start extracting new features from the data.
<br>
The checks we do are:
* Checking for and cleaning null values
* Checking for illegal values e.g. zero or negative seat capacity or impossible dates in ScheduleTime
* Checking for and possibly removing outliers

### Null Values

In [62]:
def remove_empty(df):
    nan_values_count = df.isnull().sum()
    empty_values_count = df.apply(lambda df: df == '', axis=1).sum()
    print(f'List of null/empty data counts for each column: \n{nan_values_count+empty_values_count}')

    ## As there are so few missing values, a total of five rows, we simply remove these
    df_no_nan = df.dropna()[df.apply(lambda df: df != '')]
    # rea_df = rea_df.loc[rea_df['FlightType'].isin(['C', 'J'])]
    print(f'\nData had {df.shape[0]} rows pre-nan-removal, now has {df_no_nan.shape[0]} rows.')

    return df_no_nan
    

In [63]:
no_nan_data = data.copy()
no_nan_data[INPUT_FEATURES] = remove_empty(data[INPUT_FEATURES])

List of null/empty data counts for each column: 
ScheduleTime    0
Airline         0
FlightNumber    0
Destination     0
AircraftType    0
FlightType      0
Sector          0
SeatCapacity    0
dtype: int64

Data had 44262 rows pre-nan-removal, now has 44262 rows.


### Illegal or Extremely Unlikly Values

In [64]:
def is_before_2021_or_in_the_future(df):
    min_st = pd.Timestamp(2021, 1, 1)
    max_st = pd.Timestamp(2022, 4, 1)
    return df['ScheduleTime'].apply(lambda st: st < min_st or st >= max_st)

In [65]:
# Check if any ScheduleTime in Realized dataset is before 2021 or into the future
data[is_before_2021_or_in_the_future(data)]

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor


In [66]:
def is_negative_or_zero(df):
    return df['SeatCapacity'] <= 0

In [67]:
# Check if Realized SeatCapacity is zero or negative
data[is_negative_or_zero(data)]

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor


In [68]:
# Check if LoadFactors are low or very high
print(f'Min LoadFactor: {data[TARGET].min()}')
print(f'Max LoadFactor: {data[TARGET].max()}')

Min LoadFactor: 0.0
Max LoadFactor: 1.47517730496453


So no negative load factors.
And we have some load factors which are greater than 1 indicating that the flight has more passengers than seats. While it might be tempting to remove these records on the basis of illegal data, we are keeping them due to the fact, that we understand the description of the LoadFactor variable to mean the share of occupied seats meaning that a load factor greater than 1 just indicates an overbooking for the flight. Overbooking is a generally used practice as described in this [article](https://www.aeroclass.org/why-are-airlines-allowed-to-overbook/) and as such we accept the data.
<br>
Also, som flights are empty or close to empty, but as explained in this [article](https://eu.usatoday.com/story/travel/columnist/cox/2019/08/14/ask-captain-why-fly-plane-just-one-passenger/2006241001/) this is also a perfectly valid scenario.

## Feature Engineering
Based on the different data types, we already know we want to do a couple of things to each column/feature, and these are described below:
<br><br>
For **ScheduleTime**, we want to do two things; Firstly, we want to convert the datetime64[ns] format to a more simple datetime format. We further want to create new object columns; one which encodes which year it is, one for the month of the year the flight is, which week, weekday, hour of day and minute of hour. These will also be encoded in the next step.
These new features extrated from the timestamp will be categorical features.
<br><br>
For **Airline**, **Flightnumber** (infact categorical), **Destination**, **Aircrafttype**, **Flighttype** and **Sector**, columns which are encoded as objects (they are categorical), we need to make an alternative encoding.
<br><br>
SeatCapacity is made categorical, though it differently from the rest, is in fact ordinal.

### Extracting Features from **ScheduleTime**
The `ScheduleTime` feature is a `datetime64` dtype. To be able to use it in a meaningful way for the modelling, we extract certain time features from it e.g. `Year`, `Month`, `Day`, and `WeekNumber`.

In [18]:
def get_season(month):
    if month >= 3 and month <= 5:
        return 'Spring'
    elif month >= 6 and month <= 8:
        return 'Summer'
    elif month >= 9 and month <= 11:
        return 'Fall'
    elif month == 12 or (month >= 1 and month <= 2):
        return 'Winter'
    else:
        raise Exception(f'Invalid month {month}')

def extract_scheduletime_features(df):
    df = df.copy()
    ### Preprosses time for getting year, month, week number, weekday, hour of day and minute of hour
    df['Month'] = df['ScheduleTime'].dt.month
    df['WeekNumber'] = df['ScheduleTime'].dt.week
    df['Weekday'] = df['ScheduleTime'].dt.dayofweek
    df['HourOfDay'] = df['ScheduleTime'].dt.hour
    df['MinuteOfHour'] = df['ScheduleTime'].dt.minute
    df['Season'] = df['ScheduleTime'].dt.month.apply(get_season)

    df.Month = df.Month.astype(object)
    df.WeekNumber = df.WeekNumber.astype(object)
    df.Weekday = df.Weekday.astype(object)
    df.HourOfDay = df.HourOfDay.astype(object)
    df.MinuteOfHour = df.MinuteOfHour.astype(object)
    df.Season = df.Season.astype(object)

    return df

In [70]:
data_schedule_time = extract_scheduletime_features(no_nan_data)
data_schedule_time = data_schedule_time.drop('ScheduleTime', axis=1)
data_schedule_time

Unnamed: 0,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour,Season
0,IA,874,DEN,73W,J,US,142,0.408451,1,53,4,6,35,Winter
1,JZ,818,YHM,AT7,J,CA,74,0.189189,1,53,4,10,35,Winter
2,IA,876,DEN,73W,J,US,142,0.570423,1,53,4,12,5,Winter
3,CN,514,EST,AT7,J,US,72,0.333333,1,53,4,13,20,Winter
4,LJ,3140,DEN,32A,J,US,186,0.204301,1,53,4,14,20,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4808,SV,1988,YYJ,73H,J,CA,186,,3,13,3,21,5,Spring
4809,MD,1242,YBR,321,J,CA,230,,3,13,3,20,55,Spring
4810,LJ,506,YUL,320,J,CA,186,,3,13,3,21,50,Spring
4811,LJ,772,YEG,320,J,CA,186,,3,13,3,20,30,Spring


### One-hot encoding nominal categorical features
The categorical features are not numerical types, but we need them to be for the model to handle them.
<br>
The time categorical variables are ordinal, and therefore we do not one-hot encode them, as the tree based models can use them as is.

In [71]:
### We need to find out how many categorical values exist for each feature
data_schedule_time.describe(include = ['object'])

Unnamed: 0,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour,Season
count,44262,44262,44262,44262,44262,44262,44262,44262,44262,44262,44262,44262
unique,111,864,230,59,4,12,12,53,7,24,13,4
top,DO,771,CKE,73H,J,CA,3,12,6,8,40,Fall
freq,11739,430,2946,10139,42922,21745,5604,1288,7191,3418,3803,13498


In [72]:
nominal_features = ['AircraftType', 'FlightType', 'Sector','Airline', 'FlightNumber', 'Destination', 'Season'] 
encoded_data = pd.get_dummies(data_schedule_time, columns=nominal_features, drop_first=False)
encoded_data

Unnamed: 0,SeatCapacity,LoadFactor,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour,AircraftType_221,AircraftType_223,AircraftType_295,...,Destination_YZM,Destination_YZV,Destination_ZBF,Destination_ZNG,Destination_ZSW,Destination_ZTM,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,142,0.408451,1,53,4,6,35,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,74,0.189189,1,53,4,10,35,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,142,0.570423,1,53,4,12,5,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,72,0.333333,1,53,4,13,20,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,186,0.204301,1,53,4,14,20,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4808,186,,3,13,3,21,5,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4809,230,,3,13,3,20,55,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4810,186,,3,13,3,21,50,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4811,186,,3,13,3,20,30,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Feature Selection

Now we remove any columns that hold constant and close to constant values in all rows i.e. has no information to contribute with for the prediction task

In [73]:
selector = VarianceThreshold(0.01)
var_selected_data = pd.DataFrame(selector.fit_transform(encoded_data), columns=selector.get_feature_names_out())
var_selected_data

Unnamed: 0,SeatCapacity,LoadFactor,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour,AircraftType_319,AircraftType_320,AircraftType_333,...,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Destination_ZSW,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,142,0.408451,1,53,4,6,35,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,74,0.189189,1,53,4,10,35,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,142,0.570423,1,53,4,12,5,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,72,0.333333,1,53,4,13,20,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,186,0.204301,1,53,4,14,20,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44257,186,,3,13,3,21,5,0,0,0,...,0,0,1,0,0,0,0,1,0,0
44258,230,,3,13,3,20,55,0,0,0,...,0,0,0,0,0,0,0,1,0,0
44259,186,,3,13,3,21,50,0,1,0,...,0,0,0,0,0,0,0,1,0,0
44260,186,,3,13,3,20,30,0,1,0,...,0,0,0,0,0,0,0,1,0,0


We then then perform feature selection based on univariate statistics to select the final subset of features for training.
<br>
The selection is done using the ` Benjamini-Hochberg procedure` with $\alpha = 0.05$ and the sklearn `f_regression` scoring function which produces F-statistic and p-values through a univariate linear regression test of the effect of the input variables on the target.

In [74]:
selector = SelectFdr(f_regression)
realized_data = var_selected_data.dropna(subset=TARGET, axis=0)
y = realized_data[TARGET].astype(float)
X = realized_data.drop(TARGET, axis=1)
selector.fit(X,y)
selected_features = selector.get_feature_names_out()
removed_features = var_selected_data.columns.drop(TARGET).drop(selected_features)
fdr_selected_data = var_selected_data[selected_features]
fdr_selected_data[TARGET] = var_selected_data[TARGET]
print(f'Removed features: {removed_features}')
fdr_selected_data

Removed features: Index(['MinuteOfHour', 'AircraftType_AT5', 'Sector_IS', 'Sector_US',
       'Destination_KEF', 'Destination_YCD', 'Destination_YHM',
       'Destination_YQR'],
      dtype='object')


Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Destination_ZSW,Season_Fall,Season_Spring,Season_Summer,Season_Winter,LoadFactor
0,142,1,53,4,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.408451
1,74,1,53,4,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.189189
2,142,1,53,4,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.570423
3,72,1,53,4,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.333333
4,186,1,53,4,14,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0.204301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44257,186,3,13,3,21,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,
44258,230,3,13,3,20,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,
44259,186,3,13,3,21,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,
44260,186,3,13,3,20,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,


In [75]:
selected_data = fdr_selected_data

## Distribution Similarity

First we split the data back up into realized and future datasets

In [76]:
selected_realized_data = selected_data.dropna(subset=TARGET)
selected_realized_data

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Destination_ZSW,Season_Fall,Season_Spring,Season_Summer,Season_Winter,LoadFactor
0,142,1,53,4,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.408451
1,74,1,53,4,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.189189
2,142,1,53,4,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.570423
3,72,1,53,4,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.333333
4,186,1,53,4,14,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0.204301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39444,144,2,9,0,18,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0.847222
39445,156,2,9,0,19,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.871795
39446,98,2,9,0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.857143
39447,186,2,9,0,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.682796


In [77]:
selected_future_data = selected_data[selected_data[TARGET].isna()].drop(TARGET, axis=1)
selected_future_data

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Destination_ZSW,Season_Fall,Season_Spring,Season_Summer,Season_Winter
39449,131,3,9,1,5,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
39450,143,3,9,1,7,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
39451,220,3,9,1,6,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
39452,180,3,9,1,6,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
39453,174,3,9,1,7,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44257,186,3,13,3,21,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
44258,230,3,13,3,20,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
44259,186,3,13,3,21,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
44260,186,3,13,3,20,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


It is important that features in the realized training dataset and features in the future prediction dataset have similar distributions.
<br>
Therefore we perform measurements of the featurewise distribution similarities between the features in the realized and future datasets.
<br> 
The similarity is computed using `scipy.spatial.distance.jensenshannon`. If the pairwise distribution similarity is **very low** (i.e. large JS distance) between a set of features we drop the feature-pair. This is done to avoid the model (over)fitting to the feature in the training data and then performing poorly on the future data, because it is different. Two distirbutions are deemed different if they have a JS distance larger than 0.7. This measurement was chosen emperically, but no feature pairs had a distance larger than 0.4, so they were all deemed similar enough to work with.

In [78]:
# Calculation of the Jensen Shannon Distance
def compute_probs(data, n=10): 
    h, e = np.histogram(data, n)
    p = h/data.shape[0]
    return e, p

def support_intersection(p, q): 
    sup_int = (
        list(
            filter(
                lambda x: (x[0]!=0) & (x[1]!=0), zip(p, q)
            )
        )
    )
    return sup_int

def get_probs(list_of_tuples): 
    p = np.array([p[0] for p in list_of_tuples])
    q = np.array([p[1] for p in list_of_tuples])
    return p, q

def compute_js_dist(f_rea, f_fut):
    f_rea_len = f_rea.nunique()
    e, p = compute_probs(f_rea, n=f_rea_len)
    _, q = compute_probs(f_fut, n=e)

    list_of_tuples = support_intersection(p, q)
    p, q = get_probs(list_of_tuples)
    
    return js_dist(p, q, base=2)


In [79]:
def find_and_plot_different_feature_dist(df_rea, df_fut, threshold=0.5):
    features = df_fut.columns
    diff_features = []

    # Find features with different distributions
    for f in tqdm(features):
        f_rea = df_rea[f]
        f_fut = df_fut[f]
        js_dist = compute_js_dist(f_rea, f_fut)
        if js_dist > threshold:
            diff_features.append(f)

    if len(diff_features) > 0:
        # Plot significantly different features
        fig, ax = plt.subplots(len(diff_features), 2, figsize=(15, int(5*len(diff_features))))

        for i, f in enumerate(tqdm(diff_features)):
            f_rea_len = df_rea[f].nunique()
            f_fut_len = df_fut[f].nunique()
            bins = f_rea_len if f_rea_len > f_fut_len else f_fut_len
            f_values = df_rea[f].unique().tolist() + df_fut[f].unique().tolist()
            brange = (min(f_values), max(f_values))
            if len(diff_features) > 1:
                ax[i,0].set_title(f'Realized {f}')
                sns.histplot(df_rea[f], ax=ax[i,0], bins=bins, binrange=brange)
                ax[i,1].set_title(f'Future {f}')
                sns.histplot(df_fut[f], ax=ax[i,1], bins=bins, binrange=brange)
            else:
                ax[0].set_title(f'Realized {f}')
                sns.histplot(df_rea[f], ax=ax[0], bins=bins, binrange=brange)
                ax[1].set_title(f'Future {f}')
                sns.histplot(df_fut[f], ax=ax[1], bins=bins, binrange=brange)

        plt.tight_layout()
        plt.show()
    else:
        print('No significantly diffrent feature distributions found')

In [84]:
find_and_plot_different_feature_dist(selected_realized_data, selected_future_data, threshold=0.5)

100%|██████████| 63/63 [00:00<00:00, 79.06it/s]

No significantly diffrent feature distributions found





## Saving the data
We now save the preprocessed realized and future datasets.
* `realized_preprocessed_data.csv`
* `future_preprocessed_data.csv`

In [85]:
def set_target_last(df, target):
    df = df.copy()
    tmp_target = df[target]
    df.drop(target, axis=1)
    df[target] = tmp_target
    return df

In [87]:
set_target_last(selected_realized_data, TARGET).to_csv(REALIZED_PATH, sep=',', decimal='.')
selected_future_data.to_csv(FUTURE_PATH, sep=',', decimal='.')