# Notebook to experiment with different features

In [1]:
import pandas as pd
# Read the three processed query-result exports and stack them row-wise into one
# frame. concat is called without ignore_index, so each file's own row labels are
# carried over into df unchanged.
df_1, df_2, df_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/4.processed_data/Query4_results_test_processed.csv',
        '../data/4.processed_data/Query1503_results_conv_processed.csv',
        '../data/4.processed_data/Query1603_results_test_processed.csv',
    )
)
df = pd.concat([df_1, df_2, df_3])

## Baseline Target: "mean_savings_for_JourneyID_in_Detected_Country"

### With no changes

In [2]:
# Baseline target: mean of rel_diff_to_min_price_FlightID within each
# (Journey_ID, Detected_Country) group.
mean_savings_journeyID = (
    df.groupby(['Journey_ID', 'Detected_Country'])['rel_diff_to_min_price_FlightID']
    .mean()
    .reset_index(name='mean_savings_for_JourneyID_in_Detected_Country')
)
# Broadcast the group value back onto every row of df. Any copy of the column left
# by an earlier run of this cell is dropped first: without that, re-running the
# cell makes merge emit `_x`/`_y` suffixed duplicates and the plain column name
# disappears. On a first run the drop is a no-op (errors='ignore').
df = df.drop(
    columns=['mean_savings_for_JourneyID_in_Detected_Country'], errors='ignore'
).merge(mean_savings_journeyID, on=['Journey_ID', 'Detected_Country'], how='left')

In [3]:
# Summary statistics of the baseline target.
df['mean_savings_for_JourneyID_in_Detected_Country'].describe()

count    5806.000000
mean        3.090059
std         7.462736
min         0.000000
25%         0.655104
50%         2.108359
75%         3.941099
max       169.333250
Name: mean_savings_for_JourneyID_in_Detected_Country, dtype: float64

### Trimmed Mean

In [4]:
import pandas as pd
from scipy.stats import trim_mean

# Trimmed mean of rel_diff_to_min_price_FlightID per (Journey_ID, Detected_Country).
# trim_mean rounds the number of values cut from each end down, so with
# proportiontocut=0.1 a group of fewer than 10 rows is not trimmed at all and its
# result equals the plain mean.
trimmed_average_savings_journeyID = (
    df.groupby(['Journey_ID', 'Detected_Country'])['rel_diff_to_min_price_FlightID']
    .apply(lambda x: trim_mean(x, proportiontocut=0.1))
    .reset_index(name='trimmed_mean_savings_for_JourneyID_in_Detected_Country')
)

# Broadcast the group value back onto every row of df. Any copy of the column left
# by an earlier run of this cell is dropped first so a re-run does not produce
# `_x`/`_y` suffixed duplicates; on a first run the drop is a no-op.
df = df.drop(
    columns=['trimmed_mean_savings_for_JourneyID_in_Detected_Country'], errors='ignore'
).merge(trimmed_average_savings_journeyID, on=['Journey_ID', 'Detected_Country'], how='left')


In [5]:
# Summary statistics of the trimmed-mean variant of the baseline target.
df['trimmed_mean_savings_for_JourneyID_in_Detected_Country'].describe()

count    5806.000000
mean        3.080453
std         7.462026
min         0.000000
25%         0.655104
50%         2.108359
75%         3.940695
max       169.333250
Name: trimmed_mean_savings_for_JourneyID_in_Detected_Country, dtype: float64

### Log of mean savings

In [6]:
import numpy as np

# log(1 + x) of the per-JourneyID mean savings. The +1 shift maps a saving of 0
# to log(1) = 0 instead of -inf.
df['log_mean_savings_for_JourneyID_in_Detected_Country'] = np.log(
    df['mean_savings_for_JourneyID_in_Detected_Country'].add(1)
)


In [7]:
# Summary statistics of the log-transformed baseline target.
df['log_mean_savings_for_JourneyID_in_Detected_Country'].describe()

count    5806.000000
mean        1.061073
std         0.740117
min         0.000000
25%         0.503864
50%         1.134095
75%         1.597588
max         5.137757
Name: log_mean_savings_for_JourneyID_in_Detected_Country, dtype: float64

# Target 2: "mean_savings_for_Journey_route_in_Detected_Country"

In [8]:
# Target 2: mean of rel_diff_to_min_price_FlightID within each
# (Journey_route, Detected_Country) group.
mean_savings_journey_route = (
    df.groupby(['Journey_route', 'Detected_Country'])['rel_diff_to_min_price_FlightID']
    .mean()
    .reset_index(name='mean_savings_for_Journey_route_in_Detected_Country')
)
# Broadcast the group value back onto every row of df. Any copy of the column left
# by an earlier run of this cell is dropped first so a re-run does not produce
# `_x`/`_y` suffixed duplicates; on a first run the drop is a no-op.
df = df.drop(
    columns=['mean_savings_for_Journey_route_in_Detected_Country'], errors='ignore'
).merge(mean_savings_journey_route, on=['Journey_route', 'Detected_Country'], how='left')

### With no changes

In [9]:
# Summary statistics of the route-level mean target.
df['mean_savings_for_Journey_route_in_Detected_Country'].describe()

count    5806.000000
mean        3.090059
std         4.027117
min         0.000000
25%         1.499970
50%         2.469589
75%         3.941347
max        95.490787
Name: mean_savings_for_Journey_route_in_Detected_Country, dtype: float64

### Trimmed Mean

In [10]:
import pandas as pd
from scipy.stats import trim_mean

# Trimmed mean of rel_diff_to_min_price_FlightID per (Journey_route, Detected_Country).
# trim_mean rounds the number of values cut from each end down, so with
# proportiontocut=0.1 a group of fewer than 10 rows is not trimmed at all and its
# result equals the plain mean.
trimmed_average_savings_journey_route = (
    df.groupby(['Journey_route', 'Detected_Country'])['rel_diff_to_min_price_FlightID']
    .apply(lambda x: trim_mean(x, proportiontocut=0.1))
    .reset_index(name='trimmed_mean_savings_for_Journey_route_in_Detected_Country')
)

# Broadcast the group value back onto every row of df. Any copy of the column left
# by an earlier run of this cell is dropped first so a re-run does not produce
# `_x`/`_y` suffixed duplicates; on a first run the drop is a no-op.
df = df.drop(
    columns=['trimmed_mean_savings_for_Journey_route_in_Detected_Country'], errors='ignore'
).merge(trimmed_average_savings_journey_route, on=['Journey_route', 'Detected_Country'], how='left')

### Log

In [11]:
import numpy as np

# log(1 + x) transform for the route-level target. The +1 shift maps a saving of 0
# to log(1) = 0 instead of -inf.
# NOTE(review): the input here is the *trimmed* route mean, while the output column
# is named log_mean_savings_... and the JourneyID counterpart logs the untrimmed
# mean — confirm which input is intended.
df['log_mean_savings_for_Journey_route_in_Detected_Country'] = np.log(
    df['trimmed_mean_savings_for_Journey_route_in_Detected_Country'].add(1)
)

In [12]:
# Summary statistics of the log-transformed route-level target.
df['log_mean_savings_for_Journey_route_in_Detected_Country'].describe()

count    5806.000000
mean        1.127865
std         0.502468
min         0.000000
25%         0.823209
50%         1.106425
75%         1.566289
max         4.569448
Name: log_mean_savings_for_Journey_route_in_Detected_Country, dtype: float64

# Target 3: "median_savings_for_Journey_route_country"

In [13]:
# Median of rel_diff_to_min_price_FlightID within each
# (Journey_route, Detected_Country) group, stored as
# 'median_savings_for_Journey_route_country'.
median_savings_Journey_route_train = (
    df.groupby(['Journey_route', 'Detected_Country'])['rel_diff_to_min_price_FlightID']
    .median()
    .reset_index(name='median_savings_for_Journey_route_country')
)

# Broadcast the group value back onto every row of df. Any copy of the column left
# by an earlier run of this cell is dropped first so a re-run does not produce
# `_x`/`_y` suffixed duplicates; on a first run the drop is a no-op.
df = df.drop(
    columns=['median_savings_for_Journey_route_country'], errors='ignore'
).merge(median_savings_Journey_route_train, on=['Journey_route', 'Detected_Country'], how='left')


## Target 4: "normalized_mean_savings"

In [14]:
# List the columns currently on df, including the target columns merged in above.
df.columns

Index(['airline_code', 'departure_airport_code', 'destination_airport_code',
       'ticket_price', 'departure_date', 'arrival_date', 'First_flight',
       'first_flight_code', 'last_flight_code', 'Detected_Language',
       'Detected_Country', 'Detected_Currency', 'Flight_ID', 'Price_in_USD',
       'commute_time', 'query_date', 'days_until_departure',
       'FlightID_in_Countries_Count', 'departure_date_day', 'arrival_date_day',
       'Journey_route', 'Journey_ID', 'max_price_FlightID',
       'min_price_FlightID', 'max_price_diff_FlightID',
       'max_rel_price_diff_FlightID', 'abs_diff_to_min_price_FlightID',
       'rel_diff_to_min_price_FlightID', 'rel_price_score_FlightID',
       'max_price_JourneyID', 'min_price_JourneyID', 'max_abs_diff_JourneyID',
       'max_rel_diff_Journey', 'abs_diff_to_min_price_JourneyID',
       'rel_diff_to_min_price_JourneyID', 'rel_price_score_JourneyID',
       'max_journey_same_country', 'min_journey_same_country',
       'max_abs_diff_perIDG

In [15]:
# Row-wise average of two existing targets: the per-(Journey_ID, country) mean and
# the per-(Journey_route, country) median. DataFrame.mean skips NaN by default, so
# a row missing one of the two inputs takes the value of the other.
# NOTE(review): despite the column name, no scaling/normalisation is applied here.
df['normalized_mean_savings'] = df[
    [
        'mean_savings_for_JourneyID_in_Detected_Country',
        'median_savings_for_Journey_route_country',
    ]
].mean(axis='columns')

In [16]:
# Summary statistics of the combined target.
df['normalized_mean_savings'].describe()

count    5806.000000
mean        2.630391
std         4.413457
min         0.000000
25%         0.751915
50%         1.981605
75%         3.716443
max       107.490743
Name: normalized_mean_savings, dtype: float64