In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw customer-booking records; the file is decoded as latin-1.
# NOTE(review): the CSV carries a saved row index that pandas loads as an
# `Unnamed: 0` column; it is kept here so later cells see the same frame.
df = pd.read_csv(
    'data/customer_booking.csv',
    encoding='latin-1',
)

In [3]:
# Preview the loaded frame (rendered by the notebook as the cell output).
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,Internet,RoundTrip,27,6,9,Sat,PERPNH,Australia,1,0,1,5.62,0
49996,1,Internet,RoundTrip,111,6,4,Sun,PERPNH,Australia,0,0,0,5.62,0
49997,1,Internet,RoundTrip,24,6,22,Sat,PERPNH,Australia,0,0,1,5.62,0
49998,1,Internet,RoundTrip,15,6,11,Mon,PERPNH,Australia,1,0,1,5.62,0


# Train-test split

In [4]:
# Target column; used only to stratify the split so both parts keep the
# same share of completed bookings.
y = df['booking_complete']

# Hold out 10% of the rows for validation. The split halves of `y` are
# discarded because the target column is still present inside both frames.
df_train, df_val, _, _ = train_test_split(
    df,
    y,
    test_size=0.1,
    random_state=1,
    stratify=y,
)

# Binning routes and booking_origins

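We use 100 as the cut-off value: any route/origin that appears 100 times or fewer in the training set is binned into a single `other` category. Values that occur only in the validation set are also mapped to `other`.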

In [5]:
def feature_binning(feature, threshold=100):
    """Collapse rare categories of ``feature`` into a single ``'other'`` bin.

    Category frequencies are measured on the module-level ``df_train`` only,
    so the validation frame never influences which categories are kept.
    A new column ``f'{feature}_binned'`` is added in place to both the
    module-level ``df_train`` and ``df_val``.

    Parameters
    ----------
    feature : str
        Name of the categorical column to bin.
    threshold : int, default 100
        A category is kept only if it occurs strictly more than ``threshold``
        times in ``df_train``; every other value (including values never seen
        in training) becomes ``'other'``.

    Returns
    -------
    None
        The function works through its side effect on ``df_train``/``df_val``.
    """
    # Rows per category in the training split (non-null `num_passengers`).
    feature_counts = df_train.groupby(feature)['num_passengers'].count()

    # Resolve the categories to keep once, instead of doing an index lookup
    # for every row. The caller's threshold is honoured here; previously the
    # comparison was hard-coded to 100 and the argument was ignored.
    frequent = set(feature_counts.index[feature_counts > threshold])

    def bin_X(X):
        return X[feature].map(lambda x: x if x in frequent else 'other')

    df_train[f'{feature}_binned'] = bin_X(df_train)
    df_val[f'{feature}_binned'] = bin_X(df_val)


In [6]:
# Add `route_binned` and `booking_origin_binned` to the train and validation
# frames, using the default cut-off.
feature_binning(feature='route')
feature_binning(feature='booking_origin')

In [7]:
# The raw high-cardinality columns are superseded by their `*_binned`
# counterparts, so remove them from both frames.
df_train = df_train.drop(columns=['route','booking_origin'])
df_val = df_val.drop(columns=['route','booking_origin'])

# One-hot encoding

In [8]:
def one_hot_encoding(df_train, df_val):
    """One-hot encode the object-dtype columns of a train/validation pair.

    The encoder is fitted on ``df_train`` only and then applied to both
    frames, so the two outputs share the same encoded columns. Categories
    that appear only in ``df_val`` do not raise (``handle_unknown='ignore'``).

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; its ``object`` columns define what gets encoded.
    df_val : pandas.DataFrame
        Validation frame with the same columns as ``df_train``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Processed train and validation frames: the encoded columns first,
        followed by the untouched non-object columns. Both frames are
        re-indexed from 0; because ``reset_index()`` is called without
        ``drop=True``, the original row labels are kept as an extra column
        (named ``index`` when the incoming index is unnamed).
    """
    # Determine the categorical columns
    cat_mask = df_train.dtypes == 'object'
    cat_cols = df_train.columns[cat_mask].to_list()
    other_cols = df_train.columns[~cat_mask].to_list()

    # Not planning to use a linear model, so only binary features have a
    # redundant column dropped.
    encoder_kwargs = dict(handle_unknown='ignore', drop='if_binary')
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`;
        # the old `sparse` keyword was removed in 1.4.
        ohe = OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        ohe = OneHotEncoder(sparse=False, **encoder_kwargs)

    ohe.fit(df_train[cat_cols])
    ohe_cols = ohe.get_feature_names_out(ohe.feature_names_in_)

    def encode(frame):
        # The encoded array is positional, so the passthrough columns are
        # re-indexed from 0 before the two parts are joined on index.
        encoded = pd.DataFrame(ohe.transform(frame[cat_cols]), columns=ohe_cols)
        return pd.merge(
            left=encoded,
            right=frame[other_cols].reset_index(),
            left_index=True,
            right_index=True
        )

    return encode(df_train), encode(df_val)

In [13]:
# Variant 1: all features, i.e. both `route_binned` and
# `booking_origin_binned` are one-hot encoded.
df_train_proc, df_val_proc = one_hot_encoding(df_train, df_val)
df_train_proc.to_csv('data/df_train.csv', index=False)
df_val_proc.to_csv('data/df_val.csv', index=False)

# Variant 2: same pipeline with `booking_origin_binned` removed first;
# written to the `*_alt_wobo.csv` files.
df_train1 = df_train.drop(columns='booking_origin_binned')
df_val1 = df_val.drop(columns='booking_origin_binned')
df_train_proc, df_val_proc = one_hot_encoding(df_train1, df_val1)
df_train_proc.to_csv('data/df_train_alt_wobo.csv', index=False)
df_val_proc.to_csv('data/df_val_alt_wobo.csv', index=False)

# Variant 3: same pipeline with `route_binned` removed first;
# written to the `*_alt_wor.csv` files.
# Note: `df_train_proc` / `df_val_proc` are reassigned for every variant, so
# after this cell they hold the variant-3 frames.
df_train2 = df_train.drop(columns='route_binned')
df_val2 = df_val.drop(columns='route_binned')
df_train_proc, df_val_proc = one_hot_encoding(df_train2, df_val2)
df_train_proc.to_csv('data/df_train_alt_wor.csv', index=False)
df_val_proc.to_csv('data/df_val_alt_wor.csv', index=False)