In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore
import pickle
# Pipeline switches read by the cells below.
# train: True  -> data/train.csv is loaded, the 'target' column is built and the
#                 result is written to data/train_cleaned.csv.
#        False -> data/test.csv is loaded and the result goes to data/test_cleaned.csv.
train = False
# create_pickle: only has an effect together with train=True. In that case the
# per-hotel statistics are recomputed and written to dict_hotel_pos.pickle;
# in every other combination that pickle file is loaded from disk instead.
create_pickle = True

In [None]:
# choose the raw input file according to the train/test switch
csv_path = 'data/train.csv' if train else 'data/test.csv'
data = pd.read_csv(csv_path)

# quick look at the first rows
print(data.head())

In [None]:
pickle_path = 'dict_hotel_pos.pickle'

if not (train and create_pickle):
    # Reuse the per-hotel statistics stored by an earlier training run.
    # NOTE(review): pickle.load can execute arbitrary code contained in the file;
    # only load a pickle that this notebook produced itself.
    with open(pickle_path, 'rb') as handle:
        dict_hotel_pos = pickle.load(handle)
else:
    # Per-hotel averages: mean list position, share of clicks, share of bookings
    groupby_hotel = data.groupby('prop_id')[['position', 'click_bool', 'booking_bool']].mean()

    # {prop_id: {'position': ..., 'click_bool': ..., 'booking_bool': ...}}
    dict_hotel_pos = groupby_hotel.to_dict('index')

    # Persist the lookup so that the test run can use the same statistics
    with open(pickle_path, 'wb') as handle:
        pickle.dump(dict_hotel_pos, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Attach the statistics to every row; hotels that are not in the lookup get NaN
stat_for_column = {
    'mean_position': 'position',
    'perc_clicked': 'click_bool',
    'perc_booked': 'booking_bool',
}
for new_col, stat in stat_for_column.items():
    data[new_col] = data['prop_id'].map(
        lambda pid, stat=stat: dict_hotel_pos[pid][stat] if pid in dict_hotel_pos else np.nan
    )

In [None]:
if train:
    # relevance label: 5 for a booked row, 1 for a clicked (not booked) row, 0 otherwise
    booked = data['booking_bool'] == 1
    clicked = data['click_bool'] == 1
    data['target'] = np.select([booked, clicked], [5, 1], default=0)

    # the raw outcome columns are not kept next to the label
    data = data.drop(columns=['click_bool', 'booking_bool'])

In [None]:
# one combined feature per competitor: compN_rate multiplied by compN_rate_percent_diff
for comp_idx in range(1, 9):
    prefix = f'comp{comp_idx}'
    data[f'comp_rate_{comp_idx}'] = data[f'{prefix}_rate'] * data[f'{prefix}_rate_percent_diff']

In [None]:
# identifier columns are categorical: store them as strings instead of integers
id_columns = [
    'srch_destination_id',
    'site_id',
    'visitor_location_country_id',
    'prop_country_id',
    'prop_id',
]
for id_col in id_columns:
    data[id_col] = data[id_col].astype(str)

In [None]:
# convert the integer flag columns to booleans
flag_columns = ["prop_brand_bool", "srch_saturday_night_bool", "random_bool", "promotion_flag"]
for flag_col in flag_columns:
    data[flag_col] = data[flag_col].astype(bool)

In [None]:
# cast every int64 column to float so that means and z-scores can be computed
int_columns = [name for name in data.columns if data[name].dtype == 'int64']
for name in int_columns:
    data[name] = data[name].astype(float)

In [None]:
# in the columns listed below a 0 is treated as a missing value and replaced by NaN
zero_means_missing = [
    'prop_starrating',
    'prop_review_score',
    'srch_query_affinity_score',
    "prop_log_historical_price",
    "orig_destination_distance",
]
for name in zero_means_missing:
    data[name] = data[name].replace(0, np.nan)

In [None]:
# reduce the timestamp to the English month name (January ... December)
month_names = pd.to_datetime(data['date_time']).dt.month_name()

# overwrite the column in place (keeps its position) and call it 'month' from here on
data = data.assign(date_time=month_names).rename(columns={'date_time': 'month'})

In [None]:
# CANNOT USE THESE FEATURES, TOO MANY MISSING VALUES:
# visitor_hist_starrating 0.9488966226896648 nan
# visitor_hist_adr_usd 0.9486560588709875 nan

In [None]:
# Star rating of the hotel minus the mean star rating of all rows with the same
# srch_destination_id. The built-in 'mean' aggregation is used instead of a
# Python lambda: the lambda is called once per group, which is very slow when
# there is one group per search.
data['rating_relto_region'] = (
    data['prop_starrating']
    - data.groupby('srch_destination_id')['prop_starrating'].transform('mean')
)

# Star rating of the hotel minus the mean star rating of all results of the same search (srch_id)
data['rating_relto_search'] = (
    data['prop_starrating']
    - data.groupby('srch_id')['prop_starrating'].transform('mean')
)

In [None]:
# Review score of the hotel minus the mean review score of all results of the same search.
# 'mean' is the built-in group aggregation; a per-group Python lambda computes the
# same quantity but is called once per srch_id and is therefore much slower.
data['review_relto_search'] = (
    data['prop_review_score']
    - data.groupby('srch_id')['prop_review_score'].transform('mean')
)

# Review score of the hotel minus the mean review score of all rows with the same destination
data['review_relto_region'] = (
    data['prop_review_score']
    - data.groupby('srch_destination_id')['prop_review_score'].transform('mean')
)

In [None]:
# price per night = total price divided by the length of stay
data['price_per_night'] = data['price_usd'] / data['srch_length_of_stay']

# difference to the mean price per night of the same destination in the same month
region_month_mean = data.groupby(['month', 'srch_destination_id'])['price_per_night'].transform('mean')
data['price_rel_to_region'] = data['price_per_night'] - region_month_mean

# difference to the mean price per night of all results of the same search
search_mean = data.groupby('srch_id')['price_per_night'].transform('mean')
data['price_rel_to_search'] = data['price_per_night'] - search_mean

# TODO: check output
print(data['price_rel_to_region'].head(20))

In [None]:
# distance of the row minus the mean distance of all rows with the same destination
destination_mean_distance = data.groupby('srch_destination_id')['orig_destination_distance'].transform('mean')
data['rel_distance'] = data['orig_destination_distance'] - destination_mean_distance

# TODO: check if this works
print(data['rel_distance'].head(20))

In [None]:
# Combine prop_location_score1 and prop_location_score2 into one score:
# a missing score is first filled with the other one, then both are averaged.
score2_filled = data['prop_location_score2'].fillna(data['prop_location_score1'])
score1_filled = data['prop_location_score1'].fillna(score2_filled)
data['prop_location_score'] = pd.concat([score2_filled, score1_filled], axis=1).mean(axis=1)

# the two source columns are replaced by the combined score
data = data.drop(columns=['prop_location_score1', 'prop_location_score2'])

In [None]:
# comb_rate: row-wise mean over the non-NaN compN_rate columns
# comb_inv:  row-wise sum over the non-NaN compN_inv columns
competitor_ids = range(1, 9)
rate_columns = [f'comp{i}_rate' for i in competitor_ids]
inv_columns = [f'comp{i}_inv' for i in competitor_ids]

data['comb_rate'] = data[rate_columns].mean(axis=1, skipna=True)
data["comb_inv"] = data[inv_columns].sum(axis=1, skipna=True)
print(data['comb_inv'].head(20))

In [None]:
# the raw per-competitor columns are no longer needed once the combined features exist
competitor_suffixes = ['rate', 'rate_percent_diff', 'inv']
raw_competitor_columns = [
    f'comp{i}_{suffix}'
    for suffix in competitor_suffixes
    for i in range(1, 9)
]
data = data.drop(columns=raw_competitor_columns)

In [None]:
# export the summary statistics of the numerical columns as a LaTeX table
import os

latex_dir = 'latex'
if not os.path.exists(latex_dir):
    os.makedirs(latex_dir)

# describe() yields count, mean, std, min, quartiles and max per numerical column
num_describtion = data.describe()
num_describtion.to_latex('latex/num_describtion.tex')

In [None]:
# Summary of every categorical (object dtype) column: number of unique categories,
# most and least frequent category and their share of all rows.
dict_cat = {}
n_rows = data.shape[0]
for col in data.columns:
    if data[col].dtype != 'object':
        continue

    # count the categories once per column instead of once per statistic
    counts = data[col].value_counts()
    dict_cat[col] = [
        data[col].nunique(),
        counts.idxmax(),
        counts.max() / n_rows,
        counts.idxmin(),
        counts.min() / n_rows,
    ]

# create table for categorical data
cat_describtion = pd.DataFrame.from_dict(
    dict_cat,
    orient='index',
    columns=['unique', 'most occuring', 'most occuring frequency', 'least occuring', 'least occuring frequency'],
)

# save to latex
# NOTE(review): this file is written to the working directory, whereas the numerical
# table is written into 'latex/' - confirm which location is intended.
cat_describtion.to_latex('cat_describtion.tex')

In [None]:
# print the fraction of missing values of every column
missing_fraction = data.isnull().sum() / data.shape[0]
for col, fraction in missing_fraction.items():
    print(col, fraction)

In [None]:
# drop every column in which more than 40% of the values are missing
missing_fraction = data.isnull().sum() / data.shape[0]
sparse_columns = missing_fraction[missing_fraction > 0.40]

# report what is being removed, then remove it in one go
for col, fraction in sparse_columns.items():
    print(col, fraction)
data = data.drop(columns=list(sparse_columns.index))

In [None]:
# # handle missing values 
# for col in data.columns:
    
#     # check if numerical
#     if data[col].dtype == 'float64' or data[col].dtype == 'int64':   
#         data[col] = data[col].fillna(data[col].mean())
        
#         # TODO: not always mean, sometimes median or mode
    
#     # check if categorical
#     if data[col].dtype == 'object' or data[col].dtype == 'bool':
#         data[col] = data[col].fillna(data[col].value_counts().idxmax())

# # check if there are still missing values
# print(data.isnull().sum().sum())

In [None]:
# TODO: remove the columns ... and ... that are not in the test set

In [None]:
# report the number of distinct values of every categorical (object or bool) column
categorical_columns = [
    name for name in data.columns
    if data[name].dtype == 'object' or data[name].dtype == 'bool'
]
for name in categorical_columns:
    print(name, data[name].nunique())

# srch_destination_id has too many distinct values to be used as a category
data = data.drop(columns=['srch_destination_id'])

In [None]:
# Collapse rare categories of site_id, visitor_location_country_id and prop_country_id:
# every value that occurs fewer than 5 times is replaced by the label 'other'.
for col in ['site_id', 'visitor_location_country_id', 'prop_country_id']:

    # Count once per column. Recounting the whole column for every unique value
    # is quadratic, and looking up a NaN in value_counts() raises a KeyError
    # because NaN is not counted.
    counts = data[col].value_counts()
    rare_values = counts[counts < 5].index

    # rows holding a rare value become 'other'; all other rows (including NaN) are kept
    data[col] = data[col].mask(data[col].isin(rare_values), 'other')

In [None]:
# total number of missing cells that are still left in the frame
remaining_missing = data.isna().sum().sum()
print(remaining_missing)

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

def fill_missing_with_regression(df, target_column, feature_columns):
    """Fill the missing values of a numeric column with linear-regression predictions.

    A LinearRegression is fitted on the rows where ``target_column`` is present
    and used to predict the rows where it is missing. Missing values in the
    predictor columns are replaced by the per-column median of the observed rows
    (for prediction only; the predictor columns of ``df`` are not modified).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. ``target_column`` is modified in place.
    target_column : str
        Name of the numeric column whose NaN values are filled.
    feature_columns : iterable of str
        Names of the numeric predictor columns. ``target_column`` is removed
        from this list if present.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``target_column`` filled.
    """
    missing_mask = df[target_column].isnull()

    # nothing to do - also avoids calling the imputer on zero rows
    if not missing_mask.any():
        return df

    # The target must not be its own predictor: the fit would be trivial
    # (coefficient 1 on the target) and, since the target is median-imputed for
    # the rows to predict, every prediction would collapse to the median.
    predictors = [name for name in feature_columns if name != target_column]

    if not predictors:
        # without any predictor the best constant guess is the observed median,
        # which is also what the degenerate target-only regression produced
        df.loc[missing_mask, target_column] = df.loc[~missing_mask, target_column].median()
        return df

    # Median-impute the predictors. The medians are learned on the observed rows
    # only and re-used for the rows to predict. Plain arrays are used for both
    # fit and predict so the model sees a consistent input type.
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(df.loc[~missing_mask, predictors])
    y_train = df.loc[~missing_mask, target_column]
    X_test = imputer.transform(df.loc[missing_mask, predictors])

    # Create and fit the linear regression model
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    # Write the predictions into the rows that were missing
    df.loc[missing_mask, target_column] = regression_model.predict(X_test)

    return df

In [None]:
import pandas as pd
from sklearn.cluster import KMeans

def fill_missing_with_clustering(df, target_column, feature_columns, n_clusters):
    """Fill the missing values of a categorical column via KMeans clusters.

    The rows where ``target_column`` is present are clustered on the predictor
    columns. Every row with a missing target is assigned to its nearest cluster
    and receives the most frequent target value of that cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. ``target_column`` is modified in place.
    target_column : str
        Name of the column whose NaN values are filled.
    feature_columns : iterable of str
        Names of the numeric predictor columns used for clustering.
        ``target_column`` is removed from this list if present.
    n_clusters : int
        Requested number of clusters; capped at the number of observed rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``target_column`` filled.
    """
    missing_mask = df[target_column].isnull()

    # nothing to do - also avoids calling predict on zero rows
    if not missing_mask.any():
        return df

    predictors = [name for name in feature_columns if name != target_column]
    X_train = df.loc[~missing_mask, predictors]
    X_test = df.loc[missing_mask, predictors]
    observed_target = df.loc[~missing_mask, target_column]

    # KMeans cannot handle NaN: fill predictor gaps with the median of the observed
    # rows and leave out predictors that have no observed value at all
    medians = X_train.median().dropna()
    X_train = X_train[medians.index].fillna(medians)
    X_test = X_test[medians.index].fillna(medians)

    # Create and fit the KMeans clustering model (never more clusters than rows)
    clustering_model = KMeans(n_clusters=min(n_clusters, len(X_train)), random_state=42)
    clustering_model.fit(X_train)

    # Most frequent target value per cluster; grouping by the label array keeps the
    # lookup positional and avoids writing a helper column into a slice of df
    cluster_mode_values = observed_target.groupby(clustering_model.labels_).apply(
        lambda x: x.mode().iloc[0]
    )

    # predict() returns a numpy array, which has no .map - wrap it in a Series first
    predicted_labels = pd.Series(clustering_model.predict(X_test))
    filled_values = predicted_labels.map(cluster_mode_values)

    # a cluster without any training member has no mode: fall back to the overall mode
    overall_mode = observed_target.mode().iloc[0]
    filled_values = filled_values.where(filled_values.notna(), overall_mode)

    # Fill in the missing values (positional assignment, same row order as X_test)
    df.loc[missing_mask, target_column] = filled_values.to_numpy()

    return df

In [None]:
# Impute every column that still has missing values:
# numeric columns with a regression model, all other columns with a clustering model.
# NOTE(review): in a training run 'target' is numeric as well and therefore part of
# num_columns, while a test run has no such column - confirm that the label should
# be used as a predictor for imputation.
num_columns = data.select_dtypes(include=['float64', 'int64']).columns

for col in data.columns:

    # skip columns without missing values
    if data[col].isnull().sum() == 0:
        continue

    # the column being imputed must not be one of its own predictors
    feature_columns = [name for name in num_columns if name != col]

    if data[col].dtype == 'float64' or data[col].dtype == 'int64':
        print(col)
        # fill in missing values with regression model
        data = fill_missing_with_regression(data, col, feature_columns)
    else:
        # fill in missing values with clustering model
        data = fill_missing_with_clustering(data, col, feature_columns, 5)

In [None]:
# total number of missing cells left after the imputation step
remaining_missing = data.isna().sum().sum()
print(remaining_missing)

In [None]:
# write the cleaned frame next to the raw data; the file name follows the train/test switch
output_path = 'data/train_cleaned.csv' if train else 'data/test_cleaned.csv'
data.to_csv(output_path, index=False)