In [None]:
import pandas as pd
import numpy as np
from plotly import express as px
import plotly.graph_objects as go

In [None]:
# Directory that holds the Expedia hotel-recommendations CSV files.
# Defined once so the notebook can be pointed at another copy of the data
# by editing a single line instead of every read_csv call.
DATA_DIR = '/Users/adamamster/Downloads/expedia-hotel-recommendations'

# Event log used throughout the notebook; `date_time` is parsed to datetimes on load.
train = pd.read_csv(DATA_DIR + '/train.csv', parse_dates=['date_time'])
# Destination table, loaded as-is.
destinations = pd.read_csv(DATA_DIR + '/destinations.csv')

In [None]:
def preprocess(max_year=2020):
    """Convert ``srch_ci`` and ``srch_co`` in the global ``train`` frame to datetimes.

    The columns are modified in place. Any value whose year is greater than
    ``max_year`` is replaced by a missing value (``NaT``). Values that cannot
    be parsed, or that fall outside the range pandas can represent, also
    become ``NaT`` instead of raising.

    The function is safe to run more than once: it does not assume the columns
    are still strings, so re-executing the cell leaves already-converted
    columns unchanged.

    Parameters
    ----------
    max_year : int, default 2020
        Latest calendar year treated as a plausible check-in/check-out date.
    """
    for col in ('srch_ci', 'srch_co'):
        # errors='coerce' turns unparseable / out-of-range dates into NaT so the
        # conversion never aborts half-way through the column.
        parsed = pd.to_datetime(train[col], errors='coerce')
        # NaT has no year, so the comparison is False there and NaT is kept as NaT.
        train[col] = parsed.where(parsed.dt.year <= max_year)
preprocess()

In [None]:
def feature_engineering():
    """Add derived columns to the global ``train`` frame, in place.

    Columns written:

    * ``srch_duration`` - whole days between ``srch_ci`` and ``srch_co``;
      negative spans are replaced by NaN.
    * ``has_kids`` - True where ``srch_children_cnt`` is greater than zero.
    * ``num people`` - ``srch_adults_cnt`` plus ``srch_children_cnt``.
    """
    stay_days = (train['srch_co'] - train['srch_ci']).dt.days
    # A check-out before the check-in is treated as missing rather than kept.
    train['srch_duration'] = stay_days.mask(stay_days < 0)

    children = train['srch_children_cnt']
    train['has_kids'] = children.gt(0)
    train['num people'] = train['srch_adults_cnt'].add(children)
feature_engineering()

In [None]:
# Column summary of `train` as it stands after the preprocessing and
# feature-engineering cells above have modified it.
train.info()

In [None]:
# Column summary of `destinations`, which is untouched since it was read from CSV.
destinations.info()

In [None]:
# Sum of `is_booking` per user, indexed by `user_id`; reused by later cells.
user_num_booking = train['is_booking'].groupby(train['user_id']).sum()
# Fraction of users whose booking total is zero.
len(user_num_booking[user_num_booking.eq(0)]) / len(user_num_booking)

In [None]:
# Mean of `is_booking` over all rows of `train`.
# NOTE(review): reads as the share of events that are bookings, assuming the
# column is a 0/1 flag - confirm against the data description.
train['is_booking'].mean()

In [None]:
# One row per user with the number of events recorded for that user.
user_event_num = train.groupby('user_id').size().reset_index(name='Num. events')
# Look the booking flag up by `user_id`. After reset_index() this frame has a
# 0..n-1 index while `user_num_booking` is indexed by user id, so assigning the
# Series directly would pair row positions with user ids instead of matching users.
user_event_num['User did book'] = user_event_num['user_id'].map(user_num_booking != 0)
fig = px.histogram(user_event_num, x='Num. events', color='User did book', nbins=50, histnorm='percent',
                   title='Number of events')
# histnorm='percent' changes what the bars mean, so label the axis accordingly.
fig.layout.yaxis.title = 'percent'
fig.show()

In [None]:
def plot_num_bookings_distr():
    """Histogram of per-user booking totals, restricted to users with a total above zero.

    Reads the global ``user_num_booking`` Series and returns a plotly figure
    whose bars are normalised to percentages.
    """
    bookers = user_num_booking[user_num_booking > 0]
    df = bookers.reset_index()
    df = df.rename(columns={'is_booking': 'Num. bookings'})
    fig = px.histogram(
        df,
        x='Num. bookings',
        histnorm='percent',
        nbins=50,
        title='Number of bookings for users who made at least 1 booking',
    )
    fig.layout.yaxis.title = 'percent'
    return fig
plot_num_bookings_distr()

In [None]:
def plot_bookings_over_time():
    """Line charts of daily bookings and daily clicks, one facet row each.

    Events in the global ``train`` frame are grouped by the calendar date of
    ``date_time``. Bookings are the per-day sum of ``is_booking``; clicks are
    the per-day row count minus that sum. The two facets get independent
    y-axes.
    """
    per_day = train.groupby(train['date_time'].dt.date)['is_booking']
    num_bookings = per_day.sum()
    num_clicks = per_day.size() - num_bookings

    daily = pd.DataFrame({'num bookings': num_bookings, 'num clicks': num_clicks})
    # Move the grouping dates out of the index into a 'date' column, then go long.
    df = daily.rename_axis('date').reset_index().melt(id_vars='date')

    fig = px.line(df, x='date', y='value', facet_row='variable', color='variable',
                  title='Num bookings over time')
    # Let each facet scale its own y-axis; the two series differ in magnitude.
    fig.update_yaxes(matches=None)
    return fig
plot_bookings_over_time()

In [None]:
def plot_book_rate_over_time():
    """Line chart of the daily mean of ``is_booking`` in the global ``train`` frame.

    Rows are grouped by the calendar date of ``date_time``.
    """
    daily_rate = train.groupby(train['date_time'].dt.date)['is_booking'].mean()
    df = daily_rate.rename('book rate').rename_axis('date').reset_index()
    return px.line(df, x='date', y='book rate', title='Book rate over time')
plot_book_rate_over_time()

In [None]:
def plot_hotel_cluster_popularity():
    """Grouped bar chart of three rates per hotel cluster, ordered by book rate.

    For every ``hotel_cluster`` in the global ``train`` frame:

    * ``click rate`` - the cluster's non-booking events divided by all
      non-booking events;
    * ``book rate`` - the cluster's bookings divided by all bookings;
    * ``conversion rate`` - mean of ``is_booking`` within the cluster.

    Non-booking events are counted as rows minus the sum of ``is_booking``,
    which presumes ``is_booking`` is a 0/1 flag (TODO confirm).

    Returns
    -------
    plotly figure with clusters on a categorical x-axis, sorted by descending
    book rate, and one bar per rate.
    """
    total_books = train['is_booking'].sum()
    total_clicks = train.shape[0] - total_books

    by_cluster = train.groupby('hotel_cluster')['is_booking']
    bookings = by_cluster.sum()
    # Subtract bookings so the numerator counts clicks only, matching the
    # clicks-only denominator; using the raw row count would include bookings.
    clicks = by_cluster.size() - bookings

    rates = pd.DataFrame({
        'click rate': clicks / total_clicks,
        'book rate': bookings / total_books,
        'conversion rate': by_cluster.mean(),
    })
    rates = rates.sort_values('book rate', ascending=False)
    df = rates.rename_axis('hotel cluster').reset_index().melt(id_vars='hotel cluster')

    # The three rates are not parts of a whole, so draw them side by side
    # rather than stacked on top of each other.
    fig = px.bar(df, x='hotel cluster', y='value', color='variable', barmode='group',
                 title='Hotel popularity')
    # Categorical axis keeps the sorted order instead of sorting cluster ids numerically.
    fig.update_xaxes(type='category')

    return fig
plot_hotel_cluster_popularity()

In [None]:
def plot_orig_destination_distance(sample_size=500000, random_state=0):
    """Histogram of ``orig_destination_distance`` over a random sample of ``train``.

    Parameters
    ----------
    sample_size : int, default 500000
        Maximum number of rows to draw. Capped at the number of rows in
        ``train`` so the function also works on a smaller frame.
    random_state : int or None, default 0
        Seed passed to ``DataFrame.sample`` so the figure is the same on every
        run. Pass ``None`` for a different sample each time.
    """
    sample = train.sample(min(sample_size, len(train)), random_state=random_state)
    return px.histogram(sample, x='orig_destination_distance', nbins=100,
                        title='Distance from origin to destination')
plot_orig_destination_distance()

In [None]:
def plot_avg_distance_booking():
    """Histogram of mean ``orig_destination_distance`` per (user, ``is_booking``) pair.

    Each user contributes one value per ``is_booking`` value they have rows
    for. Bars are coloured by ``is_booking``, shown side by side and
    normalised to percentages.
    """
    per_user = train.groupby(['user_id', 'is_booking'])['orig_destination_distance'].mean()
    df = per_user.rename('avg distance to destination').reset_index()
    fig = px.histogram(
        df,
        x='avg distance to destination',
        color='is_booking',
        histnorm='percent',
        nbins=100,
        barmode='group',
        title='Distance to destination',
    )
    fig.layout.yaxis.title = 'Percent'
    return fig
plot_avg_distance_booking()

In [None]:
def dist_to_destination_by_hotel_group():
    """Grouped bar chart of median ``orig_destination_distance`` per hotel cluster.

    Medians are computed separately for each ``is_booking`` value, and rows
    are ordered by descending median before plotting on a categorical x-axis.
    """
    medians = train.groupby(['hotel_cluster', 'is_booking'])['orig_destination_distance'].median()
    df = (
        medians.rename('median distance')
        .reset_index()
        .sort_values('median distance', ascending=False)
        # Categorical dtype makes plotly use discrete colours for the flag.
        .astype({'is_booking': 'category'})
    )
    fig = px.bar(df, x='hotel_cluster', y='median distance', color='is_booking',
                 title='Distance', barmode='group')
    fig.update_xaxes(type='category')
    return fig
dist_to_destination_by_hotel_group()

In [None]:
def plot_search_duration(sample_size=500000, max_days=30, random_state=0):
    """Percent histogram of ``srch_duration`` over a random sample of ``train``.

    Parameters
    ----------
    sample_size : int, default 500000
        Maximum number of rows to draw. Capped at the number of rows in
        ``train`` so the function also works on a smaller frame.
    max_days : int, default 30
        Only sampled rows with ``srch_duration`` strictly below this value are
        plotted; rows with a missing duration are dropped by the same filter.
    random_state : int or None, default 0
        Seed passed to ``DataFrame.sample`` so the figure is the same on every
        run. Pass ``None`` for a different sample each time.
    """
    sample = train.sample(min(sample_size, len(train)), random_state=random_state)
    fig = px.histogram(sample[sample['srch_duration'] < max_days], x='srch_duration',
                       nbins=100, histnorm='percent', title='Search Duration')
    fig.layout.yaxis.title = 'Percent'
    return fig
plot_search_duration()

In [None]:
def plot_search_duration_by_hotel():
    """Bar chart of mean ``srch_duration`` per hotel cluster, largest mean first."""
    mean_stay = train.groupby('hotel_cluster')['srch_duration'].mean()
    df = (
        mean_stay.rename('mean search duration')
        .reset_index()
        .sort_values('mean search duration', ascending=False)
    )
    fig = px.bar(df, x='hotel_cluster', y='mean search duration', title='Search Duration')
    # Categorical axis keeps the sorted order instead of sorting cluster ids numerically.
    fig.update_xaxes(type='category')
    return fig
plot_search_duration_by_hotel()

In [None]:
def plot_kids_hotels():
    """Grouped bar chart of mean ``is_booking`` per hotel cluster, split by ``has_kids``.

    Rows are ordered by descending rate before plotting on a categorical x-axis.
    """
    rate = train.groupby(['hotel_cluster', 'has_kids'])['is_booking'].mean()
    df = rate.rename('book rate').reset_index()
    df = df.sort_values('book rate', ascending=False)
    fig = px.bar(
        df,
        x='hotel_cluster',
        y='book rate',
        color='has_kids',
        title='Popularity by kids',
        barmode='group',
    )
    fig.update_xaxes(type='category')
    return fig
plot_kids_hotels()

In [None]:
def book_rate_kids():
    """Percent histogram of rows per (``has_kids``, user) pair, coloured by ``has_kids``.

    Despite the name, no booking rate is computed here: the plotted quantity is
    the number of ``train`` rows for each pair. Pairs with 60 or more rows are
    left out of the plot.
    """
    counts = train.groupby(['has_kids', 'user_id']).size().reset_index(name='num interactions')
    shown = counts[counts['num interactions'].lt(60)]
    fig = px.histogram(shown, x='num interactions', color='has_kids',
                       histnorm='percent', nbins=100, barmode='group')
    fig.layout.yaxis.title = 'Percent'
    return fig
book_rate_kids()

In [None]:
def plot_repeat_bookings():
    """Percent histogram of how many booking rows each (hotel cluster, user) pair has.

    Only rows with ``is_booking == 1`` are counted, and pairs with 20 or more
    bookings are left out of the plot.
    """
    booked = train.loc[train['is_booking'].eq(1)]
    pair_counts = booked.groupby(['hotel_cluster', 'user_id']).size().reset_index(name='num bookings')
    df = pair_counts[pair_counts['num bookings'].lt(20)]
    fig = px.histogram(df, x='num bookings', histnorm='percent', title='Number of rebookings')
    fig.layout.yaxis.title = 'Percent'
    return fig
plot_repeat_bookings()

In [None]:
# For each column of `train`: True if it contains at least one missing value.
train.isnull().any()

In [None]:
# Number of distinct values in `hotel_market` (nunique leaves out missing values by default).
train['hotel_market'].nunique()