# Imports

In [869]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as go
sns.set(rc={'figure.figsize':(25,10)})


# Transform file to csv

In [1221]:
# This is a function definition in Python that takes two arguments: `filename` and `new_filename`, both of which are strings. 
# The function reads the content of an Excel file using the pandas library's `read_excel` function and stores it in a dataframe object. 
# It then writes the contents of the dataframe object to a CSV file using the `to_csv` function,
# with the `index` parameter set to `None` and `header` parameter set to `True`. 
# The function returns `None`. This function can be used to convert an Excel file to a CSV file.

def transform_xls_to_csv(filename: str, new_filename: str) -> None:
    # Read and store content of an excel file 
    read_file = pd.read_excel (filename)
    # Write the dataframe object into csv file
    read_file.to_csv (new_filename, index = None, header=True)

In [871]:
#transform_xls_to_csv("../raw_data/bookings_without_onepark.xlsx","../raw_data/bookings_without_onepark.csv")

In [1464]:
# This code is defining a dictionary called `custom_dtype` that maps column names to their respective data types.
# The keys of the dictionary are the column names and the values are the data types. 
# The data types include `str` for string, and `float` for floating-point numbers. 
# This dictionary can be used to specify the data types of columns when reading data from a file or database.

custom_dtype = {'id': int,                        
                'pocket':str,
                'product':str,
                'status':str,
                'option':str,
                'guest_id':str,
                'booking_fees':float,
                'amount':float,
                'total_amount':float,
                'discount':float,
                'creation_date_hour':str,
                'beginning_date_hour':str,
                'begining_slice':str,
                'end_date_hour':str,
                'max_date_hour':str,
                'cxl_date_hour':str,
                'los':str,
                'lead_time_hours':float
                }

In [1504]:
# This code reads a CSV file named "bookings_without_onepark.csv" located in the "../raw_data/" 
# directory and stores the data in a pandas DataFrame called "df". The "dtype" parameter specifies
# the data types of the columns in the DataFrame, which are defined in a custom dictionary 
# called "custom_dtype". The "parse_dates" parameter specifies which columns should be parsed as datetime objects.
# In this case, the columns "creation_date_hour", "beginning_date_hour", and "end_date_hour" are being parsed as datetime objects.


df = pd.read_csv("../raw_data/bookings_without_onepark.csv", 
                 dtype=custom_dtype, 
                 parse_dates=['creation_date_hour', 'beginning_date_hour', 'end_date_hour'])


Columns (18,19) have mixed types. Specify dtype option on import or set low_memory=False.



# Cleaning the DF 

In [1505]:
# Drop the 'entry_date_hour' and 'exit_date_hour' columns
df = df.drop(['entry_date_hour', 'exit_date_hour','promo_code','amount_promo','max_date_hour'], axis=1)

In [1506]:
# Check for duplicates based on the 'id' column
duplicates = df.duplicated(subset=['id'])

# Count the number of duplicates
duplicate_count = duplicates.sum()

# Print the number of duplicates
print("Number of duplicates:", duplicate_count)

# Drop the duplicates based on the 'id' column
df = df.drop_duplicates(subset=['id'])

# Print the updated DataFrame without duplicates
print(df.head())

Number of duplicates: 1035
        id      pocket product    status    option    guest_id  booking_fees  \
0  5144307  0000000036     H10  finished  standard  DCL0300508           1.0   
1  5144312  0000000781     H10  finished  standard  DCL0266680           3.0   
2  5144315  0000000491     H10  finished  standard  DCL0286500           3.5   
3  5144319  0000600287     H10  finished  standard  DCL0254021           4.0   
4  5144320  0000600287     H10  canceled  standard  DCL0271067           4.0   

   amount  total_amount  discount      creation_date_hour beginning_date_hour  \
0    58.2         59.20      0.00 2019-01-01 09:11:01.000 2019-02-07 15:00:00   
1    29.5         32.50      0.00 2019-01-01 09:32:29.000 2019-01-08 07:30:00   
2    28.8         23.04      9.26 2019-01-01 09:45:16.999 2019-01-02 06:30:00   
3    40.0         44.00      0.00 2019-01-01 09:58:58.999 2019-01-31 05:45:00   
4    56.0         60.00      0.00 2019-01-01 09:59:13.000 2019-01-17 12:30:00   

  beg

In [1507]:
# Check for nan values and transform them to unknown to count them and to 0 after
df = df.fillna('unknown')
df.isin(['unknown']).sum()
df.replace({'unknown': 0})

Unnamed: 0,id,pocket,product,status,option,guest_id,booking_fees,amount,total_amount,discount,creation_date_hour,beginning_date_hour,begining_slice,end_date_hour,cxl_date_hour,los,lead_time_hours
0,5144307,0000000036,H10,finished,standard,DCL0300508,1.0,58.2,59.20,0.00,2019-01-01 09:11:01.000,2019-02-07 15:00:00.000,15H à 18H,2019-02-10 19:44:59.999,0,+24h,893.816389
1,5144312,0000000781,H10,finished,standard,DCL0266680,3.0,29.5,32.50,0.00,2019-01-01 09:32:29.000,2019-01-08 07:30:00.000,6H à 9H,2019-01-10 21:30:00.000,0,+24h,165.958611
2,5144315,0000000491,H10,finished,standard,DCL0286500,3.5,28.8,23.04,9.26,2019-01-01 09:45:16.999,2019-01-02 06:30:00.000,6H à 9H,2019-01-04 21:45:00.000,0,+24h,20.745278
3,5144319,0000600287,H10,finished,standard,DCL0254021,4.0,40.0,44.00,0.00,2019-01-01 09:58:58.999,2019-01-31 05:45:00.000,0H à 6H,2019-02-02 17:45:00.000,0,+24h,715.766944
4,5144320,0000600287,H10,canceled,standard,DCL0271067,4.0,56.0,60.00,0.00,2019-01-01 09:59:13.000,2019-01-17 12:30:00.000,12H à 15H,2019-01-20 15:59:59.999,2019-01-11 18:26:00.000,+24h,386.513056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536169,10276847,0000500287,F397,completed,standard,DCL0129618,0.0,46.5,46.50,0.00,2023-06-18 23:17:23.999,2023-06-23 07:44:59.999,6H à 9H,2023-06-26 17:15:00.000,0,+24h,104.460000
536170,10276859,0000000781,F398,completed,standard,DCL0779147,0.0,46.0,46.00,0.00,2023-06-18 23:22:06.000,2023-07-04 05:29:59.999,0H à 6H,2023-07-10 05:29:59.999,0,+24h,366.131667
536171,10276871,0000000206,F397,completed,standard,DCL0524194,0.0,29.9,29.90,0.00,2023-06-18 23:30:14.000,2023-06-23 15:59:59.999,15H à 18H,2023-06-25 23:29:59.999,0,+24h,112.496111
536172,10276885,0000000206,H10,completed,standard,DCL0311148,0.0,56.9,56.90,0.00,2023-06-18 23:36:58.000,2023-06-19 14:45:00.000,12H à 15H,2023-06-21 22:44:59.999,0,+24h,15.133889


In [1509]:
# remove electrique option
df = df.drop(df[df['option'] == 'electrique'].index)
df

Unnamed: 0,id,pocket,product,status,option,guest_id,booking_fees,amount,total_amount,discount,creation_date_hour,beginning_date_hour,begining_slice,end_date_hour,cxl_date_hour,los,lead_time_hours
0,5144307,0000000036,H10,finished,standard,DCL0300508,1.0,58.2,59.20,0.00,2019-01-01 09:11:01.000,2019-02-07 15:00:00.000,15H à 18H,2019-02-10 19:44:59.999,unknown,+24h,893.816389
1,5144312,0000000781,H10,finished,standard,DCL0266680,3.0,29.5,32.50,0.00,2019-01-01 09:32:29.000,2019-01-08 07:30:00.000,6H à 9H,2019-01-10 21:30:00.000,unknown,+24h,165.958611
2,5144315,0000000491,H10,finished,standard,DCL0286500,3.5,28.8,23.04,9.26,2019-01-01 09:45:16.999,2019-01-02 06:30:00.000,6H à 9H,2019-01-04 21:45:00.000,unknown,+24h,20.745278
3,5144319,0000600287,H10,finished,standard,DCL0254021,4.0,40.0,44.00,0.00,2019-01-01 09:58:58.999,2019-01-31 05:45:00.000,0H à 6H,2019-02-02 17:45:00.000,unknown,+24h,715.766944
4,5144320,0000600287,H10,canceled,standard,DCL0271067,4.0,56.0,60.00,0.00,2019-01-01 09:59:13.000,2019-01-17 12:30:00.000,12H à 15H,2019-01-20 15:59:59.999,2019-01-11 18:26:00.000,+24h,386.513056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536169,10276847,0000500287,F397,completed,standard,DCL0129618,0.0,46.5,46.50,0.00,2023-06-18 23:17:23.999,2023-06-23 07:44:59.999,6H à 9H,2023-06-26 17:15:00.000,unknown,+24h,104.460000
536170,10276859,0000000781,F398,completed,standard,DCL0779147,0.0,46.0,46.00,0.00,2023-06-18 23:22:06.000,2023-07-04 05:29:59.999,0H à 6H,2023-07-10 05:29:59.999,unknown,+24h,366.131667
536171,10276871,0000000206,F397,completed,standard,DCL0524194,0.0,29.9,29.90,0.00,2023-06-18 23:30:14.000,2023-06-23 15:59:59.999,15H à 18H,2023-06-25 23:29:59.999,unknown,+24h,112.496111
536172,10276885,0000000206,H10,completed,standard,DCL0311148,0.0,56.9,56.90,0.00,2023-06-18 23:36:58.000,2023-06-19 14:45:00.000,12H à 15H,2023-06-21 22:44:59.999,unknown,+24h,15.133889


In [1513]:
# remove premium option
#df = df.drop(df[df['option'] == 'premium'].index)
#df

Unnamed: 0,id,pocket,product,status,option,guest_id,booking_fees,amount,total_amount,discount,creation_date_hour,beginning_date_hour,begining_slice,end_date_hour,cxl_date_hour,los,lead_time_hours
21,5144500,0000000025,F98,finished,standard,DCL0318692,2.5,57.0,59.5,0.0,2019-01-01 14:11:05.999,2019-01-07 06:14:59.999,6H à 9H,2019-01-11 20:45:00.000,unknown,+24h,136.065000
127,5145275,0000000025,H10,finished,standard,DCL0331881,2.5,38.9,41.4,0.0,2019-01-02 08:42:20.000,2019-02-10 09:59:59.999,9H à 12H,2019-02-12 21:00:00.000,unknown,+24h,937.294444
164,5145879,0000000025,H10,finished,standard,DCL0184041,2.5,23.9,26.4,0.0,2019-01-02 12:48:05.000,2019-01-12 08:00:00.000,6H à 9H,2019-01-13 20:00:00.000,unknown,+24h,235.198611
219,5146764,0000000025,H10,finished,standard,DCL0296356,5.0,23.9,28.9,0.0,2019-01-02 18:56:11.999,2019-01-08 08:29:59.999,6H à 9H,2019-01-09 19:15:00.000,unknown,+24h,133.563333
236,5147056,0000000025,H10,finished,standard,DCL0332599,2.5,76.4,78.9,0.0,2019-01-02 21:22:33.000,2019-01-04 12:14:59.999,12H à 15H,2019-01-09 11:15:00.000,unknown,+24h,38.874167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535857,10274244,0000000025,F398,progress,standard,DCL0887126,3.0,83.8,86.8,0.0,2023-06-18 00:25:54.000,2023-06-18 10:44:59.999,9H à 12H,2023-06-25 20:00:00.000,unknown,+24h,10.318333
535919,10274879,0000000025,H10,completed,standard,DCL0248367,3.0,50.0,53.0,0.0,2023-06-18 12:11:33.000,2023-06-20 13:30:00.000,12H à 15H,2023-06-22 20:00:00.000,unknown,+24h,49.307500
535931,10274957,0000000025,F398,completed,standard,DCL0468746,3.0,68.0,71.0,0.0,2023-06-18 12:59:42.000,2023-06-19 14:15:00.000,12H à 15H,2023-06-23 15:00:00.000,unknown,+24h,25.255000
535964,10275269,0000000025,H10,completed,standard,DCL0887283,3.0,25.9,28.9,0.0,2023-06-18 15:17:44.000,2023-06-19 07:44:59.999,6H à 9H,2023-06-20 17:15:00.000,unknown,+24h,16.454444


In [1511]:
# Count the occurrences of each product and store it in a new DataFrame
product_counts = df['product'].value_counts().reset_index()

# Rename the columns in the new DataFrame
product_counts.columns = ['product', 'count']

# Print the product names and their corresponding counts
print(product_counts)

   product   count
0      H10  313265
1     F397   73963
2     F398   50106
3      F44   40795
4      F98   35485
5      F60    5078
6      F63    4946
7     F109    2996
8     F132    1573
9     F139    1117
10    F400     999
11    F414     995
12    F150     680
13     F54     632
14     F67     495
15    F343     451
16    F138     421
17      F7     378
18     F33     231
19    F319     229
20     F70     106
21    F100      77
22    F110      33
23    F227      20
24    F137      20
25     F58      18
26      F5       1
27    F215       1


In [1470]:
# Count the occurrences of each pocket and store it in a new DataFrame
pocket_counts = df['pocket'].value_counts().reset_index()

# Rename the columns in the new DataFrame
pocket_counts.columns = ['pocket', 'count']

# Print the product names and their corresponding counts
print(pocket_counts.head(20))

        pocket  count
0   0000000781  95071
1   0000200287  66734
2   0000100435  38029
3   0000000206  29671
4   0000000272  23639
5   0000000036  23394
6   0000000034  22554
7   0000500435  20897
8   0000000052  18650
9   0000000025  18316
10  0000000875  15028
11  0000600287  14539
12  0000000180  14258
13  0000500287  11963
14  0000000276  11397
15  0000300969  10100
16  0000200057   9531
17  0000200021   9299
18  0000200023   7650
19  0000100368   7609


In [1514]:
numbers = ['0000000025']
df = df[df['pocket'].isin(numbers)]
print(df.head())

          id      pocket product    status    option    guest_id  \
21   5144500  0000000025     F98  finished  standard  DCL0318692   
127  5145275  0000000025     H10  finished  standard  DCL0331881   
164  5145879  0000000025     H10  finished  standard  DCL0184041   
219  5146764  0000000025     H10  finished  standard  DCL0296356   
236  5147056  0000000025     H10  finished  standard  DCL0332599   

     booking_fees  amount  total_amount  discount      creation_date_hour  \
21            2.5    57.0          59.5       0.0 2019-01-01 14:11:05.999   
127           2.5    38.9          41.4       0.0 2019-01-02 08:42:20.000   
164           2.5    23.9          26.4       0.0 2019-01-02 12:48:05.000   
219           5.0    23.9          28.9       0.0 2019-01-02 18:56:11.999   
236           2.5    76.4          78.9       0.0 2019-01-02 21:22:33.000   

        beginning_date_hour begining_slice       end_date_hour cxl_date_hour  \
21  2019-01-07 06:14:59.999        6H à 9H 2019-

In [1515]:
# Count the occurrences of each option
options = df['option'].value_counts().reset_index()
options

Unnamed: 0,index,option
0,standard,13395


In [1516]:
# Replace the names of the products
df['product'] = df['product'].replace({'H10': 'hourly rate', 
                                       'F397':'WE package',
                                       'F398': '1 week package',
                                       'F44': '1 week package',
                                       'F98':'WE package',
                                       'F60': '1 month package',
                                       'F63' : '2 weeks package',
                                       'F109': 'other package',
                                       'F132': 'other package',
                                       'F139': '1 month package',
                                       'F400':'1 month package',
                                       'F414': '2 weeks package',
                                       'F150': 'other package',
                                       'F54': '1 week package',
                                       'F67': 'other package',
                                       'F343': 'WE package',
                                       'F138': '1 week package',
                                       'F7': 'other package',
                                       'F33': '2 weeks package',
                                       'F319':'1 week package',
                                       'F70': '1 month package',
                                       'F100': 'other package',
                                       'F110': 'other package',
                                       'F227': 'other package',
                                       'F137': 'other package',
                                       'F58': 'other package',
                                       'F5' : '1 month package',
                                       'F215': 'other package'
                                       })

# Print the updated DataFrame
print(df)

              id      pocket         product     status    option    guest_id  \
21       5144500  0000000025      WE package   finished  standard  DCL0318692   
127      5145275  0000000025     hourly rate   finished  standard  DCL0331881   
164      5145879  0000000025     hourly rate   finished  standard  DCL0184041   
219      5146764  0000000025     hourly rate   finished  standard  DCL0296356   
236      5147056  0000000025     hourly rate   finished  standard  DCL0332599   
...          ...         ...             ...        ...       ...         ...   
535857  10274244  0000000025  1 week package   progress  standard  DCL0887126   
535919  10274879  0000000025     hourly rate  completed  standard  DCL0248367   
535931  10274957  0000000025  1 week package  completed  standard  DCL0468746   
535964  10275269  0000000025     hourly rate  completed  standard  DCL0887283   
535981  10275405  0000000025  1 week package  completed  standard  DCL0701766   

        booking_fees  amoun

In [1517]:
# Count the occurrences of each product and store it in a new DataFrame
product_counts = df['product'].value_counts().reset_index()

# Rename the columns in the new DataFrame
product_counts.columns = ['product', 'count']

# Print the product names and their corresponding counts
print(product_counts)

           product  count
0      hourly rate   9166
1       WE package   3488
2   1 week package    597
3  1 month package    142
4    other package      2


In [1518]:
print(df.head())

          id      pocket      product    status    option    guest_id  \
21   5144500  0000000025   WE package  finished  standard  DCL0318692   
127  5145275  0000000025  hourly rate  finished  standard  DCL0331881   
164  5145879  0000000025  hourly rate  finished  standard  DCL0184041   
219  5146764  0000000025  hourly rate  finished  standard  DCL0296356   
236  5147056  0000000025  hourly rate  finished  standard  DCL0332599   

     booking_fees  amount  total_amount  discount      creation_date_hour  \
21            2.5    57.0          59.5       0.0 2019-01-01 14:11:05.999   
127           2.5    38.9          41.4       0.0 2019-01-02 08:42:20.000   
164           2.5    23.9          26.4       0.0 2019-01-02 12:48:05.000   
219           5.0    23.9          28.9       0.0 2019-01-02 18:56:11.999   
236           2.5    76.4          78.9       0.0 2019-01-02 21:22:33.000   

        beginning_date_hour begining_slice       end_date_hour cxl_date_hour  \
21  2019-01-07 06:

# Creation of df_days, with the nb of bookings, cars and cancellations per days

In [1519]:
# Creation of a DataFrame with 1 car for each day, cancellations (cxl), 
# and nb of bookings and cancellations by days of arrival
df_days = pd.DataFrame({
    'date': pd.date_range(start='2021-06-01', end='2023-06-20'),
    'nb_cars': 0,
    'nb_cars_cxl': 0,
    'nb_bookings': 0,
    'nb_bookings_cxl': 0
})

# Print the updated DataFrame
print(df_days.head())


        date  nb_cars  nb_cars_cxl  nb_bookings  nb_bookings_cxl
0 2021-06-01        0            0            0                0
1 2021-06-02        0            0            0                0
2 2021-06-03        0            0            0                0
3 2021-06-04        0            0            0                0
4 2021-06-05        0            0            0                0


In [1520]:
# Iterate through each row of the DataFrame `df`
for _, row in df.iterrows():
    start = row['beginning_date_hour']
    end = row['end_date_hour']
    status = row['status']
    
    # Update nb_cars column
    df_days.loc[(df_days['date'] >= start) & (df_days['date'] <= end) & (status != 'canceled'), 'nb_cars'] += 1
    
    # Update nb_cars_canceled column
    df_days.loc[(df_days['date'] >= start) & (df_days['date'] <= end) & (status == 'canceled'), 'nb_cars_cxl'] += 1



In [1521]:
# Count the number of bookings and cancellations for each date
booking_counts = df.loc[df['status'] != 'canceled', 'beginning_date_hour'].dt.date.value_counts()
cancellation_counts = df.loc[df['status'] == 'canceled', 'beginning_date_hour'].dt.date.value_counts()

# Update nb_bookings column
df_days['nb_bookings'] = df_days['date'].map(booking_counts).fillna(0).astype(int)

# Update nb_bookings_cxl column
df_days['nb_bookings_cxl'] = df_days['date'].map(cancellation_counts).fillna(0).astype(int)


In [1522]:
df_days.head()

Unnamed: 0,date,nb_cars,nb_cars_cxl,nb_bookings,nb_bookings_cxl
0,2021-06-01,6,0,5,0
1,2021-06-02,10,0,13,0
2,2021-06-03,15,0,6,0
3,2021-06-04,18,0,15,0
4,2021-06-05,18,0,5,1


In [1523]:
# Save the df to csv 
df_days.to_csv("../prepared_data/old_df/01_06_2021_to_20_06_2023_25std.csv", index=False)


In [1524]:
# Convert 'date' column to datetime data type
df_days['date'] = pd.to_datetime(df_days['date'])

# Reshape the DataFrame using melt
df_melted = df_days.melt(id_vars='date', value_vars=['nb_cars', 'nb_cars_cxl', 'nb_bookings', 'nb_bookings_cxl'])

# Create the line plot
fig = go.Figure()

# Add traces for each variable
for variable in ['nb_cars', 'nb_cars_cxl', 'nb_bookings', 'nb_bookings_cxl']:
    fig.add_trace(go.Scatter(
        x=df_melted['date'],
        y=df_melted[df_melted['variable'] == variable]['value'],
        name=variable
    ))

# Add range slider and selectors
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(visible=True)
    ),
    title="Number of cars and bookings per day"
)

# Set the height of the plot
fig.update_layout(height=800)

# Show the graph
fig.show()


# Creation of df_products with the nb of cars per days by products (w/o cancellations)

In [1525]:
# Creation of a DataFrame with 1 car for each day without cancellations, displayed by products 

df_products = pd.DataFrame({
    'date': pd.date_range(start='2021-06-01', end='2023-06-20'),
    'hourly rate': 0,
    'WE package': 0,
    '1 week package': 0,
    '1 month package':0,
    'other package': 0,
    '2 weeks package': 0,   
})

# Print the updated DataFrame
print(df_products.head())

        date  hourly rate  WE package  1 week package  1 month package  \
0 2021-06-01            0           0               0                0   
1 2021-06-02            0           0               0                0   
2 2021-06-03            0           0               0                0   
3 2021-06-04            0           0               0                0   
4 2021-06-05            0           0               0                0   

   other package  2 weeks package  
0              0                0  
1              0                0  
2              0                0  
3              0                0  
4              0                0  


In [1526]:
# Iterate through each row of the DataFrame `df`
for _, row in df.iterrows():
    start = row['beginning_date_hour']
    end = row['end_date_hour']
    status = row['status']
    product = row['product']  # Retrieve the product value from the row
    
    # Update the count for the respective product
    df_products.loc[(df_products['date'] >= start) & (df_products['date'] <= end) & (status != 'canceled'), product] += 1


In [1527]:
df_products.head()

Unnamed: 0,date,hourly rate,WE package,1 week package,1 month package,other package,2 weeks package
0,2021-06-01,1,0,4,1,0,0
1,2021-06-02,3,0,6,1,0,0
2,2021-06-03,7,0,7,1,0,0
3,2021-06-04,10,0,7,1,0,0
4,2021-06-05,6,9,3,0,0,0


In [1528]:
# Save the df to csv 
df_products.to_csv("../prepared_data/parts_of_final_df/01_06_2021_to_20_06_2023_products_25std.csv", index=False)

In [1529]:
# Plot a graph of the number of cars per day and products
wide_df = pd.DataFrame(df_products)

fig = px.bar(wide_df, 
             x="date", 
             y=["hourly rate", "WE package", "1 week package", "1 month package", "other package", "2 weeks package"], 
             title="Number of cars per day & product")

# Add a range slider and selectors
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)

# Set the height of the plot
fig.update_layout(height=800)

# Show the graph
fig.show()


# Creation of df_turnover

In [1530]:
# Initialize the DataFrame with the date column
df_turnover = pd.DataFrame({
    'date': pd.date_range(start='2021-06-01', end='2023-06-20')
})

# Calculate turnover by date
df['date'] = pd.to_datetime(df['beginning_date_hour']).dt.date  # Convert 'beginning_date_hour' to date
turnover_data = df.loc[df['status'] != 'canceled'].groupby('date').agg({'amount': 'sum'}).reset_index()
turnover_data = turnover_data.rename(columns={'amount': 'turnover'})
turnover_data['date'] = pd.to_datetime(turnover_data['date'])  # Convert 'date' to datetime

# Calculate discount by date
discount_data = df.loc[df['status'] != 'canceled'].groupby('date').agg({'discount': 'sum'}).reset_index()
discount_data['date'] = pd.to_datetime(discount_data['date'])  # Convert 'date' to datetime

# Calculate booking fees by date
booking_fees_data = df.loc[df['status'] != 'canceled'].groupby('date').agg({'booking_fees': 'sum'}).reset_index()
booking_fees_data['date'] = pd.to_datetime(booking_fees_data['date'])  # Convert 'date' to datetime

# Calculate mean of lead_time_hours by date
mean_lead_time_data = df.groupby('date')['lead_time_hours'].mean().reset_index()
mean_lead_time_data['date'] = pd.to_datetime(mean_lead_time_data['date'])  # Convert 'date' to datetime

# Merge the data into the df_turnover DataFrame
df_turnover = df_turnover.merge(turnover_data, on='date', how='left')
df_turnover = df_turnover.merge(discount_data, on='date', how='left')
df_turnover = df_turnover.merge(booking_fees_data, on='date', how='left')
df_turnover = df_turnover.merge(mean_lead_time_data, on='date', how='left')

# Fill NaN values with 0
df_turnover[['turnover', 'discount', 'booking_fees', 'lead_time_hours']] = df_turnover[['turnover', 'discount', 'booking_fees', 'lead_time_hours']].fillna(0)

# Print the updated DataFrame
print(df_turnover.head())


        date  turnover  discount  booking_fees  lead_time_hours
0 2021-06-01     264.0       0.0           0.0       153.696667
1 2021-06-02     342.9       0.0           0.0        68.652372
2 2021-06-03     209.4       0.0           0.0       124.014444
3 2021-06-04     411.2     291.2           0.0        66.289667
4 2021-06-05     131.2      59.2           0.0       105.820370


In [1531]:
# Save the df to csv 
df_turnover.to_csv("../prepared_data/parts_of_final_df/01_06_2021_to_20_06_2023_turnover_25std.csv", index=False)

In [1532]:
# Multiply the discount column by -1 to make it negative
df_turnover['discount'] = df_turnover['discount'] * -1

# Reshape the DataFrame using melt
df_melted = df_turnover[['date', 'turnover', 'booking_fees', 'discount']].melt(id_vars='date', var_name='category', value_name='amount')

# Create the bar chart
fig = px.bar(df_melted, x='date', y='amount', color='category', barmode='relative',
             labels={'amount': 'Amount', 'category': 'Category'})

# Set the y-axis range to include negative values
fig.update_layout(yaxis_range=[df_melted['amount'].min(), df_melted['amount'].max()])

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Amount',
    legend_title='Category',
    barmode='relative',
    bargap=0.2,
    bargroupgap=0.1
)

# Add range slider and selectors
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(visible=True)
    ),
    title="Turnover by days (amount of booking, booking fees, and discount)"
)

# Set the height of the plot
fig.update_layout(height=800)

# Show the graph
fig.show()



# Adding the last columns of the original dataset

In [1533]:
# Creation of a DataFrame with 1 car for each day without cancellations, displayed by products 

df_option = pd.DataFrame({
    'date': pd.date_range(start='2021-06-01', end='2023-06-20'),
    'standard': 0,
    'premium': 0,
    '6H à 9H': 0,
    '15H à 18H': 0,
    '9H à 12H': 0,
    '12H à 15H': 0,
    '0H à 6H': 0,
    '18H à 24H': 0,
    '+24h': 0,
    '06:00 24:00': 0,
    '00:30 06:00': 0,
    '00:00 00:30': 0
})

# Iterate through each row of the DataFrame `df`
for _, row in df.iterrows():
    start = row['beginning_date_hour']
    end = row['end_date_hour']
    status = row['status']
    option = row['option']  # Retrieve the option value from the row
    begining_slice = row['begining_slice']  # Retrieve the option value from the row
    los = row['los']  # Retrieve the option value from the row
    
    # Update the count for the respective option
    df_option.loc[(df_option['date'] >= start) & (df_option['date'] <= end) & (status != 'canceled'), option] += 1
    df_option.loc[(df_option['date'] >= start) & (df_option['date'] <= end) & (status != 'canceled'), begining_slice] += 1    
    df_option.loc[(df_option['date'] >= start) & (df_option['date'] <= end) & (status != 'canceled'), los] += 1   
    

print(df_option.head())

        date  standard  premium  6H à 9H  15H à 18H  9H à 12H  12H à 15H  \
0 2021-06-01         6        0        2          0         0          2   
1 2021-06-02        10        0        5          0         1          2   
2 2021-06-03        15        0        8          1         2          2   
3 2021-06-04        18        0        9          1         2          3   
4 2021-06-05        18        0        2          4         4          8   

   0H à 6H  18H à 24H  +24h  06:00 24:00  00:30 06:00  00:00 00:30  
0        1          1     6            0            0            0  
1        1          1    10            0            0            0  
2        1          1    15            0            0            0  
3        2          1    18            0            0            0  
4        0          0    18            0            0            0  


In [1534]:
# Save the df to csv 
df_option.to_csv("../prepared_data/parts_of_final_df/01_06_2021_to_20_06_2023_option_25std.csv", index=False)

# Import DF

In [1535]:
# Load prepared dataset
df_days = pd.read_csv("../prepared_data/old_df/01_06_2021_to_20_06_2023_25std.csv")
print(df_days.head())


         date  nb_cars  nb_cars_cxl  nb_bookings  nb_bookings_cxl
0  2021-06-01        6            0            5                0
1  2021-06-02       10            0           13                0
2  2021-06-03       15            0            6                0
3  2021-06-04       18            0           15                0
4  2021-06-05       18            0            5                1


In [1536]:
# Load products dataset
df_products = pd.read_csv("../prepared_data/parts_of_final_df/01_06_2021_to_20_06_2023_products_25std.csv")
print(df_products.head())


         date  hourly rate  WE package  1 week package  1 month package  \
0  2021-06-01            1           0               4                1   
1  2021-06-02            3           0               6                1   
2  2021-06-03            7           0               7                1   
3  2021-06-04           10           0               7                1   
4  2021-06-05            6           9               3                0   

   other package  2 weeks package  
0              0                0  
1              0                0  
2              0                0  
3              0                0  
4              0                0  


In [1537]:
# Load turnover dataset
df_turnover = pd.read_csv("../prepared_data/parts_of_final_df/01_06_2021_to_20_06_2023_turnover_25std.csv")
print(df_turnover.head())

         date  turnover  discount  booking_fees  lead_time_hours
0  2021-06-01     264.0       0.0           0.0       153.696667
1  2021-06-02     342.9       0.0           0.0        68.652372
2  2021-06-03     209.4       0.0           0.0       124.014444
3  2021-06-04     411.2     291.2           0.0        66.289667
4  2021-06-05     131.2      59.2           0.0       105.820370


In [1538]:
# Load option dataset
df_option = pd.read_csv("../prepared_data/parts_of_final_df/01_06_2021_to_20_06_2023_option_25std.csv")
print(df_option.head())

         date  standard  premium  6H à 9H  15H à 18H  9H à 12H  12H à 15H  \
0  2021-06-01         6        0        2          0         0          2   
1  2021-06-02        10        0        5          0         1          2   
2  2021-06-03        15        0        8          1         2          2   
3  2021-06-04        18        0        9          1         2          3   
4  2021-06-05        18        0        2          4         4          8   

   0H à 6H  18H à 24H  +24h  06:00 24:00  00:30 06:00  00:00 00:30  
0        1          1     6            0            0            0  
1        1          1    10            0            0            0  
2        1          1    15            0            0            0  
3        2          1    18            0            0            0  
4        0          0    18            0            0            0  


# Merging DF

In [1539]:
# Merging DataFrames
new_df = df_days.merge(df_products, left_index=True, right_index=True, suffixes=('_days', '_products'))
new_df = new_df.merge(df_turnover, left_index=True, right_index=True, suffixes=('_merged', '_turnover'))
new_df = new_df.merge(df_option, left_index=True, right_index=True, suffixes=('_merged', '_option'))

new_df = new_df.drop(['date_products', 'date_merged', 'date_option' ], axis=1)
new_df = new_df.rename(columns={'date_days': 'date'})

new_df

Unnamed: 0,date,nb_cars,nb_cars_cxl,nb_bookings,nb_bookings_cxl,hourly rate,WE package,1 week package,1 month package,other package,...,6H à 9H,15H à 18H,9H à 12H,12H à 15H,0H à 6H,18H à 24H,+24h,06:00 24:00,00:30 06:00,00:00 00:30
0,2021-06-01,6,0,5,0,1,0,4,1,0,...,2,0,0,2,1,1,6,0,0,0
1,2021-06-02,10,0,13,0,3,0,6,1,0,...,5,0,1,2,1,1,10,0,0,0
2,2021-06-03,15,0,6,0,7,0,7,1,0,...,8,1,2,2,1,1,15,0,0,0
3,2021-06-04,18,0,15,0,10,0,7,1,0,...,9,1,2,3,2,1,18,0,0,0
4,2021-06-05,18,0,5,1,6,9,3,0,0,...,2,4,4,8,0,0,18,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2023-06-16,28,3,28,3,16,2,8,2,0,...,11,4,5,5,3,0,28,0,0,0
746,2023-06-17,36,6,9,0,5,23,6,2,0,...,4,10,7,14,0,1,36,0,0,0
747,2023-06-18,36,5,2,0,3,25,5,3,0,...,5,12,7,11,0,1,36,0,0,0
748,2023-06-19,21,5,14,0,0,12,6,3,0,...,4,5,5,7,0,0,21,0,0,0


In [1540]:
column_names = new_df.columns.tolist()
print(column_names)


['date', 'nb_cars', 'nb_cars_cxl', 'nb_bookings', 'nb_bookings_cxl', 'hourly rate', 'WE package', '1 week package', '1 month package', 'other package', '2 weeks package', 'turnover', 'discount', 'booking_fees', 'lead_time_hours', 'standard', 'premium', '6H à 9H', '15H à 18H', '9H à 12H', '12H à 15H', '0H à 6H', '18H à 24H', '+24h', '06:00 24:00', '00:30 06:00', '00:00 00:30']


In [1541]:
# Save the df to csv 
new_df.to_csv("../prepared_data/01_06_2021_to_20_06_2023_prepared_df_25std.csv", index=False)

# Adding strikes, holidays... to prepared_df

In [1542]:
# Load strike dataset
df_prepared = pd.read_csv("../prepared_data/01_06_2021_to_20_06_2023_prepared_df_25std.csv")
print(df_prepared.head())

         date  nb_cars  nb_cars_cxl  nb_bookings  nb_bookings_cxl  \
0  2021-06-01        6            0            5                0   
1  2021-06-02       10            0           13                0   
2  2021-06-03       15            0            6                0   
3  2021-06-04       18            0           15                0   
4  2021-06-05       18            0            5                1   

   hourly rate  WE package  1 week package  1 month package  other package  \
0            1           0               4                1              0   
1            3           0               6                1              0   
2            7           0               7                1              0   
3           10           0               7                1              0   
4            6           9               3                0              0   

   ...  6H à 9H  15H à 18H  9H à 12H  12H à 15H  0H à 6H  18H à 24H  +24h  \
0  ...        2          0         0   

In [1543]:
# Add the dates of strikes 
df_prepared['strike'] = 0  # Initialize 'strike' column with 0
dates_of_strike = ['2021-08-05', '2021-11-17', 
                   '2022-08-06', '2022-09-29',
                   '2023-01-19','2023-01-31',
                   '2023-02-07','2023-02-11','2023-02-16',
                   '2023-03-07','2023-03-11','2023-03-15','2023-03-21','2023-03-30',
                   '2023-04-06','2023-04-13',
                   '2023-06-06']
df_prepared.loc[df_prepared['date'].isin(dates_of_strike), 'strike'] = 1  # Set 'strike' to 1 for the specified dates
df_prepared.head()

Unnamed: 0,date,nb_cars,nb_cars_cxl,nb_bookings,nb_bookings_cxl,hourly rate,WE package,1 week package,1 month package,other package,...,15H à 18H,9H à 12H,12H à 15H,0H à 6H,18H à 24H,+24h,06:00 24:00,00:30 06:00,00:00 00:30,strike
0,2021-06-01,6,0,5,0,1,0,4,1,0,...,0,0,2,1,1,6,0,0,0,0
1,2021-06-02,10,0,13,0,3,0,6,1,0,...,0,1,2,1,1,10,0,0,0,0
2,2021-06-03,15,0,6,0,7,0,7,1,0,...,1,2,2,1,1,15,0,0,0,0
3,2021-06-04,18,0,15,0,10,0,7,1,0,...,1,2,3,2,1,18,0,0,0,0
4,2021-06-05,18,0,5,1,6,9,3,0,0,...,4,4,8,0,0,18,0,0,0,0


In [1544]:
# Add the dates of holidays
df_prepared['holidays'] = 0  # Initialize 'holidays' column with 0
dates_of_holidays = ['2023-01-01', '2023-04-10','2023-05-01','2023-05-08','2023-05-18','2023-05-19','2023-05-29','2023-07-14','2023-08-15','2023-11-01','2023-11-11','2023-12-25', 
                   '2022-01-01', '2022-04-18','2022-05-01','2022-05-08','2022-05-26','2022-05-27','2022-06-06','2022-07-14','2022-08-15','2022-11-01','2022-11-11','2022-12-25',
                   '2021-07-14','2021-08-15','2021-11-01','2021-11-11','2021-12-25',
                   ]
df_prepared.loc[df_prepared['date'].isin(dates_of_holidays), 'holidays'] = 1  # Set 'holidays' to 1 for the specified dates
df_prepared.head()

Unnamed: 0,date,nb_cars,nb_cars_cxl,nb_bookings,nb_bookings_cxl,hourly rate,WE package,1 week package,1 month package,other package,...,9H à 12H,12H à 15H,0H à 6H,18H à 24H,+24h,06:00 24:00,00:30 06:00,00:00 00:30,strike,holidays
0,2021-06-01,6,0,5,0,1,0,4,1,0,...,0,2,1,1,6,0,0,0,0,0
1,2021-06-02,10,0,13,0,3,0,6,1,0,...,1,2,1,1,10,0,0,0,0,0
2,2021-06-03,15,0,6,0,7,0,7,1,0,...,2,2,1,1,15,0,0,0,0,0
3,2021-06-04,18,0,15,0,10,0,7,1,0,...,2,3,2,1,18,0,0,0,0,0
4,2021-06-05,18,0,5,1,6,9,3,0,0,...,4,8,0,0,18,0,0,0,0,0


In [1545]:
# Add the vacations

df_prepared['vacation'] = 0  # Initialize 'vacation' column with 0
df_prepared['date'] = pd.to_datetime(df_prepared['date'])


# Define vacation date ranges
vacation_ranges = [
    ('2021-07-06', '2021-09-01'),
    ('2021-10-23', '2021-11-07'),
    ('2021-12-18', '2022-01-02'),
    ('2022-02-05', '2022-03-06'),   
    ('2022-04-09', '2022-08-05'),    
    ('2022-07-07', '2022-08-31'),
    ('2022-10-22', '2022-11-06'),
    ('2022-12-17', '2023-01-02'),
    ('2023-02-04', '2023-03-05'),
    ('2023-04-08', '2023-05-08'),
    ('2023-07-08', '2023-09-03'),
    ('2023-10-21', '2023-11-05'),
    ('2023-12-23', '2024-01-07'),
]

# Iterate over each vacation range
for start_date, end_date in vacation_ranges:
    # Generate a list of dates within the range
    vacation_dates = pd.date_range(start=start_date, end=end_date, freq='D').date
    # Set 'vacation' to 1 for the dates in the vacation range
    df_prepared.loc[df_prepared['date'].isin(vacation_dates), 'vacation'] = 1

df_prepared.head()

Unnamed: 0,date,nb_cars,nb_cars_cxl,nb_bookings,nb_bookings_cxl,hourly rate,WE package,1 week package,1 month package,other package,...,12H à 15H,0H à 6H,18H à 24H,+24h,06:00 24:00,00:30 06:00,00:00 00:30,strike,holidays,vacation
0,2021-06-01,6,0,5,0,1,0,4,1,0,...,2,1,1,6,0,0,0,0,0,0
1,2021-06-02,10,0,13,0,3,0,6,1,0,...,2,1,1,10,0,0,0,0,0,0
2,2021-06-03,15,0,6,0,7,0,7,1,0,...,2,1,1,15,0,0,0,0,0,0
3,2021-06-04,18,0,15,0,10,0,7,1,0,...,3,2,1,18,0,0,0,0,0,0
4,2021-06-05,18,0,5,1,6,9,3,0,0,...,8,0,0,18,0,0,0,0,0,0


In [1546]:
# Save the df to csv 
df_prepared.to_csv("../prepared_data/01_06_2021_to_20_06_2023_prepared_df_25std.csv", index=False)