# Getaround analysis

## Import libraries and data

In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import xgboost as xgb

import joblib

import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings


In [2]:
print("XGBoost version:", xgb.__version__)

XGBoost version: 1.7.6


In [3]:

print("NumPy version:", np.__version__)

NumPy version: 1.23.5


In [4]:
print("Joblib version :", joblib.__version__)

Joblib version : 1.3.2


In [5]:
print("Pandas version :", pd.__version__)

Pandas version : 1.5.3


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Import delay data
delay_data = pd.read_excel("/content/drive/MyDrive/JEDHA_Certification/BLOC_5-Deployment/Data/get_around_delay_analysis.xlsx")

In [8]:
# Import princing data
pricing_data = pd.read_csv("/content/drive/MyDrive/JEDHA_Certification/BLOC_5-Deployment/Data/get_around_pricing_project.csv")

## 1. Delay Analysis

### Data exploration and cleaning

In [9]:
# First look at the dataset
delay_data.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


In [10]:
# Let's start with global information about the dataset
delay_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21310 entries, 0 to 21309
Data columns (total 7 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   rental_id                                   21310 non-null  int64  
 1   car_id                                      21310 non-null  int64  
 2   checkin_type                                21310 non-null  object 
 3   state                                       21310 non-null  object 
 4   delay_at_checkout_in_minutes                16346 non-null  float64
 5   previous_ended_rental_id                    1841 non-null   float64
 6   time_delta_with_previous_rental_in_minutes  1841 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.1+ MB


In [11]:
# Change the data type of id's columns to string
delay_data['rental_id'] = delay_data['rental_id'].astype(str)
delay_data['car_id'] = delay_data['car_id'].astype(str)

# Convert the 'previous_ended_rental_id' column to string while removing '.0'
delay_data['previous_ended_rental_id'] = delay_data['previous_ended_rental_id'].apply(lambda x: str(int(x)) if not pd.isna(x) else x)

# Check the data types of the DataFrame to confirm the changes
print(delay_data.dtypes)

rental_id                                      object
car_id                                         object
checkin_type                                   object
state                                          object
delay_at_checkout_in_minutes                  float64
previous_ended_rental_id                       object
time_delta_with_previous_rental_in_minutes    float64
dtype: object


In [12]:
# Basic stats
print("Number of rows : {}".format(delay_data.shape[0]))
print()

print("Display of dataset: ")
display(delay_data.head())
print()

print("Basics statistics: ")
data_desc = delay_data.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*delay_data.isnull().sum()/delay_data.shape[0])

Number of rows : 21310

Display of dataset: 


Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,



Basics statistics: 


Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
count,21310.0,21310.0,21310,21310,16346.0,1841.0,1841.0
unique,21310.0,8143.0,2,2,,1788.0,
top,505000.0,334190.0,mobile,ended,,537243.0,
freq,1.0,33.0,17003,18045,,4.0,
mean,,,,,59.701517,,279.28843
std,,,,,1002.561635,,254.594486
min,,,,,-22433.0,,0.0
25%,,,,,-36.0,,60.0
50%,,,,,9.0,,180.0
75%,,,,,67.0,,540.0



Percentage of missing values: 


rental_id                                      0.000000
car_id                                         0.000000
checkin_type                                   0.000000
state                                          0.000000
delay_at_checkout_in_minutes                  23.294228
previous_ended_rental_id                      91.360863
time_delta_with_previous_rental_in_minutes    91.360863
dtype: float64

There are a lot of missing values in the last two columns 'previous_ended_rental_id' and 'time_delta_with_previous_rental_in_minutes'. These columns give information about previous rental when happenned less that 12 hours before the actual rental. It means that in our dataset, 91% of the rentals happenned more than 12 hours after the previous rental.

There are also 23% of missing values in the 'delay_at_checkout_in_minutes' column. Let's have a look at it.

#### Missing values

In [13]:
# Check if any values in the "delay_at_checkout_in_minutes" are equal to 0
zero_delay_count = (delay_data['delay_at_checkout_in_minutes'] == 0).sum()

print("Number of values equal to 0 in delay_at_checkout_in_minutes column:", zero_delay_count)

Number of values equal to 0 in delay_at_checkout_in_minutes column: 122


At first I thought that maybe 'NaN' values in this column would mean that there was no delay, but as there are some lines with 0min of delay, my guess was wrong.
So I thought about deleting the rows where there are missing values in the 'delay_at_checkout_in_minutes' column, but doing it, I would lost almost all the 'canceled' values in the 'state' column.

Finally, I decided to group the data in the 'delay_at_checkout_in_minutes' column in ranges, and replace the missing values with 'unknown'.

In [14]:
# Create new column 'delay' from 'delay_at_checkout_in_minutes' column.
# Define the conditions and labels
conditions = [
    (delay_data['delay_at_checkout_in_minutes'] <= 0),
    (delay_data['delay_at_checkout_in_minutes'] < 60),
    (delay_data['delay_at_checkout_in_minutes'] < 120),
    (delay_data['delay_at_checkout_in_minutes'] < 300),
    (delay_data['delay_at_checkout_in_minutes'] < 1440),
    (delay_data['delay_at_checkout_in_minutes'] >= 1440),
    (delay_data['delay_at_checkout_in_minutes'].isna())
]

labels = ['Early or On Time', '< 1 Hour', '1 to 2 Hours', '2 to 5 Hours', '5 to 24 Hours', '1 day or more', 'Unknown']

# Create the 'delay' column based on the conditions and labels
delay_data['delay'] = np.select(conditions, labels)

# Calculate the value counts of each delay category
delay_counts = delay_data['delay'].value_counts()

# Calculate the percentage of each category
delay_percentages = (delay_counts / delay_counts.sum()) * 100

# Display the result
print(delay_percentages)


Early or On Time    32.576255
< 1 Hour            23.350540
Unknown             23.294228
1 to 2 Hours         8.728297
2 to 5 Hours         7.114031
5 to 24 Hours        4.049742
1 day or more        0.886908
Name: delay, dtype: float64


In [15]:
delay_data.shape

(21310, 8)

 No more missing values in the 'delay' column. For the moment, I will not deal with missing values of the last two columns because we will need them later in our analysis.

#### Outliers

In [16]:
# Create box plots for all numeric columns in the dataset
for column in delay_data.select_dtypes(include=['number']).columns:
    fig = px.box(delay_data, y=column, title=f'Box Plot of {column}')
    fig.update_xaxes(title_text=column)
    fig.show()


There are some outliers in the 'delay_at_checkout_in_minutes'. Let's delete all datas that were more than 10 days late for checkout.

In [17]:
delay_data.shape

(21310, 8)

In [18]:
# Count the number of entries with delay_at_checkout_in_minutes > 15422
outlier_entries = delay_data[delay_data['delay_at_checkout_in_minutes'] > 15422]

# Get the count of such entries
num_outliers = len(outlier_entries)

# Display the count
print("Number of entries with delay_at_checkout_in_minutes > 15422:", num_outliers)

# Filter out and remove the outliers
delay_data = delay_data[(delay_data['delay_at_checkout_in_minutes'] <= 15422) | (delay_data['delay_at_checkout_in_minutes'].isna())]

# Check the shape of the DataFrame after removing outliers
print("Shape of delay_data after removing outliers:", delay_data.shape)

Number of entries with delay_at_checkout_in_minutes > 15422: 5
Shape of delay_data after removing outliers: (21305, 8)


We removed 5 outliers, and kept abnormal values until 10 days late in order to keep the integrity of the data.

#### Duplicated values

In [19]:
# Check duplicated values
delay_data.duplicated(subset=None, keep='first').value_counts()


False    21305
dtype: int64

There is no duplicated row in this dataset. Let's proceed.

### EDA - visualizations

In [20]:
# Bar chart for 'checkin_type'
checkin_type_counts = delay_data['checkin_type'].value_counts()
checkin_type_percentage = (checkin_type_counts / checkin_type_counts.sum()) * 100

fig2 = px.bar(
    x=checkin_type_counts.index,
    y=checkin_type_counts,
    title='Distribution of Checkin Types',
    labels={'x': 'Checkin Type', 'y': 'Count'},
)
fig2.update_traces(marker_color='skyblue')

# Add tags with percentages
percentage_text = [f'{p:.2f}%' for p in checkin_type_percentage]
fig2.update_traces(marker_color='skyblue', text=percentage_text, textposition='outside')


fig2.update_yaxes(title='Count')

fig2.show()

Only 20% of rentals are booked from Connect cars.

#### How often are drivers late for the next check-in? How does it impact the next driver?

In [21]:
# Calculate the total number of entries in the dataset
total_entries = len(delay_data)

# Calculate the number of late returns (positive values)
late_returns_count = len(delay_data[delay_data['delay_at_checkout_in_minutes'] > 0])

# Calculate the number of NaN values
nan_count = delay_data['delay_at_checkout_in_minutes'].isna().sum()

# Check the number of 'on time' or 'earlier' values
on_time_or_earlier_count = len(delay_data[delay_data['delay_at_checkout_in_minutes'] <= 0])

# Calculate the percentage of late returns
late_return_percentage = (late_returns_count / total_entries) * 100

# Calculate the percentage of on time/earlier returns
on_time_percentage = (on_time_or_earlier_count / total_entries) * 100

# Calculate the percentage of NaNs
NaN_percentage = (nan_count / total_entries) * 100

# Display the result
print("Total Number of Entries:", total_entries)
print("Number of Late Returns (excluding NaN as negative):", late_returns_count)
print("Number of On Time or Earlier Returns:", on_time_or_earlier_count)
print("Number of NaN Values:", nan_count)
print("Percentage of Late Returns :", late_return_percentage)
print("Percentage of On Time/Early Returns :", on_time_percentage)
print("Percentage of NaN :", NaN_percentage)

Total Number of Entries: 21305
Number of Late Returns (excluding NaN as negative): 9399
Number of On Time or Earlier Returns: 6942
Number of NaN Values: 4964
Percentage of Late Returns : 44.11640459985919
Percentage of On Time/Early Returns : 32.583900492842055
Percentage of NaN : 23.299694907298758


In [21]:
# Calculate the counts and percentages of each delay category
delay_counts = delay_data['delay'].value_counts()
delay_percentages = (delay_counts / delay_counts.sum()) * 100

# Create a histogram manually
fig3 = go.Figure(data=[
    go.Bar(
        x=delay_counts.index,
        y=delay_counts,
        text=delay_percentages.round(2).astype(str) + '%',  # Add percentage to the text
        textposition='outside',  # Show text outside the bars
        marker=dict(color='lightblue'),  # Customize bar color
    )
])

# Customize the layout
fig3.update_layout(
    title='Distribution of Delays with Percentage Tags',
    xaxis=dict(title='Delay'),
    yaxis=dict(title='Count'),
    showlegend=False,
)

fig3.show()


32% of the rental end well as the user checks out earlier or on time, and 44% of them check out late.

We don't have any information about the delay for 23% of our dataset.

Among the users that check out late, more than 50% are "only" one hour or less late.

In [22]:
# Calculate the frequency of late check-ins
late_checkins = delay_data[delay_data['delay_at_checkout_in_minutes'] > 0]
late_checkin_frequency = (len(late_checkins) / len(delay_data)) * 100

print("Percentage of late check-ins:", late_checkin_frequency)

# Analyze the impact on the next driver

average_delay = late_checkins['time_delta_with_previous_rental_in_minutes'].mean()
max_delay = late_checkins['time_delta_with_previous_rental_in_minutes'].max()
min_delay = late_checkins['time_delta_with_previous_rental_in_minutes'].min()

print("Average delay in minutes for the next driver:", average_delay)
print("Maximum delay in minutes for the next driver:", max_delay)
print("Minimum delay in minutes for the next driver:", min_delay)


Percentage of late check-ins: 44.11640459985919
Average delay in minutes for the next driver: 259.3765586034913
Maximum delay in minutes for the next driver: 720.0
Minimum delay in minutes for the next driver: 0.0


Let's check which delays had a negative impact on the next rental. Here we want to highlight the rentals that couldnot start on time because the previous rental was checked out later that the new check in rental.

In [23]:
# Create a new DataFrame 'test_df' by merging on 'previous_ended_rental_id'
merge_id_df = delay_data.merge(
    delay_data[['rental_id']],
    left_on='previous_ended_rental_id',
    right_on='rental_id',
    how='inner'
)

# Calculate the new column 'delta_previous_and_delay'
merge_id_df['delta_previous_and_delay'] = merge_id_df['time_delta_with_previous_rental_in_minutes'] - merge_id_df['delay_at_checkout_in_minutes']

# Select rows where 'delta_previous_and_delay' values are strictly negative
negative_delta_rows = merge_id_df[merge_id_df['delta_previous_and_delay'] < 0]

# Display the selected rows
display(negative_delta_rows.head())
print(f"Shape of negative_delta_rows dataset : {negative_delta_rows.shape}")


Unnamed: 0,rental_id_x,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,delay,rental_id_y,delta_previous_and_delay
8,535770,352436,mobile,ended,74.0,524703,60.0,1 to 2 Hours,524703,-14.0
9,537576,397470,mobile,ended,18.0,539005,0.0,< 1 Hour,539005,-18.0
11,540479,374684,mobile,ended,12.0,539751,0.0,< 1 Hour,539751,-12.0
12,541862,382364,mobile,ended,125.0,540607,0.0,2 to 5 Hours,540607,-125.0
18,543808,369230,mobile,ended,75.0,536315,60.0,1 to 2 Hours,536315,-15.0


Shape of negative_delta_rows dataset : (270, 10)


Here the 'delta_previous_and_delay' columns shows the delta in minutes between the time between 2 rentals, and the delay of the previous rental. We only select negative values because when negative, it means that the next user didn't have access to its rental on time because the previous user was late for check out.

In [24]:
# Calculate the percentage of problematic delays among all the delays
nb_late_checkins = len(late_checkins)
nb_problematic_delays = negative_delta_rows.shape[0]
problematic_delays_rate = nb_problematic_delays*100/nb_late_checkins
print(f"Among all the delays, {round(problematic_delays_rate, 2)}% of delays caused problems to the next rental because the checkout\n was made later than the new rental checkin.")



Among all the delays, 2.87% of delays caused problems to the next rental because the checkout
 was made later than the new rental checkin.


In [25]:
# Calculate the average duration of problematic delays
average_problematic_delay = negative_delta_rows['delay_at_checkout_in_minutes'].mean()

# Calculate the average duration of non-problematic delays
average_non_problematic_delay = delay_data[delay_data['delay_at_checkout_in_minutes'] > 0]['delay_at_checkout_in_minutes'].mean()

# Compare the averages
print("Average Duration of Problematic Delays:", average_problematic_delay)
print("Average Duration of Non-Problematic Delays:", average_non_problematic_delay)


Average Duration of Problematic Delays: 355.75925925925924
Average Duration of Non-Problematic Delays: 178.6762421534206


In [26]:
# Filter problematic and non-problematic delays
problematic_delays = delay_data[delay_data['delay_at_checkout_in_minutes'] < 0]
non_problematic_delays = delay_data[delay_data['delay_at_checkout_in_minutes'] >= 0]

# Create a box plot for delay duration
fig = px.box(data_frame=delay_data, x='delay', y='delay_at_checkout_in_minutes',
             category_orders={"delay": ["Early or On Time", "< 1 Hour", "1 to 2 Hours", "2 to 5 Hours", "5 to 24 Hours", "1 day or more", "Unknown"]})

fig.update_layout(title='Distribution of Delay Durations by Delay Category')
fig.show()


In [27]:
# Create a histogram for problematic delays
fig = px.histogram(data_frame=negative_delta_rows, x='delay_at_checkout_in_minutes',
                   title='Distribution of Problematic Delays by Delay Duration')
fig.show()

Most of the delay durations are under 200 minutes.

In [28]:
# Create a time-based histogram for check-in times
fig = px.histogram(data_frame=negative_delta_rows, x='checkin_type', color='delay',
                   title='Distribution of Check-in Types by Delay Category')
fig.show()

The worst delays concern the rentals made through the web application.



#### Threshold: how long should the minimum delay be?

In [29]:
# Group data by delay threshold and calculate the percentage of problematic delays
thresholds = [60, 90, 120, 150, 180, 210, 240, 300, 360, 420, 480, 600, 720, 1440]  # Define different delay thresholds (in minutes)
problematic_rates = []

for threshold in thresholds:
    # Filter data for delays exceeding the threshold
    delayed_rentals = delay_data[delay_data['delay_at_checkout_in_minutes'] > threshold]

    # Calculate the percentage of problematic delays for each threshold
    problematic_rate = (len(delayed_rentals) / len(delay_data)) * 100
    problematic_rates.append(problematic_rate)

# Create a line plot to visualize the impact of different thresholds
fig = px.line(x=thresholds, y=problematic_rates, markers=True, title='Impact of Delay Threshold on Problematic Delays')
fig.update_layout(xaxis_title='Delay Threshold (minutes)', yaxis_title='Percentage of Problematic Delays')
fig.show()

According to this graph, the threshold should be set at 300 minutes (5 hours) so that we hope to get less that 5% of problematic delays.

As we've seen before, delays seem to be more important when check-in occurs through mobile. Let's check if different threshold should be set according on whether check-in occured through mobile or connect.

In [33]:
# Filter data for 'mobile' check-ins
mobile_checkins = delay_data[delay_data['checkin_type'] == 'mobile']

# Define different delay thresholds (in minutes)
thresholds = [60, 90, 120, 150, 180, 210, 240, 300, 360, 420, 480, 600, 720, 800, 900, 950, 1000, 1200, 1440]

# Initialize a list to store problematic rates for 'mobile' check-ins
mobile_problematic_rates = []

for threshold in thresholds:
    # Filter 'mobile' check-ins for delays exceeding the threshold
    delayed_mobile_checkins = mobile_checkins[mobile_checkins['delay_at_checkout_in_minutes'] > threshold]

    # Calculate the percentage of problematic delays for each threshold and 'mobile' check-ins
    mobile_problematic_rate = (len(delayed_mobile_checkins) / len(mobile_checkins)) * 100
    mobile_problematic_rates.append(mobile_problematic_rate)

# Create a line plot to visualize the impact of different thresholds for 'mobile' check-ins
fig = px.line(x=thresholds, y=mobile_problematic_rates, markers=True, title='Impact of Delay Threshold on Problematic Delays for Mobile Check-ins')
fig.update_layout(xaxis_title='Delay Threshold (minutes)', yaxis_title='Percentage of Problematic Delays')
fig.show()


For mobile check-in, the most adapted threshold to get less than 2% problematic delays seems to be 950 minutes (more than 15 hours). It seems too high as owners would not be able to rent their car twice in the same day.

If we aim to get less than 5% problematic delays, then the best threshold seems to be 360 minutes (6 hours).

In [30]:
# Filter data for 'connect' check-ins
connect_checkins = delay_data[delay_data['checkin_type'] == 'connect']

# Define different delay thresholds (in minutes)
thresholds = [60, 90, 120, 130, 140, 150, 180, 210, 240, 300, 360]

# Initialize a list to store problematic rates for 'connect' check-ins
connect_problematic_rates = []

for threshold in thresholds:
    # Filter 'connect' check-ins for delays exceeding the threshold
    delayed_connect_checkins = connect_checkins[connect_checkins['delay_at_checkout_in_minutes'] > threshold]

    # Calculate the percentage of problematic delays for each threshold and 'connect' check-ins
    connect_problematic_rate = (len(delayed_connect_checkins) / len(connect_checkins)) * 100
    connect_problematic_rates.append(connect_problematic_rate)

# Create a line plot to visualize the impact of different thresholds for 'connect' check-ins
fig = px.line(x=thresholds, y=connect_problematic_rates, markers=True, title='Impact of Delay Threshold on Problematic Delays for Connect Check-ins')
fig.update_layout(xaxis_title='Delay Threshold (minutes)', yaxis_title='Percentage of Problematic Delays')
fig.show()


On the other hand, for connect check-in, the best threshold to get less than 2% of problematic delays seems to be 250 minutes (a bit more than 4 hours), so much less than the mobile check-ins.

If we aim to get less than 5% problematic delays, the threshold should be set at 150 minutes (2,5 hours).

In conclusion, a different threshold should definitely be set according to the check-in type.

If we want to minimize the risk of problematic delays to 2%, we should choose : 950 minutes for mobile check-ins and 250 minutes for connect check-ins. The problem is that this way, we limit the number of times the car can be rented per day, so we decrease the owner's revenue.

If we prefer to higher the risk to 5% in order to allow to the owners to rent their car several times a day, then we should set 360 minutes for mobile check-in and 150 minutes for connect check-ins.

#### How many problematic cases will it solve depending on the chosen threshold and scope?

In [31]:
# Define the chosen threshold for mobile and connect check-ins with 5% risk
threshold_mobile_5 = 360  # Minutes for mobile check-ins
threshold_connect_5 = 150  # Minutes for connect check-ins

# Filter data for 'mobile' check-ins with delays below or equal to the threshold
solved_mobile_checkins_5 = negative_delta_rows[
    (negative_delta_rows['checkin_type'] == 'mobile') &
    (negative_delta_rows['delay_at_checkout_in_minutes'] <= threshold_mobile_5)]

# Filter data for 'connect' check-ins with delays below or equal to the threshold
solved_connect_checkins_5 = negative_delta_rows[
    (negative_delta_rows['checkin_type'] == 'connect') &
    (negative_delta_rows['delay_at_checkout_in_minutes'] <= threshold_connect_5)]

# Calculate the number of problematic delays solved for each check-ins
solved_mobile_problems_5 = len(solved_mobile_checkins_5)
solved_connect_problems_5 = len(solved_connect_checkins_5)

# Calculate the percentage of problematic delays solved for each check-in type
total_mobile_problems_5 = len(negative_delta_rows[negative_delta_rows['checkin_type'] == 'mobile'])
total_connect_problems_5 = len(negative_delta_rows[negative_delta_rows['checkin_type'] == 'connect'])

percentage_solved_mobile_5 = (solved_mobile_problems_5 / total_mobile_problems_5) * 100
percentage_solved_connect_5 = (solved_connect_problems_5 / total_connect_problems_5) * 100

# Display the results
print("Number of problematic delays solved for mobile check-ins:", solved_mobile_problems_5)
print("Number of problematic delays solved for connect check-ins:", solved_connect_problems_5)

print("Percentage of problematic delays solved for mobile check-ins:", round(percentage_solved_mobile_5, 2), "%")
print("Percentage of problematic delays solved for connect check-ins:", round(percentage_solved_connect_5, 2), "%")

Number of problematic delays solved for mobile check-ins: 142
Number of problematic delays solved for connect check-ins: 63
Percentage of problematic delays solved for mobile check-ins: 74.74 %
Percentage of problematic delays solved for connect check-ins: 78.75 %


In [32]:
# Define the chosen threshold for mobile and connect check-ins with 5% risk
threshold_mobile_2 = 950  # Minutes for mobile check-ins
threshold_connect_2 = 250  # Minutes for connect check-ins

# Filter data for 'mobile' check-ins with delays below or equal to the threshold
solved_mobile_checkins_2 = negative_delta_rows[
    (negative_delta_rows['checkin_type'] == 'mobile') &
    (negative_delta_rows['delay_at_checkout_in_minutes'] <= threshold_mobile_2)]

# Filter data for 'connect' check-ins with delays below or equal to the threshold
solved_connect_checkins_2 = negative_delta_rows[
    (negative_delta_rows['checkin_type'] == 'connect') &
    (negative_delta_rows['delay_at_checkout_in_minutes'] <= threshold_connect_2)]

# Calculate the number of problematic delays solved for each check-ins
solved_mobile_problems_2 = len(solved_mobile_checkins_2)
solved_connect_problems_2 = len(solved_connect_checkins_2)

# Calculate the percentage of problematic delays solved for each check-in type
total_mobile_problems_2 = len(negative_delta_rows[negative_delta_rows['checkin_type'] == 'mobile'])
total_connect_problems_2 = len(negative_delta_rows[negative_delta_rows['checkin_type'] == 'connect'])

percentage_solved_mobile_2 = (solved_mobile_problems_2 / total_mobile_problems_2) * 100
percentage_solved_connect_2 = (solved_connect_problems_2 / total_connect_problems_2) * 100

# Display the results
print("Number of problematic delays solved for mobile check-ins:", solved_mobile_problems_2)
print("Number of problematic delays solved for connect check-ins:", solved_connect_problems_2)

print("Percentage of problematic delays solved for mobile check-ins:", round(percentage_solved_mobile_2, 2), "%")
print("Percentage of problematic delays solved for connect check-ins:", round(percentage_solved_connect_2, 2), "%")

Number of problematic delays solved for mobile check-ins: 163
Number of problematic delays solved for connect check-ins: 72
Percentage of problematic delays solved for mobile check-ins: 85.79 %
Percentage of problematic delays solved for connect check-ins: 90.0 %


Results are better with thresholds set to minimize problematic delay risks to 2%, but I think the loss of income for owners is worth a higher risk of problematic delays.

## Pricing Analysis

### Data exploration and cleaning

In [None]:
# First look at the dataset
pricing_data

In [None]:
# Let's start with global information about the dataset
pricing_data.info()

In [None]:
# Delete 'Unnamed: 0' column
pricing_data = pricing_data.drop("Unnamed: 0", axis=1)

In [None]:
# Basic stats
print("Number of rows : {}".format(pricing_data.shape[0]))
print()

print("Display of dataset: ")
display(pricing_data.head())
print()

print("Basics statistics: ")
data_desc = pricing_data.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*pricing_data.isnull().sum()/pricing_data.shape[0])

#### Missing values

There are no missing values in this dataset.

#### Outliers

In [None]:
# Create box plots for all numeric columns in the dataset
for column in pricing_data.select_dtypes(include=['number']).columns:
    fig = px.box(pricing_data, y=column, title=f'Box Plot of {column}')
    fig.update_xaxes(title_text=column)
    fig.show()

There is an abnormal value in the 'mileage' column, as I don't see how is it possible for a car to have a 1 000 000 mileage.

The other outliers seem ok to me and I decided to keep them.

In [None]:
# Delete 'mileage' outliers
pricing_data = pricing_data[pricing_data['mileage'] <= 500000]

# Check new shape of dataset
pricing_data.shape

As I thought, we only lost one entrie.

#### Duplicated values

In [None]:
# Check duplicated values
pricing_data.duplicated(subset=None, keep='first').value_counts()

There is no duplicated row in this dataset. Let's proceed.

### EDA - visualizations

#### Histograms

In [None]:
# Create a histogram for 'mileage'
fig_mileage = px.histogram(pricing_data, x="mileage", title="Distribution of Mileage")
fig_mileage.show()

# Create a histogram for 'engine power'
fig_engine_power = px.histogram(pricing_data, x="engine_power", title="Distribution of Engine Power")
fig_engine_power.show()

# Create a histogram for 'rental price per day'
fig_rental_price = px.histogram(pricing_data, x="rental_price_per_day", title="Distribution of Rental Price per Day")
fig_rental_price.show()


In [None]:
# Check high rental prices entries
high_pricing_data = pricing_data[pricing_data['rental_price_per_day'] > 300]
high_pricing_data


On a second though, these entries seem to be abnormal entries as the cars are not that impressive and are extremely expensive. Let's delete them.

In [None]:
# Delete outliers from 'rental_price_per_day' column
pricing_data = pricing_data[pricing_data['rental_price_per_day'] <= 300]


In [None]:
# Check high engine power entries
high_engine_data = pricing_data[pricing_data['engine_power'] >= 300]
high_engine_data

I don't know enough about cars to decide whether these engine powers are abnormal values or not, so I won't touch them.

#### Bar plots

In [None]:
# Bar plot for car brand (model_key)
fig_brand = px.bar(pricing_data, x="model_key", title="Frequency of Car Brands")
fig_brand.show()

# Bar plot for fuel type
fig_fuel = px.bar(pricing_data, x="fuel", title="Frequency of Fuel Types")
fig_fuel.show()

# Bar plot for paint color
fig_color = px.bar(pricing_data, x="paint_color", title="Frequency of Paint Colors")
fig_color.show()

# Bar plot for car type
fig_type = px.bar(pricing_data, x="car_type", title="Frequency of Car Types")
fig_type.show()

# Bar plot for private parking availability
fig_parking = px.bar(pricing_data, x="private_parking_available", title="Private Parking Availability")
fig_parking.show()


Nothing upsets me about these bar plots :
- the most frequent brand cars are : Citroën, Renault and BWM
- most frequent fuel is diesel : diesel was cheaper than petrol back in 2017 so it makes sens that there are more diesel cars than petrol cars
- most frequent paint colors are neutral colors : black, grey and blue
- most frequent car types are : estate, sedan and suv

According to these information, I guess that the data are from the US.

#### Bivariate analysis

In [None]:
# Numeric features
  # List of numeric columns (excluding 'rental_price_per_day')
numeric_columns = ['mileage', 'engine_power']

  # Create scatter plots for each numeric column vs. rental_price_per_day
for column in numeric_columns:
    fig = px.scatter(pricing_data, x=column, y='rental_price_per_day', title=f'{column} vs. Rental Price')
    fig.show()


We can say that the higher the mileage, the lower rental price per day is.

Regarding engine power, when the power increases, the rental price per day increases as well.

In [None]:
# Categorical features
  # List of categorical columns
categorical_columns = ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']

  # Create box plots for each categorical column vs. rental_price_per_day
for column in categorical_columns:
    fig = px.box(pricing_data, x=column, y='rental_price_per_day', title=f'{column} vs. Rental Price')
    fig.show()


Here are some interpretations :
- the most luxurious brands are more expensive
- hybrid is the most expensive fuel type
- green car are cheaper
- sedan and suv cars are more expensive
- when a parking lot is available, the rental price per day is higher
- cars with GPS option are lightly more expensive
- cars with air conditionning option are lightly more expensive
- automatic cars are more expensive
- Getaround Connect option doesn't seem to have an impact on the rental price per day
- cars with speed regulator option are more expensive
- cars with winter tires are more expensive.

In general, cars with more or better options are more expensive, it totally makes sens.

#### Which share of our owner’s revenue would potentially be affected by the feature?

Unfortunately there is no way to connect delay_data dataset and pricing_data dataset as they share no identical informations. So we won't be able to answer this question.

## Machine Learning

Data Visualizations:

Create visualizations to gain insights from the data. You can use libraries like Matplotlib or Seaborn to plot histograms, scatterplots, and other relevant charts. Visualizations can help you understand the distribution of delays and relationships between variables.
Answering Specific Questions:

Based on your project goals, start answering the specific questions related to late returns and their impact on the next driver. For example, you can calculate the average delay, the share of owner's revenue affected, and the number of affected rentals.
Dashboard Development:

Begin developing your web dashboard using a framework like Streamlit. Incorporate the visualizations and analysis you've performed to make the dashboard informative and interactive.

## Data preprocessing

In [None]:
# Separate target variable Y from features X
target_variable = "rental_price_per_day"
features_list = [col for col in pricing_data.columns if col != target_variable]

X = pricing_data.loc[:,features_list]
Y = pricing_data.loc[:,target_variable]

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

In [None]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

In [None]:
# Divide dataset Train set & Test set
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

In [None]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown = 'ignore')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)

print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()

In [None]:
import pickle

# Save preprocessor object
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

## Models

### Multiple Linear Regression

In [None]:
# Train model
print("Train model...")
regressor_lreg = LinearRegression()
regressor_lreg.fit(X_train, Y_train)
print("...Done.")

In [None]:
# Predictions on training set
Y_train_pred = regressor_lreg.predict(X_train)

# Predictions on test set
Y_test_pred = regressor_lreg.predict(X_test)

# Calculate R2 score
R2_train = regressor_lreg.score(X_train, Y_train)
R2_test = regressor_lreg.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_test_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(Y_test, Y_test_pred)

# Add metrics DataFrame
  # Create a DataFrame to store the metrics
metrics_df = pd.DataFrame(columns=['Model', 'R2_Train', 'R2_Test', 'MAE', 'MSE'])

  # Store model name in a variable
model_name = 'Multivariate Logistic Regression'

  # Add the f1 scores of the model to the DataFrame
metrics_df = metrics_df.append({'Model': model_name, 'R2_Train': R2_train, 'R2_Test': R2_test, 'MAE': mae, 'MSE': mse}, ignore_index=True)

  # Show the DataFrame
display(metrics_df)

### Feature selection

In [None]:
# Calculate the coefficients for each feature
regressor_lreg.coef_

In [None]:
# Get the name of columns corresponding to coefficients
column_names = []
for name, transformer, features_list in preprocessor.transformers_:
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else:
        features = transformer.get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names

print("Names of columns corresponding to each coefficient: ", column_names)

In [None]:
# Create a pandas DataFrame to store the coefficients
coefs = pd.DataFrame(index = column_names, data = regressor_lreg.coef_.transpose(), columns=["coefficients"])

# Compute abs() and sort values
feature_importance = abs(coefs).sort_values(by = 'coefficients')
feature_importance

It seems that the less important features are some boolean options : *private_parking_available*, *has_air_conditioning* and *winter_tires*. Let's try to relaunch the model dropping these columns.

In [None]:
# Drop the 'private_parking_available', 'has_air_conditioning' and 'winter_tires' columns for feature selection
fs_pricing_data = pricing_data.drop(['private_parking_available', 'has_air_conditioning', 'winter_tires'], axis=1)

In [None]:
# Separate target variable Y1 from features X1
fs_target_variable = "rental_price_per_day"
fs_features_list = [col for col in fs_pricing_data.columns if col != target_variable]

X1 = fs_pricing_data.loc[:,fs_features_list]
Y1 = fs_pricing_data.loc[:,fs_target_variable]

In [None]:
# Automatically detect names of numeric/categorical columns
fs_numeric_features = []
fs_categorical_features = []
for i,t in X1.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        fs_numeric_features.append(i)
    else :
        fs_categorical_features.append(i)

In [None]:
# Divide dataset Train set & Test set
print("Dividing into train and test sets...")
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.2, random_state=0)

In [None]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
fs_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, fs_numeric_features),
        ('cat', categorical_transformer, fs_categorical_features)
    ])

In [None]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X1_train.head())
X1_train = fs_preprocessor.fit_transform(X1_train)
print('...Done.')
print(X1_train[0:5])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X1_test.head())
X1_test = fs_preprocessor.transform(X1_test)

print('...Done.')
print(X1_test[0:5,:])

In [None]:
# Train model
print("Train model...")
fs_lreg = LinearRegression()
fs_lreg.fit(X1_train, Y1_train)
print("...Done.")

In [None]:
# Predictions on training set
Y1_train_pred = fs_lreg.predict(X1_train)

# Predictions on test set
Y1_test_pred = fs_lreg.predict(X1_test)

# Calculate R2 score
R2_train_fs = fs_lreg.score(X1_train, Y1_train)
R2_test_fs = fs_lreg.score(X1_test, Y1_test)

# Calculate Mean Absolute Error (MAE)
mae_fs = mean_absolute_error(Y1_test, Y1_test_pred)

# Calculate Mean Squared Error (MSE)
mse_fs = mean_squared_error(Y1_test, Y1_test_pred)

# Add metrics in the metrics resume DataFrame
  # Store model name in a variable
model_name2 = 'Feature selection LinReg'

  # Add the f1 scores of the model to the DataFrame
metrics_df = metrics_df.append({'Model': model_name2, 'R2_Train': R2_train_fs, 'R2_Test': R2_test_fs,
                                'MAE': mae_fs, 'MSE': mse_fs}, ignore_index=True)

  # Show the DataFrame
display(metrics_df)

### Regularization and hyperparameters optimization - Ridge & Lasso

#### Ridge

In [None]:
# Perform 10-fold cross-validation to evaluate the generalized R2 score obtained with a Ridge model
print("10-fold cross-validation...")
regressor_ridge = Ridge()
scores_ridge = cross_val_score(regressor_ridge, X_train, Y_train, cv=10)
print('The cross-validated R2-score is : ', scores_ridge.mean())
print('The standard deviation is : ', scores_ridge.std())

In [None]:
# Perform grid search
print("Grid search...")

# Grid of values to be tested
params = {
    'alpha': [0.0, 0.5, 1.0, 1.5, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 2.0, 5.0] # 0 corresponds to no regularization
}
gridsearch_ridge = GridSearchCV(regressor_ridge, param_grid = params, cv = 10) # cv : the number of folds to be used for CV
gridsearch_ridge.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_ridge.best_params_)
print("Best R2 score : ", gridsearch_ridge.best_score_)

In [None]:
# Predictions on training set
Y_train_pred = gridsearch_ridge.predict(X_train)

# Predictions on test set
Y_test_pred = gridsearch_ridge.predict(X_test)

# Calculate R2 score
R2_train_ridge = gridsearch_ridge.score(X_train, Y_train)
R2_test_ridge = gridsearch_ridge.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_ridge = mean_absolute_error(Y_test, Y_test_pred)

# Calculate Mean Squared Error (MSE)
mse_ridge = mean_squared_error(Y_test, Y_test_pred)

# Add metrics in the metrics resume DataFrame
  # Store model name in a variable
model_name3 = 'Ridge'

  # Add the f1 scores of the model to the DataFrame
metrics_df = metrics_df.append({'Model': model_name3, 'R2_Train': R2_train_ridge, 'R2_Test': R2_test_ridge,
                                'MAE': mae_ridge, 'MSE': mse_ridge}, ignore_index=True)

  # Show the DataFrame
display(metrics_df)

#### Lasso

In [None]:
# Perform 10-fold cross-validation to evaluate the generalized R2 score obtained with a Lasso model
print("10-fold cross-validation...")
regressor_lasso = Lasso()
scores_lasso = cross_val_score(regressor_lasso, X_train, Y_train, cv=10)
print('The cross-validated R2-score is : ', scores_lasso.mean())
print('The standard deviation is : ', scores_lasso.std())

In [None]:
# Perform grid search
print("Grid search...")

# Grid of values to be tested
params = {
    'alpha': [0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 20.0, 50.0, 75.0, 100.0]
}
gridsearch_lasso = GridSearchCV(regressor_lasso, param_grid = params, cv = 10) # cv : the number of folds to be used for CV
gridsearch_lasso.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_lasso.best_params_)
print("Best R2 score : ", gridsearch_lasso.best_score_)

In [None]:
# Predictions on training set
Y_train_pred = gridsearch_lasso.predict(X_train)

# Predictions on test set
Y_test_pred = gridsearch_lasso.predict(X_test)

# Calculate R2 score
R2_train_lasso = gridsearch_lasso.score(X_train, Y_train)
R2_test_lasso = gridsearch_lasso.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_lasso = mean_absolute_error(Y_test, Y_test_pred)

# Calculate Mean Squared Error (MSE)
mse_lasso = mean_squared_error(Y_test, Y_test_pred)

# Add metrics in the metrics resume DataFrame
  # Store model name in a variable
model_name4 = 'Lasso'

  # Add the f1 scores of the model to the DataFrame
metrics_df = metrics_df.append({'Model': model_name4, 'R2_Train': R2_train_lasso, 'R2_Test': R2_test_lasso,
                                'MAE': mae_lasso, 'MSE': mse_lasso}, ignore_index=True)

  # Show the DataFrame
display(metrics_df)

### SVM

In [None]:
# Define a range of hyperparameters to search
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],  # Different kernel functions to try
    'C': [0.1, 1, 10],  # Regularization parameter
    'epsilon': [0.01, 0.1, 1]  # Epsilon parameter for epsilon-SVR
}

# Create an SVM regressor
regressor_svm = SVR()

# Create a GridSearchCV object
grid_search_svm = GridSearchCV(regressor_svm, param_grid, cv=10, scoring='r2')

# Perform grid search on the training data
print("Performing grid search for SVM...")
grid_search_svm.fit(X_train, Y_train)
print("...Done.")

# Get the best hyperparameters
best_params_svm = grid_search_svm.best_params_
print("Best hyperparameters for SVM:", best_params_svm)

# Get the best R2 score achieved during grid search
best_r2_score_svm = grid_search_svm.best_score_
print("Best R2 score for SVM:", best_r2_score_svm)

# Get the best SVM model with the best hyperparameters
best_svm_model = grid_search_svm.best_estimator_

# Use the best model to make predictions
Y_test_pred_svm = best_svm_model.predict(X_test)

# Calculate R2 score
R2_test_svm = best_svm_model.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_svm = mean_absolute_error(Y_test, Y_test_pred_svm)

# Calculate Mean Squared Error (MSE)
mse_svm = mean_squared_error(Y_test, Y_test_pred_svm)

# Add metrics in the metrics resume DataFrame
model_name_svm = 'SVM (GridSearch)'
metrics_df = metrics_df.append({'Model': model_name_svm, 'R2_Train': best_r2_score_svm, 'R2_Test': R2_test_svm,
                                'MAE': mae_svm, 'MSE': mse_svm}, ignore_index=True)

# Show the DataFrame with SVM model metrics
display(metrics_df)


### Decision Tree

In [None]:
# Create a Decision Tree regressor
regressor_dt = DecisionTreeRegressor()

# Define a range of hyperparameters to search
param_grid = {
    'max_depth': [1, 5, 10],         # Maximum depth of the tree
    'min_samples_split': [2, 3, 4],        # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 4, 8],          # Minimum number of samples required to be at a leaf node
    }

# Create a GridSearchCV object
grid_search_dt = GridSearchCV(regressor_dt, param_grid, cv=5, scoring='r2')

# Perform grid search on the training data
print("Performing grid search for Decision Tree...")
grid_search_dt.fit(X_train, Y_train)
print("...Done.")

# Get the best hyperparameters
best_params_dt = grid_search_dt.best_params_
print("Best hyperparameters for Decision Tree:", best_params_dt)

# Get the best R2 score achieved during grid search
best_r2_score_dt = grid_search_dt.best_score_
print("Best R2 score for SVM:", best_r2_score_dt)

# Get the best mean squared error achieved during grid search
best_mse_dt = -grid_search_dt.best_score_
print("Best Mean Squared Error for Decision Tree:", best_mse_dt)

# Get the best Decision Tree model with the best hyperparameters
best_dt_model = grid_search_dt.best_estimator_

# Use the best model to make predictions
Y_test_pred_dt = best_dt_model.predict(X_test)

# Calculate R2 score
R2_test_dt = best_dt_model.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_dt = mean_absolute_error(Y_test, Y_test_pred_dt)

# Calculate Mean Squared Error (MSE)
mse_dt = mean_squared_error(Y_test, Y_test_pred_dt)

# Add metrics in the metrics resume DataFrame
model_name_dt = 'Decision Tree (GridSearch)'
new_row = pd.DataFrame({'Model': [model_name_dt], 'R2_Train': best_r2_score_dt, 'R2_Test': R2_test_dt,
                                'MAE': mae_dt, 'MSE': mse_dt})
metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)

# Show the DataFrame with Decision Tree model metrics
display(metrics_df)


### Random Forest

In [None]:
# Create a Random Forest regressor
regressor_rf = RandomForestRegressor(random_state=0)

# Define a range of hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 200],          # Number of trees in the forest
    'max_depth': [1, 15, 30],         # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],        # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],          # Minimum number of samples required to be at a leaf node
}

# Create a GridSearchCV object
grid_search_rf = GridSearchCV(regressor_rf, param_grid, cv=5, scoring='r2')

# Perform grid search on the training data
print("Performing grid search for Random Forest...")
grid_search_rf.fit(X_train, Y_train)
print("...Done.")

# Get the best hyperparameters
best_params_rf = grid_search_rf.best_params_
print("Best hyperparameters for Random Forest:", best_params_rf)

# Get the best R2 score achieved during grid search
best_r2_score_rf = grid_search_rf.best_score_
print("Best R2 score for SVM:", best_r2_score_rf)

# Get the best mean squared error achieved during grid search
best_mse_rf = -grid_search_rf.best_score_
print("Best Mean Squared Error for Random Forest:", best_mse_rf)

# Get the best Random Forest model with the best hyperparameters
best_rf_model = grid_search_rf.best_estimator_

# Use the best model to make predictions
Y_test_pred_rf = best_rf_model.predict(X_test)

# Calculate R2 score
R2_test_rf = best_rf_model.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_rf = mean_absolute_error(Y_test, Y_test_pred_rf)

# Calculate Mean Squared Error (MSE)
mse_rf = mean_squared_error(Y_test, Y_test_pred_rf)

# Add metrics in the metrics resume DataFrame
model_name_rf = 'Random Forest (GridSearch)'
new_row = pd.DataFrame({'Model': [model_name_rf], 'R2_Train': best_r2_score_rf, 'R2_Test': R2_test_rf,
                                'MAE': mae_rf, 'MSE': mse_rf})
metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)

# Show the DataFrame with Random Forest model metrics
display(metrics_df)


### AdaBoost

In [None]:
# Create an AdaBoost regressor
regressor_adaboost = AdaBoostRegressor()

# Define a range of hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of estimators (weak learners)
    'learning_rate': [0.01, 0.1, 1.0],    # Learning rate
}

# Create a GridSearchCV object
grid_search_adaboost = GridSearchCV(regressor_adaboost, param_grid, cv=5, scoring='r2')

# Perform grid search on the training data
print("Performing grid search for AdaBoost...")
grid_search_adaboost.fit(X_train, Y_train)
print("...Done.")

# Get the best hyperparameters
best_params_adaboost = grid_search_adaboost.best_params_
print("Best hyperparameters for AdaBoost:", best_params_adaboost)

# Get the best R2 score achieved during grid search
best_r2_score_adaboost = grid_search_adaboost.best_score_
print("Best R2 score for SVM:", best_r2_score_adaboost)

# Get the best mean squared error achieved during grid search
best_mse_adaboost = -grid_search_adaboost.best_score_
print("Best Mean Squared Error for AdaBoost:", best_mse_adaboost)

# Get the best AdaBoost model with the best hyperparameters
best_adaboost_model = grid_search_adaboost.best_estimator_

# Use the best model to make predictions
Y_test_pred_adaboost = best_adaboost_model.predict(X_test)

# Calculate R2 score
R2_test_adaboost = best_adaboost_model.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_adaboost = mean_absolute_error(Y_test, Y_test_pred_adaboost)

# Add metrics in the metrics resume DataFrame
model_name_adaboost = 'AdaBoost (GridSearch)'
new_row = pd.DataFrame({'Model': [model_name_adaboost], 'R2_Train': best_r2_score_adaboost, 'R2_Test': R2_test_adaboost,
                                'MAE': mae_adaboost, 'MSE': best_mse_adaboost})
metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)
# Show the DataFrame with AdaBoost model metrics
display(metrics_df)


### Gradient Boosting

In [None]:
# Create a Gradient Boosting regressor
regressor_gb = GradientBoostingRegressor()

# Define a range of hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 200],       # Number of estimators (weak learners)
    'learning_rate': [0.01, 0.1, 1.0],    # Learning rate
    'max_depth': [3, 4, 5],               # Maximum depth of individual trees
    'min_samples_split': [2, 5, 10],      # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]         # Minimum samples required to be at a leaf node
}

# Create a GridSearchCV object
grid_search_gb = GridSearchCV(regressor_gb, param_grid, cv=5, scoring='r2')

# Perform grid search on the training data
print("Performing grid search for Gradient Boosting...")
grid_search_gb.fit(X_train, Y_train)
print("...Done.")

# Get the best hyperparameters
best_params_gb = grid_search_gb.best_params_
print("Best hyperparameters for Gradient Boosting:", best_params_gb)

# Get the best R2 score achieved during grid search
best_r2_score_gb = grid_search_gb.best_score_
print("Best R2 score for SVM:", best_r2_score_gb)

# Get the best mean squared error achieved during grid search
best_mse_gb = -grid_search_gb.best_score_
print("Best Mean Squared Error for Gradient Boosting:", best_mse_gb)

# Get the best Gradient Boosting model with the best hyperparameters
best_gb_model = grid_search_gb.best_estimator_

# Use the best model to make predictions
Y_test_pred_gb = best_gb_model.predict(X_test)

# Calculate R2 score
R2_test_gb = best_gb_model.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_gb = mean_absolute_error(Y_test, Y_test_pred_gb)

# Calculate Mean Squared Error (MSE)
mse_gb = mean_squared_error(Y_test, Y_test_pred_gb)

# Add metrics in the metrics resume DataFrame
model_name_gb = 'Gradient Boosting (GridSearch)'
new_row = pd.DataFrame({'Model': [model_name_gb], 'R2_Train': best_r2_score_gb, 'R2_Test': R2_test_gb,
                                'MAE': mae_gb, 'MSE': mse_gb})
metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)
# Show the DataFrame with Gradient Boosting model metrics
display(metrics_df)


### XGBoost

In [None]:
# Create an XGBoost regressor
regressor_xgb = xgb.XGBRegressor()

# Define a range of hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 200],       # Number of estimators (trees)
    'learning_rate': [0.01, 0.1, 1.0],    # Learning rate
    'max_depth': [3, 4, 5],               # Maximum depth of trees
    'min_child_weight': [1, 2, 4],        # Minimum sum of instance weight (hessian) needed in a child
    'gamma': [0, 0.1, 0.2],               # Minimum loss reduction required to make a further partition on a leaf node
    'subsample': [0.8, 0.9, 1.0],         # Fraction of samples used for fitting the trees
    'colsample_bytree': [0.8, 0.9, 1.0]   # Fraction of features used for fitting the trees
}

# Create a GridSearchCV object
grid_search_xgb = GridSearchCV(regressor_xgb, param_grid, cv=5, scoring='r2')

# Perform grid search on the training data
print("Performing grid search for XGBoost...")
grid_search_xgb.fit(X_train, Y_train)
print("...Done.")

# Get the best hyperparameters
best_params_xgb = grid_search_xgb.best_params_
print("Best hyperparameters for XGBoost:", best_params_xgb)

# Get the best R2 score achieved during grid search
best_r2_score_xgb = grid_search_xgb.best_score_
print("Best R2 score for SVM:", best_r2_score_xgb)

# Get the best mean squared error achieved during grid search
best_mse_xgb = -grid_search_xgb.best_score_
print("Best Mean Squared Error for XGBoost:", best_mse_xgb)

# Get the best XGBoost model with the best hyperparameters
best_xgb_model = grid_search_xgb.best_estimator_

# Use the best model to make predictions
Y_test_pred_xgb = best_xgb_model.predict(X_test)

# Calculate R2 score
R2_test_xgb = best_xgb_model.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_xgb = mean_absolute_error(Y_test, Y_test_pred_xgb)

# Calculate Mean Squared Error (MSE)
mse_xgb = mean_squared_error(Y_test, Y_test_pred_xgb)

# Add metrics in the metrics resume DataFrame
model_name_xgb = 'XGBoost (GridSearch)'
new_row = pd.DataFrame({'Model': [model_name_xgb], 'R2_Train': best_r2_score_xgb, 'R2_Test': R2_test_xgb,
                                'MAE': mae_xgb, 'MSE': mse_xgb})
metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)

# Show the DataFrame with XGBoost model metrics
display(metrics_df)

### Model selection - Voting

In [None]:
# Create a list of your trained regressor models
regressor_list = [
    ('Linear Regression', regressor_lreg),
    ('Ridge', gridsearch_ridge.best_estimator_),
    ('Lasso', gridsearch_lasso.best_estimator_),
    ('SVM', grid_search_svm.best_estimator_),
    ('Decision Tree', grid_search_dt.best_estimator_),
    ('Random Forest', grid_search_rf.best_estimator_),
    ('AdaBoost', best_adaboost_model),
    ('XGBoost', best_xgb_model)
]

# Create a VotingRegressor
voting_regressor = VotingRegressor(estimators=regressor_list)

# Fit the VotingRegressor on the training data
voting_regressor.fit(X_train, Y_train)

# Make predictions on the test data
Y_test_pred_voting = voting_regressor.predict(X_test)

# Calculate R2 score on train set
R2_train_voting = voting_regressor.score(X_train, Y_train)

# Calculate R2 score on test set
R2_test_voting = voting_regressor.score(X_test, Y_test)

# Calculate Mean Absolute Error (MAE)
mae_voting = mean_absolute_error(Y_test, Y_test_pred_voting)

# Calculate Mean Squared Error (MSE)
mse_voting = mean_squared_error(Y_test, Y_test_pred_voting)

# Add metrics in the metrics resume DataFrame
model_name_voting = 'Voting Regressor'
new_row = pd.DataFrame({'Model': [model_name_voting], 'R2_Train': R2_train_voting, 'R2_Test': R2_test_voting,
                                'MAE': mae_voting, 'MSE': mse_voting})
metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)

# Show the DataFrame with Voting Regressor model metrics
display(metrics_df.sort_values(by='R2_Test', ascending=False))


## Model exportation

In [None]:
# Instanciate the best model (i.e. the one which got the best R2 on test set)
best_model = best_xgb_model

# Specify the filename for the saved model
model_filename = "best_model.pkl"

# Save the model to the specified file
joblib.dump(best_model, model_filename)

print(f"Saved the best model to {model_filename}")

In [None]:
import os
print(os.getcwd())
