<a href="https://colab.research.google.com/github/cbonnin88/RailFlow/blob/main/Product_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import polars as pl
import plotly.express as px

In [2]:
# Loading the Data
df_searches = pl.read_csv('clean_searches.csv')
df_bookings = pl.read_csv('clean_bookings.csv')
df_users = pl.read_csv('clean_users.csv')

In [3]:
df_searches.head()

search_id,user_id,timestamp,origin_station,destination_station,passenger_count,departure_date,depature_date
str,str,str,str,str,i64,str,str
"""fba4f473-c45c-44cb-a180-f1f0c5…","""d09d81e5-e17a-416c-b37a-a69be0…","""2025-05-10T11:53:09.000000""","""Nantes""","""Strasbourg""",1,"""2025-05-30T11:53:09.419189""",
"""e979d7ef-1c14-4bd7-b19e-fbf76b…","""bfcade59-80f8-4618-9ccc-de44a7…","""2025-01-18T02:45:56.746525""","""Marseille St-Charles""","""Nantes""",1,"""2025-01-23T02:45:56.746525""",
"""529b14cb-0ed5-4ee0-9228-e4a84d…","""0f2b800f-9f10-433e-9998-06d95e…","""2025-05-14T06:00:00.490315""","""Lyon Part-Dieu""","""Nantes""",2,"""2025-05-21T06:00:00.490315""",
"""1b905016-8e6c-4ea4-b96c-b4b9e8…","""5eab057d-28fa-4bcb-b1ab-a08a2c…","""2025-07-03T02:35:23.365211""","""Bordeaux St-Jean""","""Bordeaux St-Jean""",3,"""2025-08-01T02:35:23.365211""",
"""10bbab91-5dde-45a9-8717-df2cf2…","""64f51332-15dd-467b-9ef1-f4a32b…","""2025-10-31T14:05:16.469735""","""Lille Europe""","""Paris Gare De Lyon""",1,"""2025-11-05T14:05:16.469735""",


In [4]:
df_bookings.head()

booking_id,search_id,payment_status,ticket_class,amount_eur
str,str,str,str,f64
"""e80dc352-ee1b-4c10-b3eb-ce93bd…","""2fc1ab31-20f9-4577-8bed-544797…","""Success""","""2nd Class""",71.81
"""77b36a1f-fe38-4a8c-9af9-e7cff1…","""de0dbd1d-85f8-436b-b95f-ce4c0f…","""Success""","""2nd Class""",140.79
"""c21dffac-e07b-4338-92df-95cff2…","""2ab924c3-e678-45f6-b838-9248df…","""Success""","""2nd Class""",32.69
"""ccb7e082-734b-4644-9997-7bd3e4…","""f4bb9c14-7546-4df8-87c1-91765e…","""Success""","""2nd Class""",60.48
"""06d439ac-a86b-46ec-aaaf-159bd6…","""eb188485-97a9-4903-9521-b6e5d8…","""Success""","""2nd Class""",132.07


In [5]:
df_users.head()

user_id,signup_date,subscription_type,age,device_os
str,str,str,i64,str
"""d4f8a8ea-df0b-4f5b-8589-f8f2bd…","""2024-10-18""","""Max Jeune""",36,"""iOS"""
"""e5d121fd-10fa-4cbb-ac2d-548da5…","""2024-05-28""","""Free""",80,"""Unknown"""
"""8c606780-bb5b-4167-bb3e-6c5a87…","""2025-11-08""","""Free""",32,"""iOS"""
"""d9848138-7a75-49e1-b1fd-00f6cb…","""2025-01-14""","""Free""",43,"""Unknown"""
"""b568452d-e88d-4b23-b924-c5ef67…","""2024-12-25""","""Free""",39,"""Unknown"""


# **The Global Pulse & Conversion Rate**

1. **Question:** "How is our business performing day-by-day? What is our global conversion rate?"

In [6]:
# Create a 'Master Table' (Lef Join Searches -> Bookings)
df_master = (
    df_searches
    .join(df_bookings, on='search_id',how='left')
    .with_columns([
        # Create a boolean flag: Did this search result in a booking?
        pl.col('booking_id').is_not_null().alias('is_converted')
    ])
)

In [7]:
display(df_master.head())

search_id,user_id,timestamp,origin_station,destination_station,passenger_count,departure_date,depature_date,booking_id,payment_status,ticket_class,amount_eur,is_converted
str,str,str,str,str,i64,str,str,str,str,str,f64,bool
"""fba4f473-c45c-44cb-a180-f1f0c5…","""d09d81e5-e17a-416c-b37a-a69be0…","""2025-05-10T11:53:09.000000""","""Nantes""","""Strasbourg""",1,"""2025-05-30T11:53:09.419189""",,,,,,False
"""e979d7ef-1c14-4bd7-b19e-fbf76b…","""bfcade59-80f8-4618-9ccc-de44a7…","""2025-01-18T02:45:56.746525""","""Marseille St-Charles""","""Nantes""",1,"""2025-01-23T02:45:56.746525""",,,,,,False
"""529b14cb-0ed5-4ee0-9228-e4a84d…","""0f2b800f-9f10-433e-9998-06d95e…","""2025-05-14T06:00:00.490315""","""Lyon Part-Dieu""","""Nantes""",2,"""2025-05-21T06:00:00.490315""",,"""50c88dbe-a42e-4953-8510-2b4caa…","""Success""","""2nd Class""",78.09,True
"""1b905016-8e6c-4ea4-b96c-b4b9e8…","""5eab057d-28fa-4bcb-b1ab-a08a2c…","""2025-07-03T02:35:23.365211""","""Bordeaux St-Jean""","""Bordeaux St-Jean""",3,"""2025-08-01T02:35:23.365211""",,,,,,False
"""10bbab91-5dde-45a9-8717-df2cf2…","""64f51332-15dd-467b-9ef1-f4a32b…","""2025-10-31T14:05:16.469735""","""Lille Europe""","""Paris Gare De Lyon""",1,"""2025-11-05T14:05:16.469735""",,,,,,False


In [10]:
daily_stats = (
    df_master
    .with_columns(pl.col('timestamp').str.to_datetime()) # Convert timestamp to datetime
    .sort('timestamp')
    .group_by_dynamic('timestamp',every='1d')
    .agg([
        pl.len().alias('total_searches'),
        pl.col('is_converted').sum().alias('total_bookings'),
        pl.col('amount_eur').sum().alias('total_revenue')
    ])
    .with_columns([
        (pl.col('total_bookings') / pl.col('total_searches')).alias('conversion_rate')
    ])
)

In [11]:
display(daily_stats.head())

timestamp,total_searches,total_bookings,total_revenue,conversion_rate
datetime[μs],u32,u32,f64,f64
2025-01-14 00:00:00,38,8,682.89,0.210526
2025-01-15 00:00:00,48,15,1429.85,0.3125
2025-01-16 00:00:00,44,15,1280.15,0.340909
2025-01-17 00:00:00,38,11,743.93,0.289474
2025-01-18 00:00:00,43,13,781.3,0.302326


# **Visualization: Dual Axis Line Chart (Searches vs Conversion)**

In [14]:
fig_pulse = px.line(
    daily_stats.to_pandas(),
    x='timestamp',
    y='total_searches',
    title= 'Daily Search Volumn',
    labels = {'total_searches':'Total Searches','timestamp':'Month'}
)

fig_pulse.add_bar(
    x=daily_stats['timestamp'],
    y=daily_stats['conversion_rate'],
    name='Conversion Rate'
)

fig_pulse.show()

# **The Route Matrix (Heatmap)**

**Question:** Which routes are our bread and butter? Are there routes with high demand but low supply (or low conversion)?"

In [15]:
# Counting the searches per Origin-Destination pair
route_stats = (
    df_searches
    .group_by(['origin_station','destination_station'])
    .len()
    .rename({'len':'search_count'})
    .sort('search_count',descending=True)
)

In [17]:
display(route_stats.head())

origin_station,destination_station,search_count
str,str,u32
"""Bordeaux St-Jean""","""Strasbourg""",343
"""Nantes""","""Lille Europe""",339
"""Strasbourg""","""Marseille St-Charles""",339
"""Lyon Part-Dieu""","""Strasbourg""",335
"""Marseille St-Charles""","""Nantes""",335


In [18]:
# Pivot for Heatmap format (Origin as Rows, Destination as Columns)
route_matrix = (
    route_stats
    .pivot(values='search_count',index='origin_station',columns='destination_station')
    .fill_null(0) # Fill routes with 0 searches
)


the argument `columns` for `DataFrame.pivot` is deprecated. It was renamed to `on` in version 1.0.0.



In [19]:
display(route_matrix.head())

origin_station,Strasbourg,Lille Europe,Marseille St-Charles,Nantes,Paris Gare De Lyon,Bordeaux St-Jean,Lyon Part-Dieu
str,u32,u32,u32,u32,u32,u32,u32
"""Bordeaux St-Jean""",343,327,295,334,300,302,306
"""Nantes""",314,339,324,302,304,274,298
"""Strasbourg""",294,316,339,316,331,263,322
"""Lyon Part-Dieu""",335,325,280,310,290,328,326
"""Marseille St-Charles""",294,297,276,335,318,280,304


In [22]:
fig_routes = px.imshow(
    route_matrix.to_pandas().set_index('origin_station'),
    text_auto=True,
    title='Search Volume Heatmap: Origin vs Destination',
    color_continuous_scale = 'Viridis',
    labels= {'origin_station': 'Origin Station'}
)

fig_routes.show()

# **Price Sensitivity & Lead Time**

**Question:** Do people who book far in advance pay less? Does the price distribution differ by Ticket Class?

In [24]:
price_analysis = (
    df_master
    .filter(pl.col('is_converted')) # Only look at actual bookings
    .with_columns([
        pl.col('departure_date').str.to_datetime(), # Convert to datetime
        pl.col('timestamp').str.to_datetime()       # Convert to datetime
    ])
    .with_columns([
        (pl.col('departure_date') - pl.col('timestamp')).dt.total_days().alias('days_advance')
    ])
    .filter(pl.col('days_advance') >= 0) # A safety filter
)

In [25]:
display(price_analysis.head())

search_id,user_id,timestamp,origin_station,destination_station,passenger_count,departure_date,depature_date,booking_id,payment_status,ticket_class,amount_eur,is_converted,days_advance
str,str,datetime[μs],str,str,i64,datetime[μs],str,str,str,str,f64,bool,i64
"""529b14cb-0ed5-4ee0-9228-e4a84d…","""0f2b800f-9f10-433e-9998-06d95e…",2025-05-14 06:00:00.490315,"""Lyon Part-Dieu""","""Nantes""",2,2025-05-21 06:00:00.490315,,"""50c88dbe-a42e-4953-8510-2b4caa…","""Success""","""2nd Class""",78.09,True,7
"""9b97a019-5ba3-4c38-8c7b-57c7a7…","""dee0d056-732a-41dc-a91c-8e0fb6…",2025-12-08 09:23:02.316447,"""Paris Gare De Lyon""","""Bordeaux St-Jean""",1,2025-12-09 09:23:02.316447,,"""e57786f6-e282-4deb-a571-f48ed3…","""Success""","""2nd Class""",80.77,True,1
"""bba41b7b-b271-4dc0-b37a-a95432…","""62abe694-d577-4c54-922d-3f2341…",2025-06-03 23:12:25.705854,"""Strasbourg""","""Lyon Part-Dieu""",1,2025-06-23 23:12:25.705854,,"""338ba2f4-b2a2-4743-be91-630d60…","""Success""","""2nd Class""",133.09,True,20
"""59c62e92-f03d-4316-aaa5-e87dee…","""4cf05f28-b67f-4adc-9332-b9fb36…",2025-08-21 22:27:40.032473,"""Bordeaux St-Jean""","""Lille Europe""",2,2025-09-18 22:27:40.032473,,"""3c38a45e-1296-4153-ac6c-ecb6ed…","""Success""","""1st Class""",120.09,True,28
"""6269f53f-6d6a-47f0-b6e2-c40cd3…","""ec5f2f3d-e881-44db-a5e0-a85366…",2025-11-03 07:42:22,"""Paris Gare De Lyon""","""Paris Gare De Lyon""",1,2025-11-06 07:42:22.968098,,"""9779a681-6571-4fbe-86c0-e600fe…","""Success""","""2nd Class""",30.71,True,3


In [27]:
# Average Price per Ticket Class
class_pricing = (
    price_analysis
    .group_by('ticket_class')
    .agg([
        pl.col('amount_eur').mean().alias('avg_price'),
        pl.col('amount_eur').median().alias('median_price'),
        pl.len().alias('count')
    ])
)

In [28]:
display(class_pricing.head())

ticket_class,avg_price,median_price,count
str,f64,f64,u32
"""2nd Class""",86.77517,86.88,3199
"""1st Class""",88.20613,89.12,801


In [31]:
fig_price_distribution = px.box(
    price_analysis.to_pandas(),
    x='ticket_class',
    y='amount_eur',
    color='ticket_class',
    title='Price Distribution by Ticket Class',
    labels = {'ticket_class':'Ticket Class','amount_eur':'Amount (EUR)'}
)

fig_price_distribution.show()

# **Are last-minute tickets more expensive?**

In [36]:
fig_price = px.scatter(
    price_analysis.sample(n=1000, with_replacement=True).to_pandas(), # Sample to avoid overcrowding
    x='days_advance',
    y='amount_eur',
    color='ticket_class',
    title='Does booking early save money?',
    labels={'amount_eur':'Amount (EUR)','days_advance':'Days booked in advance','ticket_class':'Ticket Class'}
)
fig_price.show()