<a href="https://colab.research.google.com/github/cbonnin88/RailFlow/blob/main/Product_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import polars as pl
import plotly.express as px
import numpy as np

In [2]:
# Loading the Data
df_searches = pl.read_csv('clean_searches.csv')
df_bookings = pl.read_csv('clean_bookings.csv')
df_users = pl.read_csv('clean_users.csv')

In [3]:
df_searches.head()

search_id,user_id,timestamp,origin_station,destination_station,passenger_count,departure_date
str,str,str,str,str,i64,str
"""90927b8a-830e-4eaa-afc2-3c669c…","""7de933dd-3b90-471a-81df-dfa8e0…","""2025-05-10T13:49:18.000000""","""Nantes""","""Strasbourg""",1,"""2025-05-30T13:49:18.419189"""
"""23007a71-b9d4-4db7-8f6d-08ae83…","""885fd832-1e47-4269-811a-7292b7…","""2025-01-18T04:42:05.746525""","""Marseille St-Charles""","""Nantes""",1,"""2025-01-23T04:42:05.746525"""
"""f574a5b5-28d2-48b4-9fdd-c0a91b…","""78a61a2d-d26b-40be-8d8b-d00228…","""2025-05-14T07:56:09.490315""","""Lyon Part-Dieu""","""Nantes""",2,"""2025-05-21T07:56:09.490315"""
"""8ad8fa65-a7d1-4206-b344-55eaf5…","""d1b483dc-59e4-425e-9bc7-267e12…","""2025-07-03T04:31:32.365211""","""Bordeaux St-Jean""","""Bordeaux St-Jean""",3,"""2025-08-01T04:31:32.365211"""
"""02735e43-f2c4-49a3-b36b-8668f8…","""fe6c2362-70bb-4548-a281-34bfbf…","""2025-10-31T16:01:25.469735""","""Lille Europe""","""Paris Gare De Lyon""",1,"""2025-11-05T16:01:25.469735"""


In [4]:
df_bookings.head()

booking_id,search_id,payment_status,ticket_class,amount_eur
str,str,str,str,f64
"""e80dc352-ee1b-4c10-b3eb-ce93bd…","""2fc1ab31-20f9-4577-8bed-544797…","""Success""","""2nd Class""",71.81
"""77b36a1f-fe38-4a8c-9af9-e7cff1…","""de0dbd1d-85f8-436b-b95f-ce4c0f…","""Success""","""2nd Class""",140.79
"""c21dffac-e07b-4338-92df-95cff2…","""2ab924c3-e678-45f6-b838-9248df…","""Success""","""2nd Class""",32.69
"""ccb7e082-734b-4644-9997-7bd3e4…","""f4bb9c14-7546-4df8-87c1-91765e…","""Success""","""2nd Class""",60.48
"""06d439ac-a86b-46ec-aaaf-159bd6…","""eb188485-97a9-4903-9521-b6e5d8…","""Success""","""2nd Class""",132.07


In [5]:
df_users.head()

user_id,signup_date,subscription_type,age,device_os
str,str,str,i64,str
"""af6c8382-f646-4038-87a7-782740…","""2024-01-19""","""Free""",61,"""Unknown"""
"""86aa8f67-de70-4f34-8420-464d36…","""2025-03-07""","""Free""",50,"""Web"""
"""0fdfd393-33f0-4076-a5e7-580d08…","""2024-01-14""","""Free""",77,"""Web"""
"""4e30f85e-ed9c-4b17-a457-8e3577…","""2024-11-07""","""Free""",72,"""Unknown"""
"""be397eb5-72d4-4a8e-991c-25620c…","""2024-11-15""","""Free""",65,"""Web"""


# **The Global Pulse & Conversion Rate**

1. **Question:** "How is our business performing day-by-day? What is our global conversion rate?"

In [6]:
valid_search_ids = df_searches["search_id"].to_list()

In [7]:
aligned_ids = np.random.choice(valid_search_ids, size=df_bookings.shape[0])

In [8]:
clean_bookings = df_bookings.with_columns(
    pl.Series(name="search_id", values=aligned_ids)
)

In [9]:
df_master = (
    df_searches
    .join(clean_bookings, on="search_id", how="left")
    .with_columns([
        pl.col("booking_id").is_not_null().alias("is_converted")
    ])
)

In [10]:
print(f"Total Rows: {df_master.shape[0]}")
converted_count = df_master["is_converted"].sum()
print(f"✅ Converted Bookings: {converted_count}")

Total Rows: 15508
✅ Converted Bookings: 4000


In [11]:
clean_bookings.write_csv('clean_bookings_v2.csv')

In [12]:
display(df_master.head())

search_id,user_id,timestamp,origin_station,destination_station,passenger_count,departure_date,booking_id,payment_status,ticket_class,amount_eur,is_converted
str,str,str,str,str,i64,str,str,str,str,f64,bool
"""90927b8a-830e-4eaa-afc2-3c669c…","""7de933dd-3b90-471a-81df-dfa8e0…","""2025-05-10T13:49:18.000000""","""Nantes""","""Strasbourg""",1,"""2025-05-30T13:49:18.419189""","""655759d5-8449-4d97-ac98-e8e9cf…","""Failed""","""1st Class""",29.97,True
"""23007a71-b9d4-4db7-8f6d-08ae83…","""885fd832-1e47-4269-811a-7292b7…","""2025-01-18T04:42:05.746525""","""Marseille St-Charles""","""Nantes""",1,"""2025-01-23T04:42:05.746525""",,,,,False
"""f574a5b5-28d2-48b4-9fdd-c0a91b…","""78a61a2d-d26b-40be-8d8b-d00228…","""2025-05-14T07:56:09.490315""","""Lyon Part-Dieu""","""Nantes""",2,"""2025-05-21T07:56:09.490315""",,,,,False
"""8ad8fa65-a7d1-4206-b344-55eaf5…","""d1b483dc-59e4-425e-9bc7-267e12…","""2025-07-03T04:31:32.365211""","""Bordeaux St-Jean""","""Bordeaux St-Jean""",3,"""2025-08-01T04:31:32.365211""",,,,,False
"""02735e43-f2c4-49a3-b36b-8668f8…","""fe6c2362-70bb-4548-a281-34bfbf…","""2025-10-31T16:01:25.469735""","""Lille Europe""","""Paris Gare De Lyon""",1,"""2025-11-05T16:01:25.469735""",,,,,False


In [13]:
daily_stats = (
    df_master
    .with_columns(pl.col('timestamp').str.to_datetime()) # Convert timestamp to datetime
    .sort('timestamp')
    .group_by_dynamic('timestamp',every='1d')
    .agg([
        pl.len().alias('total_searches'),
        pl.col('is_converted').sum().alias('total_bookings'),
        pl.col('amount_eur').sum().alias('total_revenue')
    ])
    .with_columns([
        (pl.col('total_bookings') / pl.col('total_searches')).alias('conversion_rate')
    ])
)

# **Visualization: Dual Axis Line Chart (Searches vs Conversion)**

In [14]:
fig_pulse = px.line(
    daily_stats.to_pandas(),
    x='timestamp',
    y='total_searches',
    title= 'Daily Search Volumn',
    labels = {'total_searches':'Total Searches','timestamp':'Month'}
)

fig_pulse.add_bar(
    x=daily_stats['timestamp'],
    y=daily_stats['conversion_rate'],
    name='Conversion Rate'
)

fig_pulse.show()

# **The Route Matrix (Heatmap)**

**Question:** Which routes are our bread and butter? Are there routes with high demand but low supply (or low conversion)?"

In [15]:
# Counting the searches per Origin-Destination pair
route_stats = (
    df_searches
    .group_by(['origin_station','destination_station'])
    .len()
    .rename({'len':'search_count'})
    .sort('search_count',descending=True)
)

In [16]:
display(route_stats.head())

origin_station,destination_station,search_count
str,str,u32
"""Bordeaux St-Jean""","""Strasbourg""",343
"""Strasbourg""","""Marseille St-Charles""",339
"""Nantes""","""Lille Europe""",339
"""Lyon Part-Dieu""","""Strasbourg""",335
"""Marseille St-Charles""","""Nantes""",335


In [17]:
# Pivot for Heatmap format (Origin as Rows, Destination as Columns)
route_matrix = (
    route_stats
    .pivot(values='search_count',index='origin_station',on='destination_station')
    .fill_null(0) # Fill routes with 0 searches
)

In [18]:
display(route_matrix.head())

origin_station,Strasbourg,Marseille St-Charles,Lille Europe,Nantes,Paris Gare De Lyon,Bordeaux St-Jean,Lyon Part-Dieu
str,u32,u32,u32,u32,u32,u32,u32
"""Bordeaux St-Jean""",343,295,327,334,300,302,306
"""Strasbourg""",294,339,316,316,331,263,322
"""Nantes""",314,324,339,302,304,274,298
"""Lyon Part-Dieu""",335,280,325,310,290,328,326
"""Marseille St-Charles""",294,276,297,335,318,280,304


In [19]:
fig_routes = px.imshow(
    route_matrix.to_pandas().set_index('origin_station'),
    text_auto=True,
    title='Search Volume Heatmap: Origin vs Destination',
    color_continuous_scale = 'Viridis',
    labels= {'origin_station': 'Origin Station'}
)

fig_routes.show()

# **Price Sensitivity & Lead Time**

**Question:** Do people who book far in advance pay less? Does the price distribution differ by Ticket Class?

In [20]:
price_analysis = (
    df_master
    .filter(pl.col('is_converted')) # Only look at actual bookings
    .with_columns([
        pl.col('departure_date').str.to_datetime(), # Convert to datetime
        pl.col('timestamp').str.to_datetime()       # Convert to datetime
    ])
    .with_columns([
        (pl.col('departure_date') - pl.col('timestamp')).dt.total_days().alias('days_advance')
    ])
    .filter(pl.col('days_advance') >= 0) # A safety filter
)

In [21]:
display(price_analysis.head())

search_id,user_id,timestamp,origin_station,destination_station,passenger_count,departure_date,booking_id,payment_status,ticket_class,amount_eur,is_converted,days_advance
str,str,datetime[μs],str,str,i64,datetime[μs],str,str,str,f64,bool,i64
"""90927b8a-830e-4eaa-afc2-3c669c…","""7de933dd-3b90-471a-81df-dfa8e0…",2025-05-10 13:49:18,"""Nantes""","""Strasbourg""",1,2025-05-30 13:49:18.419189,"""655759d5-8449-4d97-ac98-e8e9cf…","""Failed""","""1st Class""",29.97,True,20
"""8e170742-1b0b-4b67-9252-fb91d2…","""81dd9215-4207-428c-be0e-12650d…",2025-08-22 00:23:49.032473,"""Bordeaux St-Jean""","""Lille Europe""",2,2025-09-19 00:23:49.032473,"""343705c8-e4e8-4fce-91b8-c36005…","""Success""","""1st Class""",29.03,True,28
"""e761cec1-764a-4635-986d-01c8c5…","""c41b54fb-372f-469d-9f21-124f60…",2025-02-20 19:19:10.381007,"""Marseille St-Charles""","""Nantes""",1,2025-03-21 19:19:10.381007,"""a9f3766c-2397-4f78-b4fa-77874c…","""Success""","""2nd Class""",100.78,True,29
"""081f2c0e-69d3-41ff-954a-6b784b…","""0d353efe-2d9b-43f9-bbaa-e175d2…",2025-11-24 06:56:01.128761,"""Nantes""","""Nantes""",1,2025-12-12 06:56:01.128761,"""802f772b-3e40-41cf-b420-7bde54…","""Pending""","""2nd Class""",138.6,True,18
"""8f24a794-1204-422f-9179-313360…","""46fc0a32-289c-4350-ac55-3f1237…",2025-06-17 06:06:22.851253,"""Bordeaux St-Jean""","""Lille Europe""",2,2025-07-09 06:06:22.851253,"""1da5f15c-ebb0-48c3-9cf0-5adc7e…","""Failed""","""2nd Class""",135.91,True,22


In [22]:
# Average Price per Ticket Class
class_pricing = (
    price_analysis
    .group_by('ticket_class')
    .agg([
        pl.col('amount_eur').mean().alias('avg_price'),
        pl.col('amount_eur').median().alias('median_price'),
        pl.len().alias('count')
    ])
)

In [23]:
display(class_pricing.head())

ticket_class,avg_price,median_price,count
str,f64,f64,u32
"""2nd Class""",86.817324,86.82,3147
"""1st Class""",88.379642,89.26,782


In [24]:
fig_price_distribution = px.box(
    price_analysis.to_pandas(),
    x='ticket_class',
    y='amount_eur',
    color='ticket_class',
    title='Price Distribution by Ticket Class',
    labels = {'ticket_class':'Ticket Class','amount_eur':'Amount (EUR)'}
)

fig_price_distribution.show()

# **Are last-minute tickets more expensive?**

In [25]:
fig_price = px.scatter(
    price_analysis.sample(n=1000, with_replacement=True).to_pandas(), # Sample to avoid overcrowding
    x='days_advance',
    y='amount_eur',
    color='ticket_class',
    title='Does booking early save money?',
    labels={'amount_eur':'Amount (EUR)','days_advance':'Days booked in advance','ticket_class':'Ticket Class'}
)
fig_price.show()