<a href="https://colab.research.google.com/github/cbonnin88/RailFlow/blob/main/Predicting_Conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pandas scikit-learn google-cloud-bigquery db-dtypes plotly

In [2]:
# Colab-only step: authenticate the current Colab user.
# NOTE(review): the BigQuery client created later presumably relies on these credentials — confirm.
from google.colab import auth
auth.authenticate_user()

In [3]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import plotly.express as px
from sklearn.metrics import confusion_matrix
import joblib
from google.colab import files

In [4]:
# Configuration: target GCP project and the BigQuery client used to run the query.
project_id = 'railflow-484310'
client = bigquery.Client(
    location='europe-west9',
    project=project_id,
)

# **Defining The Query**

In [5]:
# One row per search: route, search-time features, booking lead time, and the
# conversion label. The table reference is built from `project_id` so the query
# and the BigQuery client always point at the same GCP project.
query = f"""
SELECT
  origin,
  destination,
  -- Extract features from timestamps
  EXTRACT(DAYOFWEEK FROM search_at) AS search_day_of_week,
  EXTRACT(HOUR FROM search_at) AS search_hour,
  DATE_DIFF(CAST(departure_date AS DATE), CAST(search_date AS DATE), DAY) AS lead_time_days,
  -- Target Variable
  is_converted
FROM `{project_id}.dbt_railflow_dev.int_search_bookings`
-- Filter out negative lead times just in case any survived cleaning
WHERE CAST(departure_date AS DATE) >= CAST(search_date AS DATE)
"""

In [6]:
# Run the query and pull the full result set into a pandas DataFrame.
df_railflow = (
    client
    .query(query)
    .to_dataframe()
)

# Preview the first rows as a sanity check.
display(df_railflow.head())

Unnamed: 0,origin,destination,search_day_of_week,search_hour,lead_time_days,is_converted
0,Bordeaux St-Jean,Bordeaux St-Jean,1,18,14,False
1,Lille Europe,Bordeaux St-Jean,1,5,22,False
2,Lille Europe,Bordeaux St-Jean,6,7,3,False
3,Lille Europe,Bordeaux St-Jean,1,8,8,False
4,Lille Europe,Bordeaux St-Jean,2,20,27,False


# **Feature Engineering**

In [7]:
# 1. Encode Categorical Variables (Origin & Destination)
# LabelEncoder keeps this demo simple; for production, OneHotEncoder would be the better choice.
# One encoder per column, kept separate so each can be reused on its own at inference time.
le_origin, le_dest = LabelEncoder(), LabelEncoder()

In [8]:
# Fit each encoder on its station column and store the integer codes in a new column.
for encoder, source_col, code_col in (
    (le_origin, 'origin', 'origin_code'),
    (le_dest, 'destination', 'dest_code'),
):
    df_railflow[code_col] = encoder.fit_transform(df_railflow[source_col])

In [9]:
# 2. Define Features (X) and Target (y)
features = [
    'origin_code',
    'dest_code',
    'search_day_of_week',
    'search_hour',
    'lead_time_days',
]
X = df_railflow.loc[:, features]
# The label is cast to integers (boolean -> 0/1) before being handed to the classifier.
y = df_railflow.loc[:, 'is_converted'].astype(int)

In [10]:
# 3. Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print('Data successfully prepped for training')

Data successfully prepped for training


# **Training the Model**

In [11]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
# `fit` returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

print('Model Trained !!')

Model Trained !!


# **Evaluating the Performance**

In [12]:
# 1. Predict a class label for every row of the held-out test set.
y_pred = rf_model.predict(X=X_test)

In [13]:
# 2. Print the confusion matrix followed by the per-class precision/recall/F1 report.
print(
    'Confusion Matrix:',
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    '\nClassification Report:',
    classification_report(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

Confusion Matrix:
[[2254  118]
 [ 530  138]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.95      0.87      2372
           1       0.54      0.21      0.30       668

    accuracy                           0.79      3040
   macro avg       0.67      0.58      0.59      3040
weighted avg       0.75      0.79      0.75      3040



In [14]:
# Keep the confusion matrix as an array so it can be plotted as a heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [15]:
# Heatmap of the confusion matrix: actual classes on the y-axis, predicted classes on the x-axis.
fig_cm = px.imshow(
    cm,
    text_auto=True,  # print the count inside each square
    aspect='auto',  # let the squares stretch to fit the figure
    labels=dict(x='Predicted', y='Actual', color='Count'),
    x=['Not Converted (0)', 'Converted (1)'],
    y=['Not Converted (0)', 'Converted (1)'],
    color_continuous_scale='Viridis_r',
    title='Confusion Matrix: Actual vs Predicted'
)

# Axis titles set here replace the ones derived from `labels` above.
fig_cm.update_layout(
    xaxis_title='Predicted Label',
    yaxis_title='Actual Label'
)
fig_cm.show()

In [16]:
# 3. Product Insight
# Rank the input features by the forest's importance scores (highest first) to see
# which signals the model leans on most when predicting a booking.
importances = (
    pd.DataFrame({'feature': features, 'importance': rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

In [17]:
# Bar chart of feature importances, one colored bar per feature, in ranked order.
fig_product_insights = px.bar(
    importances,
    x='feature',
    y='importance',
    color='feature',
    title='What drives a booking?'
)

fig_product_insights.show()

# **Inference (Making a Prediction)**

Imagine a user has just run a search in the RailFlow app.

In [18]:
# Simulated new search: Paris -> Bordeaux, made on a Friday at 2pm, 3 days before departure.
# BigQuery's DAYOFWEEK runs from 1 (Sunday) to 7 (Saturday), so Friday is 6.
# Station names go through the fitted encoders so they map to the codes seen in training.
paris_bdx_search = pd.DataFrame(
    dict(
        origin_code=le_origin.transform(['Paris Gare De Lyon']),
        dest_code=le_dest.transform(['Bordeaux St-Jean']),
        search_day_of_week=[6],
        search_hour=[14],
        lead_time_days=[3],
    )
)

In [19]:
# Predicted probability of class 1 (a booking) for the single simulated search:
# row 0 is the only sample, column 1 is the positive class.
probability = rf_model.predict_proba(paris_bdx_search)[0, 1]
print(f'Booking Probability: {probability:.0%}')

Booking Probability: 5%


In [20]:
# Map the booking probability to a marketing action:
#   above 0.7         -> no incentive needed
#   above 0.3, to 0.7 -> nudge with a discount
#   otherwise         -> do nothing
print(
    'Action: No discount needed. User likely to buy.' if probability > 0.7
    else 'Action: Send push notification with 5% off!' if probability > 0.3
    else 'Action: Ignore (Low Intent)'
)

Action: Ignore (Low Intent)


# **Exporting the Model & Encoders for Streamlit**

In [21]:
# 1. Bundle the trained model together with both fitted encoders so that whoever
#    loads the file can encode station names exactly as they were encoded for training.
artifacts = dict(
    model=rf_model,
    le_origin=le_origin,
    le_dest=le_dest,
)

In [22]:
# 2. Serialize the bundle to disk in the runtime's working directory.
joblib.dump(value=artifacts, filename='railflow_brain.joblib')

['railflow_brain.joblib']

In [23]:
# 3. Download to my computer
# Uses the Colab `files` helper on the artifact file saved in the previous step.
files.download('railflow_brain.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>