<a href="https://colab.research.google.com/github/cbonnin88/meditrack/blob/main/ml_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import plotly.express as px
import gdown as gd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

In [2]:
url_users ='https://drive.google.com/uc?id=1gLZgyIa17KoZsMrwP0-rg6cEFeAoD44O'
url_events = 'https://drive.google.com/uc?id=1DT00olR4KK_zMYXtMEm4q1LViasRc1Gn'
url_subs = 'https://drive.google.com/uc?id=1qNJJ8e8p-VbZ0HkfceOhqxoHsTFGEXGl'

In [3]:
gd.download(url_subs,'meditrack_subs_clean.csv',quiet=True)

'meditrack_subs_clean.csv'

In [4]:
gd.download(url_events,'meditrack_events_clean.csv',quiet=True)

'meditrack_events_clean.csv'

In [5]:
gd.download(url_users,'meditrack_users_clean.csv',quiet=True)

'meditrack_users_clean.csv'

In [6]:
df_users = pl.read_csv('meditrack_users_clean.csv')
df_events = pl.read_csv('meditrack_events_clean.csv')
df_subs = pl.read_csv('meditrack_subs_clean.csv')

In [7]:
df_users.head()

user_id,signup_date,age,email,country,acquisition_channel
str,str,i64,str,str,str
"""bdd640fb-0667-4ad1-9c80-317fa3…","""2025-03-07""",58,"""unknown@meditrack.coom""","""Denmark""","""Google"""
"""1a3d1fa7-bc89-40a9-a3b8-c1e939…","""2025-08-01""",32,"""garzaanthony@example.org""","""Ireland""","""Instagram"""
"""a28defe3-9bf0-4273-9247-6f57a5…","""2025-01-28""",52,"""frankgray@example.net""","""Denmark""","""Instagram"""
"""3602f8ac-10f1-4c81-848a-aa9e66…","""2025-10-27""",54,"""jason76@example.net""","""France""","""Google"""
"""81f631d4-a392-41a7-9777-a4774c…","""2025-07-07""",70,"""nathanielmartin@example.net""","""Denmark""","""Referral"""


In [8]:
df_events.head()

event_id,user_id,event_name,event_timestamp,platform
str,str,str,str,str
"""3d7d9e1e-c7e1-46e3-bf71-d4015f…","""233ffc82-92ec-49af-93ae-c376c3…","""app_open""","""2025-09-30""","""iOS"""
"""9f130820-0f3b-45f9-9345-c526a4…","""8aa9d39a-726f-46fd-8ec9-8bf100…","""book_apt""","""2025-05-22""","""Android"""
"""baedb7ab-965e-45d5-8735-93d177…","""ef32f72a-a4b5-4723-82fb-28690d…","""app_open""","""2025-08-23""","""Android"""
"""b1a33f6d-e3db-4202-9516-f81c43…","""4c6694ab-57b5-4de2-b1ae-84bec7…","""video_call""","""2025-07-21""","""Web"""
"""19664504-6e7e-4978-869d-7b0161…","""b4adaf89-0ff1-4a9f-ba37-0623bc…","""book_apt""","""2025-08-05""","""Android"""


In [9]:
df_subs.head()

user_id,monthly_fee,churned
str,f64,i64
"""bdd640fb-0667-4ad1-9c80-317fa3…",10.95,0
"""1a3d1fa7-bc89-40a9-a3b8-c1e939…",10.95,0
"""a28defe3-9bf0-4273-9247-6f57a5…",20.95,0
"""3602f8ac-10f1-4c81-848a-aa9e66…",20.95,1
"""81f631d4-a392-41a7-9777-a4774c…",10.95,1


# **Churn Prediction Model**

In [10]:
# Feature A: Activity Level ( How many events did they do ?)
event_counts = df_events.group_by('user_id').len().rename({'len':'total_events'})

In [11]:
# Feature B: Last Platform Used (Did they struggle on Web ?)
last_platform = (
    df_events.sort('event_timestamp', descending=True)
    .unique(subset=['user_id'], keep='first')
    .select(['user_id','platform'])
)

In [12]:
# Join everything into one big table
df_final = (
    df_users
    .join(df_subs,on='user_id',how='inner')
    .join(event_counts, on='user_id',how='left')
    .join(last_platform, on='user_id',how='left')
    .with_columns(pl.col('total_events').fill_null(0))
)

In [13]:
# Convert to Pandas for Scikit-Learn
df_pandas = df_final.to_pandas()

In [14]:
# Select only the columns we need for prediction
# We DROP 'user_id', 'email','name' because they dont predict churn
features = ['age','acquisition_channel','monthly_fee','total_events','platform']
target = 'churned'

X_raw = df_pandas[features]
y = df_pandas[target]

In [15]:
# One-Hot Encoding: Convert Text (Facebook, IOS) into numbers (0 or 1)
X = pd.get_dummies(X_raw, columns=['acquisition_channel','platform'], drop_first=True)

In [17]:
# Splitting Data (Train vs Test)
# I will hide  30% of data to test the model later
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [20]:
# Training the Model
print('Training Random Forest Model...')
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

Training Random Forest Model...


In [23]:
# Evaluate the Model
predictions = model.predict(X_test)
print('\n---Model Performance Report---')
print(classification_report(y_test,predictions))


---Model Performance Report---
              precision    recall  f1-score   support

           0       0.47      0.45      0.46       137
           1       0.53      0.55      0.54       154

    accuracy                           0.50       291
   macro avg       0.50      0.50      0.50       291
weighted avg       0.50      0.50      0.50       291



In [24]:
# Feature Importance (The 'Why')
importances = pd.DataFrame({
    'Feature':X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance',ascending=True)

In [27]:
fig_model = px.bar(
    importances,
    x='Importance',
    y='Feature',
    orientation='h',
    title='What Drives User Churn? (Feature Importance)',
    color='Importance',
    color_continuous_scale='Viridis'
)

fig_model.show()

# **Recommendation Engine**

In [55]:
bookings = df_events.filter(pl.col('event_name') == 'book_apt')

In [75]:
# Assigning a random doctor_id to each bookin to simulate a real database
random.seed(42)
doctor_ids = [f'Dr_{random.randint(1,20)}' for _ in range(bookings.height)]

display(doctor_ids)

['Dr_4',
 'Dr_1',
 'Dr_9',
 'Dr_8',
 'Dr_8',
 'Dr_5',
 'Dr_4',
 'Dr_18',
 'Dr_3',
 'Dr_19',
 'Dr_14',
 'Dr_2',
 'Dr_1',
 'Dr_3',
 'Dr_7',
 'Dr_8',
 'Dr_17',
 'Dr_20',
 'Dr_1',
 'Dr_18',
 'Dr_7',
 'Dr_18',
 'Dr_14',
 'Dr_8',
 'Dr_15',
 'Dr_19',
 'Dr_9',
 'Dr_1',
 'Dr_6',
 'Dr_14',
 'Dr_11',
 'Dr_9',
 'Dr_5',
 'Dr_7',
 'Dr_11',
 'Dr_4',
 'Dr_3',
 'Dr_13',
 'Dr_4',
 'Dr_12',
 'Dr_12',
 'Dr_20',
 'Dr_9',
 'Dr_2',
 'Dr_15',
 'Dr_18',
 'Dr_4',
 'Dr_13',
 'Dr_3',
 'Dr_18',
 'Dr_10',
 'Dr_20',
 'Dr_12',
 'Dr_19',
 'Dr_7',
 'Dr_3',
 'Dr_2',
 'Dr_8',
 'Dr_10',
 'Dr_3',
 'Dr_8',
 'Dr_4',
 'Dr_13',
 'Dr_9',
 'Dr_15',
 'Dr_12',
 'Dr_6',
 'Dr_12',
 'Dr_12',
 'Dr_7',
 'Dr_9',
 'Dr_3',
 'Dr_20',
 'Dr_6',
 'Dr_18',
 'Dr_8',
 'Dr_6',
 'Dr_15',
 'Dr_13',
 'Dr_9',
 'Dr_18',
 'Dr_8',
 'Dr_11',
 'Dr_2',
 'Dr_8',
 'Dr_2',
 'Dr_11',
 'Dr_13',
 'Dr_9',
 'Dr_3',
 'Dr_7',
 'Dr_19',
 'Dr_11',
 'Dr_7',
 'Dr_16',
 'Dr_13',
 'Dr_15',
 'Dr_5',
 'Dr_9',
 'Dr_5',
 'Dr_8',
 'Dr_18',
 'Dr_18',
 'Dr_9',
 'Dr_19',
 'Dr_14'

In [78]:
# Adding these doctor IDs to our booking data
bookings = bookings.with_columns(pl.Series(name='doctor_id', values=doctor_ids))

display(bookings.head())

event_id,user_id,event_name,event_timestamp,platform,doctor_id
str,str,str,str,str,str
"""9f130820-0f3b-45f9-9345-c526a4…","""8aa9d39a-726f-46fd-8ec9-8bf100…","""book_apt""","""2025-05-22""","""Android""","""Dr_4"""
"""19664504-6e7e-4978-869d-7b0161…","""b4adaf89-0ff1-4a9f-ba37-0623bc…","""book_apt""","""2025-08-05""","""Android""","""Dr_1"""
"""fbb64c04-0dc5-4a75-a835-3c7e3e…","""20bd1422-0b0b-41de-be61-2d6605…","""book_apt""","""2025-11-20""","""iOS""","""Dr_9"""
"""2b0d5b1c-027d-4e84-9b0e-09ffb6…","""69b6175e-76c5-4d2b-a9fd-d358cb…","""book_apt""","""2024-12-21""","""Android""","""Dr_8"""
"""6cddab56-8c30-4073-a8e4-c8dedd…","""5fb698a5-e920-4b3a-a951-c8a7cd…","""book_apt""","""2025-05-21""","""Android""","""Dr_8"""


In [58]:
# Creating The Matrix: Count now many times User U booked Doctor D
# Convert to Pandas for the pivot table (easier for matrices)
df_bookings = bookings.to_pandas()

In [59]:
user_item_matrix = df_bookings.pivot_table(
    index='user_id',
    columns='doctor_id',
    aggfunc='size',
    fill_value=0
)

print('User-Item Matrix Created:')
print(user_item_matrix.head())

User-Item Matrix Created:
doctor_id                             Dr_1  Dr_10  Dr_11  Dr_12  Dr_13  Dr_14  \
user_id                                                                         
006ed6e3-6fa1-4735-b572-f3d00b5cea6a     0      0      0      0      0      0   
0123a348-638a-4df6-adf7-2cd3979aa051     0      0      0      1      0      0   
013ca998-2b2b-458c-ada5-8dadf56aabc0     0      0      0      0      0      1   
01a0960b-ba89-434b-b224-7627c1e0248e     0      1      0      1      0      0   
020e2e6a-bcc8-409b-98da-ae8cf9f7e004     0      0      0      0      0      0   

doctor_id                             Dr_15  Dr_16  Dr_17  Dr_18  Dr_19  Dr_2  \
user_id                                                                         
006ed6e3-6fa1-4735-b572-f3d00b5cea6a      0      0      0      0      0     0   
0123a348-638a-4df6-adf7-2cd3979aa051      0      0      1      0      0     0   
013ca998-2b2b-458c-ada5-8dadf56aabc0      0      0      0      0      0     0   
0

In [92]:
# Calculate Similarity (The Math)
# We calculate the Cosine Similarity between USERS
# How similar is User A's booking history to User B's ?

user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index
)

display(user_similarity_df.head())

user_id,006ed6e3-6fa1-4735-b572-f3d00b5cea6a,0123a348-638a-4df6-adf7-2cd3979aa051,013ca998-2b2b-458c-ada5-8dadf56aabc0,01a0960b-ba89-434b-b224-7627c1e0248e,020e2e6a-bcc8-409b-98da-ae8cf9f7e004,02223c10-f82f-4dc3-9c35-730b49d52453,0262bf90-3bd8-4e22-8b0f-ab8e9fa4ce21,02ff9754-5291-4b6d-a592-4c9f29714f1f,030b23db-bfbe-4ca3-b610-350b76f53b65,03e7dd21-1d4d-46d6-860d-113d2851a8be,...,fd595978-7965-42e3-b9b0-94ac4d2d8efd,fd5cd74e-8778-4be6-9b29-f76ec8880b2c,fd888cbd-a804-4463-8d36-f5e492a6f3e6,fde6849b-468b-42af-9ca4-364d5d9ad9f1,fec88839-ec35-43de-b414-e3e1a3871e92,fed024cb-b6f3-4bd4-8409-b7c8ef59a4b5,fef72410-5528-4c45-a37a-a99961ff1d9a,ff12fe28-b840-4136-867e-4b72316e8645,ff5376a4-bb48-4bba-ac73-5be00ceeddc2,ffc9ff30-9f9a-4a33-9ad6-ff7be4af65aa
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
006ed6e3-6fa1-4735-b572-f3d00b5cea6a,1.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.408248,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0123a348-638a-4df6-adf7-2cd3979aa051,0.0,1.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.632456
013ca998-2b2b-458c-ada5-8dadf56aabc0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01a0960b-ba89-434b-b224-7627c1e0248e,0.408248,0.408248,0.0,1.0,0.0,0.0,0.0,0.0,0.333333,0.57735,...,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.774597
020e2e6a-bcc8-409b-98da-ae8cf9f7e004,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0


# **The Recommendation Function**

In [93]:
def recommend_doctors(target_user_id, n_recommendations=3):
  if target_user_id not in user_item_matrix.index:
    return 'User has no booking history (Cold Start)'

  sim_scores = user_similarity_df[target_user_id]
  similar_users = sim_scores.sort_values(ascending=False).index[1:4]

  recommended_docs = {}

  for peer in similar_users:
    # What did the peer book?
    peer_books = user_item_matrix.loc[peer]

    # Filter for doctors the peer booked (>0)
    docs = peer_books[peer_books > 0].index.tolist()

    for doc in docs:
      # Only recommend if our targetr user HAS NOT book this doctor yet
      if user_item_matrix.loc[target_user_id, doc] == 0:
          recommended_docs[doc] = recommended_docs.get(doc,0) + 1

  sorted_recs = sorted(recommended_docs.items(), key=lambda x:x[1], reverse=True)
  return [doc[0] for doc in sorted_recs[:n_recommendations]]

In [94]:
# Testing the Engine
# Picking a random user from our matrix

test_user = user_item_matrix.index[0]
recs = recommend_doctors(test_user)

In [95]:
print(f'\n--- Recommendations for User {test_user[:8]}... ---')
print(f'Based on similar users, they should book: {recs}')


--- Recommendations for User 006ed6e3... ---
Based on similar users, they should book: []
