## Import Dependencies

In [13]:
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report


from sqlalchemy import create_engine
from datetime import datetime
from urllib.parse import quote_plus
import os
from dotenv import load_dotenv



In [14]:

load_dotenv()

SQLALCHEMY_DATABASE_URL = os.getenv("DATABASE_URL")
engine = create_engine(SQLALCHEMY_DATABASE_URL)

In [15]:
chapters_df = pd.read_sql("""
    SELECT 
        id AS chapter_id,
        chapter_title AS chapter_name,
        course_id,
        owner_id AS user_id
    FROM chapters
""", engine)

chapters_df


Unnamed: 0,chapter_id,chapter_name,course_id,user_id
0,1,Chapter 1: Introduction to Statistics,1,1
1,2,Types of Data,1,1
2,3,Measures of Central Tendency,1,1
3,4,Measures of Dispersion,1,1
4,5,Probability Basics,1,1
5,6,Conditional Probability,1,1
6,7,Random Variables,1,1
7,8,Probability Distributions,1,1
8,9,Sampling Techniques,1,1
9,10,Statistical Applications,1,1


In [16]:
time_df = pd.read_sql("""
    SELECT
        chapter_id,
        owner_id AS user_id,
        SUM(CASE WHEN activity_type = 'view_content' THEN duration_seconds ELSE 0 END) AS view_content,
        SUM(CASE WHEN activity_type = 'summary' THEN duration_seconds ELSE 0 END) AS time_summary,
        SUM(CASE WHEN activity_type = 'ask_question' THEN duration_seconds ELSE 0 END) AS time_ask,
        SUM(CASE WHEN activity_type = 'mcq' THEN duration_seconds ELSE 0 END) AS time_mcq,
        MAX(session_end) AS last_activity
    FROM learning_sessions
    WHERE is_valid = True
    GROUP BY chapter_id, owner_id
""", engine)

time_df


Unnamed: 0,chapter_id,user_id,view_content,time_summary,time_ask,time_mcq,last_activity
0,28,1,2325,0,0,0,2026-01-05 10:09:47+00:00
1,27,1,0,1872,0,2264,2026-01-05 10:02:06+00:00
2,50,1,2358,0,0,0,2026-01-09 17:41:46+00:00
3,21,1,0,2353,0,1978,2026-01-05 09:43:54+00:00
4,31,1,0,2429,0,2220,2026-01-05 15:35:20+00:00
5,48,1,0,2258,0,2238,2026-01-09 18:17:21+00:00
6,1,1,0,2124,0,2454,2026-01-07 17:35:59+00:00
7,32,1,2331,0,0,0,2026-01-05 15:41:10+00:00
8,51,1,0,2509,0,2294,2026-01-05 10:24:59+00:00
9,9,1,0,2203,0,2516,2026-01-07 18:18:21+00:00


In [17]:
mcq_df = pd.read_sql("""
    SELECT
        chapter_id,
        owner_id AS user_id,
        COUNT(*) AS mcq_attempts,
        AVG(score_percentage) AS mcq_avg_score,
        MAX(attempted_at) AS last_mcq_attempt
    FROM mcq_attempts
    GROUP BY chapter_id, owner_id
""", engine)

mcq_df


Unnamed: 0,chapter_id,user_id,mcq_attempts,mcq_avg_score,last_mcq_attempt
0,28,1,5,92.0,2026-01-05 18:30:00+00:00
1,27,1,5,72.0,2026-01-05 19:00:00+00:00
2,50,1,5,84.0,2026-01-10 03:05:00+00:00
3,21,1,5,72.0,2026-01-05 16:00:00+00:00
4,31,1,5,76.0,2026-01-05 22:40:00+00:00
5,48,1,5,80.0,2026-01-10 01:12:00+00:00
6,32,1,5,84.0,2026-01-05 20:59:00+00:00
7,1,1,5,88.0,2026-01-07 22:40:00+00:00
8,51,1,5,80.0,2026-01-05 21:00:00+00:00
9,9,1,5,68.0,2026-01-08 01:12:00+00:00


In [18]:
df = chapters_df \
    .merge(time_df, on=["chapter_id", "user_id"], how="left") \
    .merge(mcq_df, on=["chapter_id", "user_id"], how="left")


In [19]:
df.fillna({
    "view_content":0,
    "time_summary": 0,
    "time_ask": 0,
    "time_mcq": 0,
    "mcq_attempts": 0,
    "mcq_avg_score": 0
}, inplace=True)


In [23]:
df["total_time"] = df["time_summary"] + df["time_ask"] + df["time_mcq"] + df["view_content"]

df["score_efficiency"] = df["mcq_avg_score"] / (df["total_time"] + 1)

In [25]:
# Create final dataframe with selected columns
# Use .copy() to avoid SettingWithCopyWarning when modifying later
final_df = df[[
    "user_id",
    "course_id",
    "chapter_id",
    "chapter_name",
    "view_content",
    "time_summary",
    "time_ask",
    "time_mcq",
    "total_time",
    "mcq_attempts",
    "mcq_avg_score",
    "score_efficiency",
]].copy()

final_df


Unnamed: 0,user_id,course_id,chapter_id,chapter_name,view_content,time_summary,time_ask,time_mcq,total_time,mcq_attempts,mcq_avg_score,score_efficiency
0,1,1,1,Chapter 1: Introduction to Statistics,0,2124,0,2454,4578,5,88.0,0.019218
1,1,1,2,Types of Data,2343,0,0,0,2343,5,96.0,0.040956
2,1,1,3,Measures of Central Tendency,0,2288,0,2125,4413,5,80.0,0.018124
3,1,1,4,Measures of Dispersion,2288,0,0,0,2288,5,76.0,0.033202
4,1,1,5,Probability Basics,0,2417,0,2218,4635,5,72.0,0.015531
5,1,1,6,Conditional Probability,2450,0,0,0,2450,5,92.0,0.037536
6,1,1,7,Random Variables,0,2186,0,1972,4158,5,68.0,0.01635
7,1,1,8,Probability Distributions,2104,0,0,0,2104,5,92.0,0.043705
8,1,1,9,Sampling Techniques,0,2203,0,2516,4719,5,68.0,0.014407
9,1,1,10,Statistical Applications,2024,0,0,0,2024,5,72.0,0.035556


In [26]:
final_df["needs_attention"] = (
    (final_df["mcq_avg_score"] < 60) |
    (final_df["inactive_days"] > 7)
).astype(int)


KeyError: 'inactive_days'

In [27]:
final_df['score_efficiency'] = final_df.pop('score_efficiency')

In [28]:
final_df

Unnamed: 0,user_id,course_id,chapter_id,chapter_name,view_content,time_summary,time_ask,time_mcq,total_time,mcq_attempts,mcq_avg_score,score_efficiency
0,1,1,1,Chapter 1: Introduction to Statistics,0,2124,0,2454,4578,5,88.0,0.019218
1,1,1,2,Types of Data,2343,0,0,0,2343,5,96.0,0.040956
2,1,1,3,Measures of Central Tendency,0,2288,0,2125,4413,5,80.0,0.018124
3,1,1,4,Measures of Dispersion,2288,0,0,0,2288,5,76.0,0.033202
4,1,1,5,Probability Basics,0,2417,0,2218,4635,5,72.0,0.015531
5,1,1,6,Conditional Probability,2450,0,0,0,2450,5,92.0,0.037536
6,1,1,7,Random Variables,0,2186,0,1972,4158,5,68.0,0.01635
7,1,1,8,Probability Distributions,2104,0,0,0,2104,5,92.0,0.043705
8,1,1,9,Sampling Techniques,0,2203,0,2516,4719,5,68.0,0.014407
9,1,1,10,Statistical Applications,2024,0,0,0,2024,5,72.0,0.035556


In [29]:
X = final_df.drop(columns=["user_id", "course_id", "chapter_id", "chapter_name", "score_efficiency" ], axis=1)
y = final_df['score_efficiency']

In [30]:
X

Unnamed: 0,view_content,time_summary,time_ask,time_mcq,total_time,mcq_attempts,mcq_avg_score
0,0,2124,0,2454,4578,5,88.0
1,2343,0,0,0,2343,5,96.0
2,0,2288,0,2125,4413,5,80.0
3,2288,0,0,0,2288,5,76.0
4,0,2417,0,2218,4635,5,72.0
5,2450,0,0,0,2450,5,92.0
6,0,2186,0,1972,4158,5,68.0
7,2104,0,0,0,2104,5,92.0
8,0,2203,0,2516,4719,5,68.0
9,2024,0,0,0,2024,5,72.0


In [31]:
y

0     0.019218
1     0.040956
2     0.018124
3     0.033202
4     0.015531
5     0.037536
6     0.016350
7     0.043705
8     0.014407
9     0.035556
10    0.018555
11    0.037773
12    0.017109
13    0.035161
14    0.016146
15    0.037784
16    0.015510
17    0.038055
18    0.015004
19    0.037860
20    0.016620
21    0.043564
22    0.022310
23    0.038621
24    0.017404
25    0.039553
26    0.020446
27    0.035990
28    0.016344
29    0.036021
30    0.016242
31    0.035271
32    0.013099
33    0.031196
34    0.015551
35    0.035088
36    0.016579
37    0.046301
38    0.016571
39    0.035608
40    0.017375
41    0.044362
42    0.022638
43    0.041288
44    0.015223
45    0.043672
46    0.017790
47    0.037534
48    0.016653
49    0.024680
Name: score_efficiency, dtype: float64

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2)

In [33]:
np.random.seed(42)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9925417793357336

In [34]:
model.predict(X_test)

array([0.01617815, 0.04210549, 0.01582531, 0.03907701, 0.034736  ,
       0.03617962, 0.01694324, 0.01719517, 0.03685855, 0.01592483])

In [35]:
y_test

36    0.016579
21    0.043564
34    0.015551
43    0.041288
35    0.035088
39    0.035608
38    0.016571
48    0.016653
17    0.038055
28    0.016344
Name: score_efficiency, dtype: float64