In [8]:
# Load data
import pandas as pd 
demographic_df = pd.read_csv('data/demographics.csv')
academic_df = pd.read_csv('data/academicPerformance.csv')
activities_df = pd.read_csv('data/extracurricularActivities.csv')
behavior_df = pd.read_csv('data/behavioralPatterns.csv')


In [9]:
# Rename ID columns to StudentID for consistency
demographic_df.rename(columns={'ID': 'StudentID'}, inplace=True)
academic_df.rename(columns={'Student ID': 'StudentID'}, inplace=True)

In [10]:
merged_df = demographic_df.merge(academic_df, on='StudentID') \
                          .merge(activities_df, on='StudentID') \
                          .merge(behavior_df, on='StudentID')

In [11]:
# Create performance score
merged_df["PerformanceScore"] = (
    merged_df["Attendance %"] * 0.3 +
    merged_df["Completed Assignments"] * 0.4 +
    merged_df["Time Spent On Materials (Hours)"] * 0.2 +
    merged_df["Forum Posts"] * 0.1
)

In [12]:
merged_df.head(3)

Unnamed: 0,StudentID,Age,Marital Status,Employment Status,Gender,Socioeconomic Status,Income Level,Location,District,Education Level,...,Role,Start Date,End Date,Date,Time Spent On Materials (Hours),Forum Posts,Instructor Messages,Completed Assignments,Time Spent On Forum (Hours),PerformanceScore
0,S001,22,Single,Part-time,Female,Middle,25000,Urban,Kampala,Undergraduate,...,Player,2023-09-01,2024-06-30,2025-03-01,2.5,3,2,2,0.5,30.1
1,S001,22,Single,Part-time,Female,Middle,25000,Urban,Kampala,Undergraduate,...,Player,2023-09-01,2024-06-30,2025-03-02,2.0,2,1,2,0.3,29.9
2,S001,22,Single,Part-time,Female,Middle,25000,Urban,Kampala,Undergraduate,...,Player,2023-09-01,2024-06-30,2025-03-03,2.5,3,2,2,0.6,30.1


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Make a copy of the dataset
df_enc = merged_df.copy()

# Encode categorical columns
categorical_cols = [
    "StudentID",
    "Marital Status",
    "Employment Status",
    "Gender",
    "Socioeconomic Status",
    "Location",
    "District",
    "Education Level",
    "Javascript",
    "Python",
    "HCD",
    "Communication",
    "Course Completion",
    "Activity",
    "Participation Status",
    "Role",
    "Start Date",
    "End Date",
    "Date"
]

label_encoders = {}  
for col in categorical_cols:
    le = LabelEncoder()
    df_enc[col] = le.fit_transform(df_enc[col].astype(str))  # convert to string just in case of NaN
    label_encoders[col] = le

# Define features (X) and target (y)
X = df_enc.drop(["PerformanceScore", "StudentID"], axis=1)
y = df_enc["PerformanceScore"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("Model trained successfully ")


Model trained successfully 


In [14]:
sample = X_test.iloc[0].copy()

# Original prediction
pred_old = model.predict([sample])[0]

# Modify Hours Per Week
sample["Hours Per Week"] = 10
pred_new = model.predict([sample])[0]

print(f"Original Score: {pred_old:.2f}")
print(f"New Score (10 hours): {pred_new:.2f}")
print(f"Change: {pred_new - pred_old:.2f}")


Original Score: 27.75
New Score (10 hours): 27.75
Change: 0.00




In [15]:
import shap
import plotly.graph_objects as go

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Take first student
student = X_test.iloc[0]
shap_vals = shap_values[0]

# Create Plotly bar chart of feature impacts
fig_shap = go.Figure([
    go.Bar(
        x=shap_vals,
        y=student.index,
        orientation="h",
        marker=dict(color=["red" if v < 0 else "green" for v in shap_vals])
    )
])
fig_shap.update_layout(
    title="SHAP Feature Impact (Student 0)",
    xaxis_title="Impact on Performance Score",
    yaxis_title="Features"
)
fig_shap.show()


  from .autonotebook import tqdm as notebook_tqdm
Matplotlib is building the font cache; this may take a moment.


In [16]:
import joblib
joblib.dump(model, "student_performance_model.pkl")


['student_performance_model.pkl']

In [17]:
import joblib

# Save the dictionary of LabelEncoders
joblib.dump(label_encoders, "label_encoders.pkl")


['label_encoders.pkl']