In [1]:
# Capstone Project: Student Success & Career Path Prediction

# Scenario

# The university wants to analyze student performance data to:

# Predict exam scores (Regression).
# Classify students into “At Risk” vs. “On Track” categories (Classification).
# Cluster students into groups with similar study habits (Clustering).
# Recommend interventions (extra tutoring, workshops, counseling).


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier


np.random.seed(42)
n = 300

data = {
    "study_hours": np.random.randint(1, 10, n),
    "attendance": np.random.randint(50, 100, n),
    "assignments_completion": np.random.randint(40, 100, n),
    "sleep_hours": np.random.randint(4, 9, n),
    "stress_level": np.random.randint(1, 10, n),
}

df = pd.DataFrame(data)

# Generate exam score (dependent on features)
df["exam_score"] = (
    df["study_hours"] * 5 +
    df["attendance"] * 0.3 +
    df["assignments_completion"] * 0.2 -
    df["stress_level"] * 2 +
    df["sleep_hours"] * 2 +
    np.random.normal(0, 5, n)
)


X_reg = df.drop("exam_score", axis=1)
y_reg = df["exam_score"]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

reg_model = LinearRegression()
reg_model.fit(X_train_r, y_train_r)

y_pred_r = reg_model.predict(X_test_r)

print("MAE:", mean_absolute_error(y_test_r, y_pred_r))
print("R2 Score:", r2_score(y_test_r, y_pred_r))


df["risk_label"] = np.where(df["exam_score"] < 40, 0, 1)

X_clf = df.drop(["exam_score", "risk_label"], axis=1)
y_clf = df["risk_label"]

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_c_scaled = scaler.fit_transform(X_train_c)
X_test_c_scaled = scaler.transform(X_test_c)

clf_model = LogisticRegression()
clf_model.fit(X_train_c_scaled, y_train_c)

y_pred_c = clf_model.predict(X_test_c_scaled)

print(classification_report(y_test_c, y_pred_c))

X_cluster = df[["study_hours", "attendance", "stress_level"]]

scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

kmeans = KMeans(n_clusters=3, random_state=42)
df["cluster"] = kmeans.fit_predict(X_cluster_scaled)

print(df["cluster"].value_counts())

def recommend_intervention(row):
    if row["risk_label"] == 0 and row["attendance"] < 70:
        return "Extra Tutoring"
    elif row["stress_level"] > 7:
        return "Counseling"
    elif row["assignments_completion"] < 60:
        return "Time Management Workshop"
    else:
        return "On Track – Encourage Advanced Learning"

df["intervention"] = df.apply(recommend_intervention, axis=1)

print(df[["exam_score", "risk_label", "cluster", "intervention"]].head())

MAE: 3.2949460373238546
R2 Score: 0.9183232069977294
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         6
           1       0.93      0.96      0.95        54

    accuracy                           0.90        60
   macro avg       0.71      0.65      0.67        60
weighted avg       0.89      0.90      0.89        60

cluster
2    105
0    101
1     94
Name: count, dtype: int64
   exam_score  risk_label  cluster                            intervention
0   65.647218           1        1                Time Management Workshop
1   60.723676           1        1                Time Management Workshop
2   79.357265           1        2  On Track – Encourage Advanced Learning
3   64.513298           1        0  On Track – Encourage Advanced Learning
4   72.620504           1        0  On Track – Encourage Advanced Learning
