In [2]:
import os
import random
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Reproducibility: one shared seed for every random number generator used here
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Pick the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

Using device: cpu


In [11]:
# from regression.ipynb

# Raw survey responses; the path is relative to the notebook's working directory.
data = pd.read_csv("../student_success_survey.csv")

# Step 1: Add the script's directory to the Python path.
# The folder is resolved relative to the current working directory, and is only
# appended once so re-running this cell does not grow sys.path.
feature_engineer_path = os.path.abspath(os.path.join('..', 'feature_engineer(task3)'))
if feature_engineer_path not in sys.path:
    sys.path.append(feature_engineer_path)

# Step 2: Import from the project-local .py file.
# This import has to stay below the sys.path change above, otherwise the module
# cannot be found; every other import lives in the notebook's first cell.
from feature_engineer import run_full_pipeline, engineer_behavioral_features

# Feature-engineered frame used by every later cell.
df = run_full_pipeline(data)

def _encode_with_mapping(series, mapping, column_name):
    """Map ``series`` through ``mapping`` and reject labels the mapping does not know.

    Missing values stay missing. Any other label that is absent from ``mapping``
    raises ``ValueError`` instead of silently turning into NaN.
    """
    encoded = series.map(mapping)
    unknown = series[series.notna() & encoded.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Unexpected value(s) in column '{column_name}': "
            f"{sorted(map(str, unknown))}. Expected one of {list(mapping)}."
        )
    return encoded


def prepare_data(df, target_col='final_course_score'):
    """
    Build the model's feature matrix (and target) from the engineered survey frame.

    Args:
        df: DataFrame containing the columns 'cgpa', 'prereq_ct_grade',
            'used_pytorch_tensorflow', 'laptop_or_cloud_ready', 'total_grit_score',
            'hidden_knowledge_score', 'study_friction_index',
            'python_confidence_gap' and 'pillar year'. It is not modified.
        target_col: name of the target column. Pass ``None`` to prepare
            unlabelled data (e.g. at prediction time); ``y`` is then ``None``.

    Returns:
        (X, y): ``X`` holds the numeric feature columns followed by the one-hot
        'pillar_*' and 'year_*' columns; ``y`` is ``df[target_col]`` or ``None``.

    Raises:
        ValueError: if a grade or Yes/No column contains a non-missing label that
            the corresponding mapping does not cover.
        KeyError: if a required column (or a non-None ``target_col``) is absent.
    """
    # Work on a copy so the caller's frame is left untouched
    df_processed = df.copy()

    # 1. Encode 'prereq_ct_grade' as a number (grades have a natural order)
    grade_mapping = {
        'Exempted/Di': 3.75, 'C+ or lower': 2.5,  'B-': 2.7, 'B': 3.5,
        'B+': 4.0, 'A-': 4.5, 'A/A+': 5  # Average of A and A+
    }
    df_processed['prereq_ct_grade_encoded'] = _encode_with_mapping(
        df_processed['prereq_ct_grade'], grade_mapping, 'prereq_ct_grade'
    )

    # 2. Encode binary categorical variables
    binary_mapping = {'Yes': 1, 'No': 0}
    for raw_col in ('used_pytorch_tensorflow', 'laptop_or_cloud_ready'):
        df_processed[f'{raw_col}_enc'] = _encode_with_mapping(
            df_processed[raw_col], binary_mapping, raw_col
        )

    # 3. Split 'pillar year' into its two parts: the first word is the pillar,
    #    and the year is 'final' when the text mentions it, otherwise '3rd'
    df_processed['pillar'] = df_processed['pillar year'].apply(lambda x: x.split()[0])
    df_processed['year'] = df_processed['pillar year'].apply(lambda x: 'final' if 'final' in x else '3rd')

    # One-hot encode pillar and year. The resulting columns depend on which
    # categories occur in ``df``, so they can differ between datasets.
    pillar_dummies = pd.get_dummies(df_processed['pillar'], prefix='pillar')
    year_dummies = pd.get_dummies(df_processed['year'], prefix='year')

    feature_cols = ['cgpa', 'prereq_ct_grade_encoded', 'used_pytorch_tensorflow_enc',
                    'laptop_or_cloud_ready_enc', 'total_grit_score', 'hidden_knowledge_score',
                    'study_friction_index', 'python_confidence_gap']

    # Numeric features first, then the dummy columns
    X = pd.concat([df_processed[feature_cols], pillar_dummies, year_dummies], axis=1)
    y = None if target_col is None else df_processed[target_col]

    return X, y

# Step 1: Data Preprocessing

X, y = prepare_data(df)

# Split the data, reusing the notebook-wide SEED so there is a single place to change it
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Scale the features (important for neural networks).
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to float32 PyTorch tensors. torch.tensor copies its input, which avoids
# the warning the legacy torch.FloatTensor constructor emits for pandas-backed arrays.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).reshape(-1, 1)

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

BATCH_SIZE = 2  # samples per optimisation step
# Only the training batches are shuffled; the test order stays fixed
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Step 2: Define the Linear Model
class LinearRegressionModel(nn.Module):
    """Linear regression written as a single fully connected layer.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One affine map from ``input_dim`` features to a single output value
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        prediction = self.linear(x)
        return prediction

# Get input dimension: one model input per column of the scaled training matrix
input_dim = X_train_scaled.shape[-1]
model = LinearRegressionModel(input_dim=input_dim)

# Step 3: Training setup
# Mean squared error is the regression loss; Adam updates the model's parameters
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-2)

# Step 4: Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=100, log_every=20):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: the module to optimise; it is switched to training mode each epoch.
        train_loader: iterable of ``(batch_X, batch_y)`` pairs that supports ``len()``.
        criterion: loss function called as ``criterion(predictions, batch_y)``.
        optimizer: optimizer that updates ``model``'s parameters.
        epochs: number of passes over ``train_loader``.
        log_every: print the epoch loss every ``log_every`` epochs; ``0`` or
            ``None`` disables printing. The default keeps the original cadence.

    Returns:
        A list with one entry per epoch: the mean of the per-batch loss values.
    """
    train_losses = []
    num_batches = len(train_loader)

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0

        for batch_X, batch_y in train_loader:
            # Clear gradients left over from the previous step before backprop
            optimizer.zero_grad()

            # Forward pass
            loss = criterion(model(batch_X), batch_y)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Average over batches, not samples
        avg_loss = running_loss / num_batches
        train_losses.append(avg_loss)

        if log_every and epoch % log_every == 0:
            print(f'Epoch [{epoch}/{epochs}], Loss: {avg_loss:.4f}')

    return train_losses

# Train the model and keep the per-epoch losses for later inspection
print("Training the model...")
train_losses = train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=100,
)

  y_train_tensor = torch.FloatTensor(y_train.values).reshape(-1, 1)


Training the model...
Epoch [20/100], Loss: 2907.7500
Epoch [40/100], Loss: 1425.3096
Epoch [60/100], Loss: 554.6607
Epoch [80/100], Loss: 161.4061
Epoch [100/100], Loss: 68.0416


In [21]:
def _build_recommendation(grit_score, study_friction):
    """Compose the advice sentence for one student from their grit and friction scores."""
    suggestions = []

    if grit_score < 2.5:
        suggestions.append("consider additional mentoring and have a structured study routine")
    elif grit_score < 3.8:
        suggestions.append("have more consistent practice and weekly planning")

    if study_friction > 7.5:
        suggestions.append("find a routine that reduces travelling time or increase your planned study hours")

    if not suggestions:
        # No rule fired: give generic advice rather than the bare "You may want to"
        suggestions.append("speak with your instructor about additional support")

    return "You may want to " + " and ".join(suggestions)


def get_at_risk_students(model, scaler, data_df, threshold=50, verbose=True):
    """Predict scores for ``data_df`` and return the students flagged as at-risk.

    Args:
        model: trained torch module that maps the scaled features to one score.
        scaler: fitted scaler exposing ``transform`` (and, when it was fitted on a
            DataFrame, ``feature_names_in_``).
        data_df: frame accepted by ``prepare_data``; it must also contain
            'total_grit_score' and 'study_friction_index'. It is not modified.
        threshold: a student is at-risk when the predicted score is strictly
            below this value.
        verbose: print a short progress message after each stage.

    Returns:
        A copy of the at-risk rows of ``data_df`` with two extra columns,
        'predicted_score' and 'recommendation'.
    """
    # preprocess new data
    X_new, _ = prepare_data(data_df)

    # The one-hot columns depend on the categories present in ``data_df``. Align
    # them with the columns the scaler was fitted on: categories missing here
    # become all-zero columns, and categories unseen at fit time are dropped.
    fitted_columns = getattr(scaler, "feature_names_in_", None)
    if fitted_columns is not None:
        X_new = X_new.reindex(columns=fitted_columns, fill_value=0)

    if verbose:
        print("prepared data")

    # scale, then build the input tensor on whichever device the model lives on
    X_new_scaled = scaler.transform(X_new)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    X_new_tensor = torch.tensor(X_new_scaled, dtype=torch.float32, device=device)

    if verbose:
        print("scaled data")

    # predict (eval mode, no gradient tracking)
    model.eval()
    with torch.no_grad():
        predictions = model(X_new_tensor)

    if verbose:
        print("predicted data")

    scored_df = data_df.copy()
    # .cpu() is a no-op for CPU models and is required before .numpy() on GPU ones
    scored_df['predicted_score'] = predictions.cpu().numpy().flatten()

    # filter, at-risk if pred score < threshold
    at_risk_df = scored_df[scored_df['predicted_score'] < threshold].copy()

    if verbose:
        print("filtered")

    # generate one recommendation per at-risk student
    at_risk_df['recommendation'] = [
        _build_recommendation(grit_score, study_friction)
        for grit_score, study_friction in zip(
            at_risk_df['total_grit_score'], at_risk_df['study_friction_index']
        )
    ]

    return at_risk_df
    

In [22]:
# Score every student in the engineered frame and preview the flagged ones
risk_df = get_at_risk_students(model=model, scaler=scaler, data_df=df)
risk_df.head(5)

prepared data
scaled data
predicted data
filtered


Unnamed: 0,cgpa,is_cgpa_missing,prereq_ct_grade,used_pytorch_tensorflow,laptop_or_cloud_ready,total_grit_score,hidden_knowledge_score,study_friction_index,is_logistics_missing,python_confidence_gap,pillar year,final_course_score,predicted_score,recommendation
46,3.5,0,B-,No,No,2.333333,0,1.833333,0,1.9044,ESD final year,36.6,44.590885,You may want to consider additional mentoring ...
84,3.24,0,B+,No,Yes,1.666667,0,0.380952,0,2.0,EPD 3rd year student,46.7,47.860924,You may want to consider additional mentoring ...
185,3.01,0,C+ or lower,No,Yes,1.666667,0,0.645833,0,2.0,EPD final year,45.4,43.001621,You may want to consider additional mentoring ...
216,3.28,0,B,No,Yes,2.166667,0,0.291667,0,2.0,ISTD final year,31.4,46.271435,You may want to consider additional mentoring ...


# Reflection

We prioritise recall over precision because, in this setting, a false negative is more harmful than a false positive. High recall means that few at-risk students are missed (few false negatives), while high precision means that few students are flagged unnecessarily (few false positives). Failing to alert a student who is genuinely at risk is worse than alerting a student who turns out not to be at risk, so high recall is preferred.