In [1]:
# Personalized Learning Model for Neurodivergent Students 
# Data Preprocessing 

# Carly Carroll

In [9]:
##### LOAD LIBRARIES ##### 

import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [10]:
##### LOAD CSV FILES #####

# All paths are relative, so the files must sit in the working directory.

# Comma-delimited files (pandas default separator).
student_info = pd.read_csv("studentInfo.csv")
student_registration = pd.read_csv("studentRegistration.csv")
student_assessment = pd.read_csv("studentAssessment.csv")
student_vle = pd.read_csv("studentVle.csv")
assessments = pd.read_csv("assessments.csv")
courses = pd.read_csv("courses.csv")
vle = pd.read_csv("vle.csv")
student_mental_health = pd.read_csv("Student Mental health.csv")

# Semicolon-delimited files.
student_mat = pd.read_csv("student-mat.csv", sep=';')
student_por = pd.read_csv("student-por.csv", sep=';')

In [11]:
##### MERGE DATASETS #####

### merge student-mat and student-por ###

# Columns used as the join key between the two files.
merge_cols = (
    ["school", "sex", "age", "address", "famsize", "Pstatus"]
    + ["Medu", "Fedu", "Mjob", "Fjob"]
    + ["reason", "nursery", "internet"]
)

# Inner join (the pd.merge default): a row survives only when its key values
# occur in both frames. Any other column present in both frames is kept twice,
# distinguished by the _math / _port suffix.
student_mat_por = student_mat.merge(
    student_por,
    how="inner",
    on=merge_cols,
    suffixes=("_math", "_port"),
)


### merge student_info, student_registration, student_assessment, and student_vle ###

# Composite key identifying one student's enrolment in one module presentation.
enrolment_keys = ["id_student", "code_module", "code_presentation"]

# merge student_info and student_registration
engagement_profile = pd.merge(
    student_info,
    student_registration,
    on=enrolment_keys,
    how="left"
)

# Attach the module/presentation of each submitted assessment before joining.
# Joining on id_student alone would copy every assessment a student ever
# submitted onto each of that student's enrolment rows, regardless of module.
# NOTE(review): assumes student_assessment carries id_assessment (but not
# code_module / code_presentation) and that assessments maps each
# id_assessment to exactly one module presentation -- confirm against the
# dataset schema. validate= raises MergeError if that mapping is not unique.
assessment_keys = assessments[
    ["id_assessment", "code_module", "code_presentation"]
].drop_duplicates()
student_assessment_keyed = pd.merge(
    student_assessment,
    assessment_keys,
    on="id_assessment",
    how="left",
    validate="many_to_one"
)

# merge in student_assessment (now on the full enrolment key)
engagement_profile = pd.merge(
    engagement_profile,
    student_assessment_keyed,
    on=enrolment_keys,
    how="left"
)

# Merge in student_vle
# NOTE(review): this is a row-level join. If student_vle holds several rows per
# enrolment, each of them is paired with each assessment row of that enrolment,
# so the row count multiplies. Consider aggregating clicks per enrolment first
# if only totals are needed downstream.
engagement_profile = pd.merge(
    engagement_profile,
    student_vle,
    on=enrolment_keys,
    how="left"
)

In [12]:
##### DATA CLEANING #####

### student_mat_por ###

# Two-level category labels and the 0/1 code that replaces each of them.
binary_map = {
    'yes': 1, 'no': 0,
    'F': 0, 'M': 1,
    'U': 1, 'R': 0,
    'LE3': 0, 'GT3': 1,
    'T': 1, 'A': 0,
}
for col in student_mat_por.columns:
    values = student_mat_por[col]
    # Only text columns containing at least one known label are recoded.
    if values.dtype != 'object':
        continue
    if not values.isin(binary_map.keys()).any():
        continue
    # Labels missing from binary_map keep their original value.
    student_mat_por[col] = values.map(binary_map).fillna(values)

# One-hot encode whatever text columns remain after the binary recoding.
nominal_cols = [name for name, dtype in student_mat_por.dtypes.items() if dtype == 'object']
student_mat_por = pd.get_dummies(student_mat_por, columns=nominal_cols)

# Remove exact duplicate rows, then rows with any missing value.
student_mat_por.drop_duplicates(inplace=True)
student_mat_por.dropna(inplace=True)

# Standardize the grade columns (names starting with G1/G2/G3) and every
# column whose name contains 'absences'.
grade_prefixes = ('G1', 'G2', 'G3')
scale_cols = [
    col for col in student_mat_por.columns
    if 'absences' in col or col.startswith(grade_prefixes)
]
scaler = StandardScaler()
scaler.fit(student_mat_por[scale_cols])
student_mat_por[scale_cols] = scaler.transform(student_mat_por[scale_cols])

### engagement_profile ###

# Drop exact duplicate rows, then drop rows in which score AND sum_click are
# both missing (a row with either value present is kept).
activity_cols = ['score', 'sum_click']
engagement_profile = (
    engagement_profile
    .drop_duplicates()
    .dropna(subset=activity_cols, how='all')
)

### student_mental_health ###

# Discard rows with any missing value.
student_mental_health.dropna(inplace=True)

# Trim surrounding whitespace from every string cell; other values pass through.
# DataFrame.applymap is deprecated since pandas 2.1, so the same element-wise
# function is applied column by column with Series.map, which works on both
# older and newer pandas versions.
student_mental_health = student_mental_health.apply(
    lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
)

# binary encode categorical yes/no and gender
mental_map = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
for col in student_mental_health.columns:
    # Only text columns containing at least one known label are recoded;
    # labels missing from mental_map keep their original value.
    if student_mental_health[col].dtype == 'object' and student_mental_health[col].isin(mental_map.keys()).any():
        student_mental_health[col] = student_mental_health[col].map(mental_map).fillna(student_mental_health[col])

### courses / vle / assessments ###

# Remove exact duplicate rows from each table, modifying the frames in place.
for table in (courses, vle, assessments):
    table.drop_duplicates(inplace=True)

# assessments additionally loses every row that has a missing value.
assessments.dropna(inplace=True)

In [13]:
##### FEATURE ENGINEERING #####

### grade features ###

# NOTE(review): the G1/G2/G3 columns were standardized by the StandardScaler in
# the cleaning cell, so the averages, gains and spreads below are expressed in
# z-score units rather than raw grade points -- confirm this is intended.

# mean of the three period grades per subject
student_mat_por["avg_grade_math"] = student_mat_por[["G1_math", "G2_math", "G3_math"]].mean(axis=1)
student_mat_por["avg_grade_port"] = student_mat_por[["G1_port", "G2_port", "G3_port"]].mean(axis=1)

# change from the first to the final period grade
student_mat_por["grade_gain_math"] = student_mat_por["G3_math"] - student_mat_por["G1_math"]
student_mat_por["grade_gain_port"] = student_mat_por["G3_port"] - student_mat_por["G1_port"]

# spread of the three period grades
student_mat_por["grade_std_math"] = student_mat_por[["G1_math", "G2_math", "G3_math"]].std(axis=1)
student_mat_por["grade_std_port"] = student_mat_por[["G1_port", "G2_port", "G3_port"]].std(axis=1)

### support features ###

# find columns like schoolsup_math, etc.

support_keywords = ['schoolsup', 'famsup', 'paid', 'internet']
support_cols_math = [col for col in student_mat_por.columns if any(key in col and 'math' in col for key in support_keywords)]
support_cols_port = [col for col in student_mat_por.columns if any(key in col and 'port' in col for key in support_keywords)]

# 'internet' is one of the merge keys, so it exists once, without a
# _math/_port suffix, and is never matched by the two filters above. Pick up
# such un-suffixed keyword columns by exact name so they count in both scores.
support_cols_shared = [key for key in support_keywords if key in student_mat_por.columns]

# map yes/no to 1/0
# The cleaning cell already recodes yes/no columns to 1/0. Passing an
# already-numeric column through {'yes': 1, 'no': 0} would turn every value
# into NaN (wiping the column and forcing both scores to 0), so only columns
# that still hold text are mapped here.
for col in support_cols_math + support_cols_port + support_cols_shared:
    if not pd.api.types.is_numeric_dtype(student_mat_por[col]):
        student_mat_por[col] = student_mat_por[col].map({'yes': 1, 'no': 0})

# number of support sources available per subject
student_mat_por["support_score_math"] = student_mat_por[support_cols_math + support_cols_shared].sum(axis=1)
student_mat_por["support_score_port"] = student_mat_por[support_cols_port + support_cols_shared].sum(axis=1)

### behavioral features ###

student_mat_por["avg_studytime"] = student_mat_por[["studytime_math", "studytime_port"]].mean(axis=1)
student_mat_por["avg_traveltime"] = student_mat_por[["traveltime_math", "traveltime_port"]].mean(axis=1)

# average alcohol use (every column whose name contains Dalc or Walc)
alcohol_cols = [col for col in student_mat_por.columns if 'Dalc' in col or 'Walc' in col]
student_mat_por["avg_alcohol_use"] = student_mat_por[alcohol_cols].mean(axis=1)

# family relationship average
# Only computed when exactly two famrel columns are found; otherwise the
# avg_famrel column is not created.
famrel_cols = [col for col in student_mat_por.columns if 'famrel' in col]
if len(famrel_cols) == 2:
    student_mat_por["avg_famrel"] = student_mat_por[famrel_cols].mean(axis=1)

In [7]:
##### SAVE MERGED/CLEANED DATASETS AS CSV FILES #####

# Output file name -> frame. Files are written in this order, into the working
# directory, without the index column.
cleaned_outputs = {
    'student_mat_por_clean.csv': student_mat_por,
    'engagement_profile_clean.csv': engagement_profile,
    'student_mental_health_clean.csv': student_mental_health,
    'courses_clean.csv': courses,
    'vle_clean.csv': vle,
    'assessments_clean.csv': assessments,
}
for csv_name, frame in cleaned_outputs.items():
    frame.to_csv(csv_name, index=False)