Below is a Python script that shows how to load the core OULAD tables, extract “content-based” features from the VLE interactions (i.e., resource types and click counts) together with a few auxiliary signals (active days, assignment performance), and assemble a per-student feature matrix suitable for content-based profiling in a recommender system.

In [None]:
import pandas as pd
import numpy as np

In [2]:


def load_oulad(path="../data/raw/OULAD"):
    """Read the five OULAD CSV files used by this script from *path*.

    Returns a 5-tuple of DataFrames in this order: studentInfo, vle,
    studentVle, studentAssessment, assessments.
    """
    csv_names = (
        "studentInfo.csv",
        "vle.csv",
        "studentVle.csv",
        "studentAssessment.csv",
        "assessments.csv",
    )
    # Files are read in the same order in which they are returned.
    return tuple(pd.read_csv(f"{path}/{csv_name}") for csv_name in csv_names)

In [None]:
def build_content_features(vle, student_vle):
    """
    Build a per-student content profile: total clicks per resource type.

    Joins student_vle to vle to attach activity_type to every interaction,
    then pivots so that each row is one student and each ``click_<type>``
    column holds that student's summed clicks on that activity type.

    Parameters
    ----------
    vle : DataFrame
        Resource catalogue with an ``activity_type`` column and a site key
        named either ``id`` or ``id_site``.
    student_vle : DataFrame
        Interaction log with ``id_student``, ``id_site`` and ``sum_click``.

    Returns
    -------
    DataFrame
        One row per ``id_student`` with one ``click_<activity_type>`` column
        per activity type; student/type pairs with no clicks are filled
        with 0.
    """
    # The published OULAD vle.csv keys resources by `id_site` (the same name
    # studentVle.csv uses) and has no `id` column, so hard-coding `id` raises
    # KeyError on the raw data. Keep `id` working for callers that renamed it.
    site_key = 'id' if 'id' in vle.columns else 'id_site'

    # join in the resource type
    sv = student_vle.merge(
        vle[[site_key, 'activity_type']],
        left_on='id_site', right_on=site_key,
        how='left'
    )
    # sum clicks per student × activity_type
    # NOTE: rows whose site is missing from `vle` get a NaN activity_type
    # from the left join and are dropped by groupby's default NaN handling.
    clicks_by_type = (sv
        .groupby(['id_student', 'activity_type'])
        ['sum_click']
        .sum()
        .unstack(fill_value=0)
        .add_prefix("click_")
        .reset_index()
    )
    return clicks_by_type