In [1]:
import pandas as pd

In [2]:
# Recompute average session duration per user
path="../data/raw/OULAD"
vle_df= pd.read_csv(f"{path}/vle.csv")
student_vle_df = pd.read_csv(f"{path}/studentVle.csv")
student_info_df = pd.read_csv(f'{path}/studentInfo.csv')

In [3]:
# Define refined media mapping
def map_media_bucket(activity_type):
    reading_types = {'url', 'ouwiki'}
    video_types   = {'ouelluminate', 'oucollaborate'}
    if activity_type in video_types:
        return 'video'
    elif activity_type in reading_types:
        return 'reading'
    else:
        return 'course'

In [4]:
# Merge click logs with activity metadata and apply mapping
df = student_vle_df.merge(
    vle_df[['id_site', 'activity_type']],
    on='id_site', how='left'
)
df['media_type'] = df['activity_type'].apply(map_media_bucket)

In [5]:
# Aggregate raw click counts per user and media_type
counts_df = (
    df
    .groupby(['id_student', 'media_type'])['sum_click']
    .sum()
    .unstack(fill_value=0)
    .reset_index()
)

In [6]:
#  Rename count columns (add '_count' suffix, keeping 'id_student')
media_cols = [col for col in counts_df.columns if col != 'id_student']
counts_df = counts_df.rename(columns={col: f"{col}_count" for col in media_cols})

In [7]:
#  Compute normalized proportions per user
prop_cols = [col for col in counts_df.columns if col.endswith('_count')]
props_df = counts_df[['id_student'] + prop_cols].copy()
props_df[prop_cols] = props_df[prop_cols].div(props_df[prop_cols].sum(axis=1), axis=0)
props_df = props_df.rename(columns={col: col.replace('_count', '_prop') for col in prop_cols})

In [8]:
# Compute average session duration (minutes) per user
df['date'] = pd.to_datetime(df['date'])
session_df = (
    df.groupby(['id_student', 'date'])['sum_click']
    .sum()
    .reset_index(name='clicks_per_session')
)
# assume 0.5 minutes per click
session_df['duration_min'] = session_df['clicks_per_session'] * 0.5
user_sessions = (
    session_df.groupby('id_student')['duration_min']
    .mean()
    .reset_index(name='avg_session_duration_min')
)

In [9]:
# Merge counts, proportions, demographics, and session durations
profiles = counts_df.merge(props_df, on='id_student') \
    .merge(student_info_df[['id_student', 'gender', 'region', 'highest_education', 'imd_band', 'age_band']],
           on='id_student', how='left') \
    .merge(user_sessions, on='id_student', how='left')

In [10]:
# Save the augmented profiles
out_path = "../data/processed/oulad_media_profiles_refined.csv"
profiles.to_csv(out_path, index=False)

In [11]:
print(profiles.head())

   id_student  course_count  reading_count  video_count  course_prop  \
0        6516          2648            143            0     0.948764   
1        8462           602             41           13     0.917683   
2        8462           602             41           13     0.917683   
3       11391           929              5            0     0.994647   
4       23629           161              0            0     1.000000   

   reading_prop  video_prop gender               region   highest_education  \
0      0.051236    0.000000      M             Scotland    HE Qualification   
1      0.062500    0.019817      M        London Region    HE Qualification   
2      0.062500    0.019817      M        London Region    HE Qualification   
3      0.005353    0.000000      M  East Anglian Region    HE Qualification   
4      0.000000    0.000000      F  East Anglian Region  Lower Than A Level   

  imd_band age_band  avg_session_duration_min  
0   80-90%     55<=                  8.77673