# Feature Engineering
#### Purpose:
The purpose of the file is to perform feature engineering steps on our training and validation parquet files to create a new 'final' training and validation files. These new files will be used in a different notebook for training. No columns will be dropped from the files here, only generation of new features and if needed, scaling.

In [1]:
import os
import pandas as pd
import polars as pl # used for reading in parquet files quickly
import matplotlib as plt
from pathlib import Path
from tqdm import tqdm

### Reading in data

In [2]:
train_df = pl.read_parquet(rf"./Data/train_data.parquet").to_pandas() # convert from polars to pandas 
val_df = pl.read_parquet(rf"./Data/val_data.parquet").to_pandas()

In [3]:
train_df

Unnamed: 0,student_id,timestamp,question_id,bundle_id,tags,elapsed_time,correct
0,1,1565096190868,5012,3544,74,38000,0.0
1,1,1565096221062,4706,3238,71,24000,1.0
2,1,1565096293432,4366,2898,103,68000,1.0
3,1,1565096339668,4829,3361,83,42000,0.0
4,1,1565096401774,6528,5060,90,59000,0.0
...,...,...,...,...,...,...,...
76078853,840472,1575306027437,9814,7165,136,37000,0.0
76078854,840472,1575306068437,4712,3244,71,37000,0.0
76078855,840472,1575306087437,3793,2325,82,15000,0.0
76078856,840473,1575306037437,3830,2362,106,25000,1.0


In [4]:
val_df

Unnamed: 0,student_id,timestamp,question_id,bundle_id,tags,elapsed_time,correct
0,4,1566782278107,5177,3709,74,85000,1.0
1,4,1566782311854,8104,5575,8;2;182,28000,1.0
2,4,1566782336708,4291,2823,84,22000,0.0
3,4,1566782351705,4020,2552,86,12000,1.0
4,4,1566782382870,5258,3790,87,28000,1.0
...,...,...,...,...,...,...,...
19215063,840471,1575305806437,10300,7651,76,18000,0.0
19215064,840471,1575305834437,8886,6237,83,25000,0.0
19215065,840471,1575305860437,8556,5907,85,24000,1.0
19215066,840471,1575305880437,9901,7252,74,18000,0.0


## Lecture Tags
#### Purpose:
KT-4 dataset provides information on student interactions. The interaction that we are going to be taking advantage in this case is if a student interacted with a lecture which is denoted by the 'l' char followed by a number. We plan to provide information on whether the student interacted with a lecture associated with a tag for on a question.

In [5]:
def combine_data(file_path, output_file):
    dfs = []
    csv_files = list(Path(file_path).glob("*.csv"))
    
    for csv_file in tqdm(csv_files, desc="Reading CSVs"):
        # getting student_id val
        filename = csv_file.stem # gets name before '.csv' (ex. 'u123')
        student_id = filename[1:] # removes 'u' from name
        print(f"filename: {csv_file}, student_id: {student_id}\n")        
        # scan more memory efficient then read since it does not store
        df = pl.scan_csv(csv_file).with_columns(
            pl.lit(student_id).alias("student_id"))
    
        # add column to df that will represent student_id based of file_num starting at 1
        dfs.append(df)
    
    pl.concat(dfs).sink_parquet(output_file)
    return True # file created successfully

In [6]:
file_path = "./Data/KT3/" 
file = "./Data/combined_kt3.parquet"

if not Path(file).exists():
    result = combine_data(file_path, file)
    if result: print("combined dataset file created successfuly")
    else: print("file failed to create")
else:
    print("combined_kt3.parquet dataset is present in Data folder")

kt3_df = pl.read_parquet("./Data/combined_kt3.parquet").to_pandas()
kt3_df

combined_kt3.parquet dataset is present in Data folder


Unnamed: 0,timestamp,action_type,item_id,source,user_answer,platform,student_id
0,1565096151269,enter,b3544,diagnosis,,mobile,1
1,1565096187972,respond,q5012,diagnosis,b,mobile,1
2,1565096194904,submit,b3544,diagnosis,,mobile,1
3,1565096195001,enter,b3238,diagnosis,,mobile,1
4,1565096218682,respond,q4706,diagnosis,c,mobile,1
...,...,...,...,...,...,...,...
89270649,1568964975390,enter,b3819,sprint,,mobile,9998
89270650,1568964992921,respond,q5287,sprint,c,mobile,9998
89270651,1568964996503,submit,b3819,sprint,,mobile,9998
89270652,1568964996572,enter,e3819,sprint,,mobile,9998


## Rolling Student Accuracy
We only want to use the student accuracy for questions that they've attempted so far, so this feature tracks the students rolling accuracy up to that point. We use Bayesian smoothing to prevent the students first few interactions from heavily skewing this metric

In [7]:
import polars as pl

train_df = pl.read_parquet(r".\Data\train_data.parquet")
val_df = pl.read_parquet(r".\Data\val_data.parquet")

# Baseline accuracy will be used for Bayesian smoothing - it only uses the training data
baseline_acc = train_df["correct"].mean()
print(f"Baseline accuracy: {baseline_acc:.4f}")

def add_rolling_accuracy(df, baseline_acc):

    alpha = 20 # Strength for Bayesian smoothing

    df = df.sort(["student_id", "timestamp"]).to_pandas()

    # Rolling counts
    df["interaction_number"] = df.groupby("student_id").cumcount()
    df["cumulative_correct"] = df.groupby("student_id")["correct"].cumsum()

    # Past-correct and past-interactions
    df["num_correct"] = df["cumulative_correct"] - df["correct"]
    df["num_interactions"] = df["interaction_number"]

    # Bayesian smoothed accuracy:
    df["bayes_rolling_acc"] = (
        (df["num_correct"] + alpha * baseline_acc) /
        (df["num_interactions"] + alpha)
    )
    df.drop(columns=["cumulative_correct", "num_correct", "interaction_number"], inplace=True)

    return pl.from_pandas(df)

train_df = add_rolling_accuracy(train_df, baseline_acc)
val_df = add_rolling_accuracy(val_df, baseline_acc)

#Prove that the accuracy resets between students
print(train_df.head(1085))

# Temporarily store the new columns to save memory before we read in final dataset
temp_train_num_interactions = train_df["num_interactions"]
temp_train_num_bayes_rolling_acc = train_df["bayes_rolling_acc"]
temp_val_num_interactions = val_df["num_interactions"]
temp_val_num_bayes_rolling_acc = val_df["bayes_rolling_acc"]
del train_df
del val_df

# Read in final datasets to add new features
final_train_df = pl.read_parquet(r".\Data\final_train_data.parquet")
final_val_df = pl.read_parquet(r".\Data\final_val_data.parquet")

final_train_df = final_train_df.with_columns([
    temp_train_num_interactions.alias("num_interactions"),
    temp_train_num_bayes_rolling_acc.alias("bayes_rolling_acc")
])

final_val_df = final_val_df.with_columns([
    temp_val_num_interactions.alias("num_interactions"),
    temp_val_num_bayes_rolling_acc.alias("bayes_rolling_acc")
])

# Save updated final file
final_train_df.write_parquet(r".\Data\final_train_data.parquet")
final_val_df.write_parquet(r".\Data\final_val_data.parquet")

Baseline accuracy: 0.6535
shape: (1_085, 9)
┌────────────┬───────────────┬─────────────┬───────────┬───┬──────────────┬─────────┬─────────────────┬────────────────┐
│ student_id ┆ timestamp     ┆ question_id ┆ bundle_id ┆ … ┆ elapsed_time ┆ correct ┆ num_interaction ┆ bayes_rolling_ │
│ ---        ┆ ---           ┆ ---         ┆ ---       ┆   ┆ ---          ┆ ---     ┆ s               ┆ acc            │
│ i64        ┆ i64           ┆ i64         ┆ i64       ┆   ┆ i64          ┆ f64     ┆ ---             ┆ ---            │
│            ┆               ┆             ┆           ┆   ┆              ┆         ┆ i64             ┆ f64            │
╞════════════╪═══════════════╪═════════════╪═══════════╪═══╪══════════════╪═════════╪═════════════════╪════════════════╡
│ 1          ┆ 1565096190868 ┆ 5012        ┆ 3544      ┆ … ┆ 38000        ┆ 0.0     ┆ 0               ┆ 0.653523       │
│ 1          ┆ 1565096221062 ┆ 4706        ┆ 3238      ┆ … ┆ 24000        ┆ 1.0     ┆ 1               ┆ 0.622

## Lecture Tags
#### Purpose:
KT-4 dataset provides information on student interactions. The interaction that we are going to be taking advantage in this case is if a student interacted with a lecture which is denoted by the 'l' char followed by a number. We plan to provide information on whether the student interacted with a lecture associated with a tag for on a question.

In [3]:
import polars as pl

# Read lectures metadata and keep only lecture_id and tags
lec_data = pl.read_csv("./Data/contents/lectures.csv").select(pl.col("lecture_id"), pl.col("tags"))

# Map lecture id to relevant tag
lecture_to_tags = {}
for row in lec_data.iter_rows(named=True):
    lid = str(row["lecture_id"]).strip()
    raw = row["tags"]
    if raw is None:
        continue
    tag_str = str(raw).strip()
    if not tag_str:
        continue
    lecture_to_tags[lid] = int(tag_str) if tag_str.isdigit() else tag_str

# print(lecture_to_tags)

# Get relevant student lecture interactions from KT3
student_lec_data = (
    pl.read_parquet("./Data/combined_kt3.parquet")
    .select(["student_id", "timestamp", "item_id"])
    .with_columns(pl.col("item_id").cast(pl.Utf8).str.strip_chars().alias("lecture_id"))
    .filter(pl.col("lecture_id").str.starts_with("l"))
)

# repare lecture tags in Polars for join
lec_tags_pl = (
    pl.DataFrame({
        "lecture_id": list(lecture_to_tags.keys()),
        "tag": [str(v) for v in lecture_to_tags.values()],
    })
)

# Student lecture events with tag attached
student_lec_tags = (
    student_lec_data
    .join(lec_tags_pl, on="lecture_id", how="left")
    .with_columns([
        pl.col("student_id").cast(pl.Int64),
        pl.col("tag").cast(pl.Utf8),
    ])
    .filter(pl.col("tag").is_not_null() & (pl.col("tag") != "-1"))
    .select([
        pl.col("student_id"),
        pl.col("timestamp").alias("timestamp_lecture"),
        pl.col("tag").alias("tag"),
    ])
)

def add_watched_flag(df):

    # Normalize data types
    df = (
        df.with_columns([
            pl.col("student_id").cast(pl.Int64),
            pl.col("tags").cast(pl.Utf8).alias("tag"),
        ])
        .with_row_index("row_id")
    )

    # Explode tags into multiple rows
    joined = (
        df.select(["row_id", "student_id", "timestamp", "tag"]) 
        .join(student_lec_tags, on=["student_id", "tag"], how="left")
    )

    # Determine if any lecture was watched before the question timestamp
    watched = (
        joined.select([
            pl.col("row_id"),
            (pl.col("timestamp_lecture") < pl.col("timestamp")).fill_null(False).alias("watched_before"),
        ])
        .group_by("row_id")
        .agg(pl.any("watched_before").alias("watched_lecture"))
        .with_columns(pl.col("watched_lecture").cast(pl.Int8))
    )

    #Add watched_lecture flag back to original dataframe
    out = (
        df.join(watched, on="row_id", how="left")
           .with_columns(pl.col("watched_lecture").fill_null(0))
           .drop(["row_id"])
    )
    return out

# Add to dataset
train_df = pl.read_parquet("./Data/train_data.parquet")
val_df = pl.read_parquet("./Data/val_data.parquet")

train_df = add_watched_flag(train_df)
val_df = add_watched_flag(val_df)

train_df.head()


student_id,timestamp,question_id,bundle_id,tags,elapsed_time,correct,tag,watched_lecture
i64,i64,i64,i64,str,i64,i64,str,i8
1,1565096190868,5012,3544,"""74""",38000,0,"""74""",0
1,1565096221062,4706,3238,"""71""",24000,1,"""71""",0
1,1565096293432,4366,2898,"""103""",68000,1,"""103""",0
1,1565096339668,4829,3361,"""83""",42000,0,"""83""",0
1,1565096401774,6528,5060,"""90""",59000,0,"""90""",0


In [4]:
# Sanity check to make sure watched_lecture is correctly added
print(train_df.select(pl.col("watched_lecture").sum().alias("num_ones")))
print(val_df.select(pl.col("watched_lecture").sum().alias("num_ones")))

# Store columns temporarily so we can free memory
temp_train_lec = train_df['watched_lecture']
temp_val_lec = val_df['watched_lecture']

# Give me my memory back please
del train_df
del val_df

# Read in final datasets to add new features
final_train_df = pl.read_parquet(r".\Data\final_train_data.parquet")
final_val_df = pl.read_parquet(r".\Data\final_val_data.parquet")
final_train_df = final_train_df.with_columns([
    temp_train_lec.alias("watched_lecture") 
])
final_val_df = final_val_df.with_columns([
    temp_val_lec.alias("watched_lecture") 
])

# Save updated final file
final_train_df.write_parquet(r".\Data\final_train_data.parquet")
final_val_df.write_parquet(r".\Data\final_val_data.parquet")

final_train_df.head()

shape: (1, 1)
┌──────────┐
│ num_ones │
│ ---      │
│ i64      │
╞══════════╡
│ 1574073  │
└──────────┘
shape: (1, 1)
┌──────────┐
│ num_ones │
│ ---      │
│ i64      │
╞══════════╡
│ 425259   │
└──────────┘


student_id,timestamp,question_id,bundle_id,tags,elapsed_time,correct,num_interactions,bayes_rolling_acc,watched_lecture
i64,i64,i64,i64,str,i64,i64,i64,f64,i8
1,1565096190868,5012,3544,"""74""",38000,0,0,0.653523,0
1,1565096221062,4706,3238,"""71""",24000,1,1,0.622403,0
1,1565096293432,4366,2898,"""103""",68000,1,2,0.639566,0
1,1565096339668,4829,3361,"""83""",42000,0,3,0.655237,0
1,1565096401774,6528,5060,"""90""",59000,0,4,0.627936,0


## Add Forgetful Ben Score
The "Ben" score is our metric of determining how a student has performed on the 5 most recent questions. Each question correct adds one, and each question incorrect subtracts one. So, a ben score of 5 means the student is on a 5 correct streak. 1 means that the student has answered 3 correct, and 2 incorrect, etc...

In [1]:
import os
import polars as pl

def calculate_forgetful_ben_score(df, history):
    """Calculate ben_score based only on the previous N answers within each student"""
    print("Sorting data by student_id and timestamp...")
    df = df.sort(["student_id", "timestamp"])
    
    print("Detecting student changes...")
    # Detect when student ID changes
    df = df.with_columns(
        pl.when(pl.col('student_id') != pl.col('student_id').shift(1))
        .then(1)
        .otherwise(0)
        .alias('student_changed')
    )
    
    print("Creating student groups...")
    # Create groups for each student
    df = df.with_columns(
        pl.col('student_changed').cum_sum().alias('student_group_id')
    )
    
    print(f"Computing forgetful ben_score (window of {history})...")
    # Build signed correctness (+1 / -1)
    df = df.with_columns(
        pl.when(pl.col('correct') == 1).then(1).otherwise(-1).alias('_signed_correct')
    )
    # Exclude current question: shift by 1 within each student group before rolling
    # First row per student becomes null -> filled with 0, so initial forgetful score = 0
    df = df.with_columns(
        pl.col('_signed_correct')
        .shift(1)  # exclude current
        .fill_null(0)
        .rolling_sum(window_size=history, min_periods=1)
        .over('student_group_id')
        .alias(f'forgetful_ben_score')
    )
    
    print("Cleaning up temporary columns...")
    # Drop temporary columns
    df = df.drop(['student_changed', 'student_group_id', '_signed_correct'])
    
    return df

# We split our data into 2 separate sets in the prep_data file
print("Loading training data...")
train_df = pl.read_parquet(rf".\Data\train_data.parquet")
print("Loading validation data...")
val_df = pl.read_parquet(rf".\Data\val_data.parquet")

print("\nCalculating scores for training data...")
train_df = calculate_forgetful_ben_score(train_df, 5)
print("\nCalculating scores for validation data...")
val_df = calculate_forgetful_ben_score(val_df, 5)

print("\nTraining data sample:")
print(train_df.head())

Loading training data...
Loading validation data...

Calculating scores for training data...
Sorting data by student_id and timestamp...
Detecting student changes...
Creating student groups...
Computing forgetful ben_score (window of 5)...


  .rolling_sum(window_size=history, min_periods=1)


Cleaning up temporary columns...

Calculating scores for validation data...
Sorting data by student_id and timestamp...
Detecting student changes...
Creating student groups...
Computing forgetful ben_score (window of 5)...
Cleaning up temporary columns...

Training data sample:
shape: (5, 8)
┌────────────┬──────────────┬─────────────┬───────────┬──────┬─────────────┬─────────┬─────────────┐
│ student_id ┆ timestamp    ┆ question_id ┆ bundle_id ┆ tags ┆ elapsed_tim ┆ correct ┆ forgetful_b │
│ ---        ┆ ---          ┆ ---         ┆ ---       ┆ ---  ┆ e           ┆ ---     ┆ en_score    │
│ i64        ┆ i64          ┆ i64         ┆ i64       ┆ str  ┆ ---         ┆ i64     ┆ ---         │
│            ┆              ┆             ┆           ┆      ┆ i64         ┆         ┆ i32         │
╞════════════╪══════════════╪═════════════╪═══════════╪══════╪═════════════╪═════════╪═════════════╡
│ 1          ┆ 156509619086 ┆ 5012        ┆ 3544      ┆ 74   ┆ 38000       ┆ 0       ┆ 0           │


In [2]:
temp_train_ben = train_df[f'forgetful_ben_score']
temp_val_ben = val_df[f'forgetful_ben_score']

del train_df
del val_df

# Read in final datasets to add new features
final_train_df = pl.read_parquet(r".\Data\final_train_data.parquet")
final_val_df = pl.read_parquet(r".\Data\final_val_data.parquet")

final_train_df = final_train_df.with_columns([
    temp_train_ben.alias("forgetful_ben_score")
])
final_val_df = final_val_df.with_columns([
    temp_val_ben.alias("forgetful_ben_score")
])

# Save updated final file
final_train_df.write_parquet(r".\Data\final_train_data.parquet")
final_val_df.write_parquet(r".\Data\final_val_data.parquet")

final_train_df.head()

student_id,timestamp,question_id,bundle_id,tags,elapsed_time,correct,num_interactions,bayes_rolling_acc,watched_lecture,forgetful_ben_score
i64,i64,i64,i64,str,i64,i64,i64,f64,i8,i32
1,1565096190868,5012,3544,"""74""",38000,0,0,0.653523,0,0
1,1565096221062,4706,3238,"""71""",24000,1,1,0.622403,0,-1
1,1565096293432,4366,2898,"""103""",68000,1,2,0.639566,0,0
1,1565096339668,4829,3361,"""83""",42000,0,3,0.655237,0,1
1,1565096401774,6528,5060,"""90""",59000,0,4,0.627936,0,0
