## Prepping Files for Model Jupyter Notebook
The purpose of this notebook is to take the files in the EdNet-KT1 dataset and combine them, while also removing and changing some of the values in the columns. The result is written to a file called 'combined_dataset.csv'.

#### Data Merging Preparation
Our original dataset consists of one CSV file for each of the 784k students, which results in massive overhead when reading in data. To prevent this, we take all of the relevant data and merge it into one unified dataset that can be easily read, navigated, and edited.
# Directory that is scanned for the per-student KT1 ".csv" files
kt1_dir = "Data/KT1"
# CSV with question metadata; question_id, correct_answer, bundle_id and tags are read from it
questions_fname = "Data/contents/questions.csv"


In [None]:

# Build the question lookup table a single time; every student file is joined against it.
# The 'q' / 'b' prefixes are removed so the ids can be stored as Int32, tags are kept as
# text, and only the columns needed later (answer key, bundle, tags) are retained.
questions = pl.read_csv(questions_fname).with_columns(
    question_id=pl.col("question_id").str.replace("q", "").cast(pl.Int32),
    bundle_id=pl.col("bundle_id").str.replace("b", "").cast(pl.Int32),
    tags=pl.col("tags").cast(pl.Utf8),
).select("question_id", "correct_answer", "bundle_id", "tags")


# Full path of every ".csv" entry found directly inside the KT1 directory
student_files = [
    os.path.join(kt1_dir, entry)
    for entry in os.listdir(kt1_dir)
    if entry.endswith(".csv")
]

# Accumulator that the merge loop fills with one frame per student file
dfs = list()

### Data Fetching and Merging
Here, we take all of the information that we need from each KT1 file and combine it into a single .csv file.

In [None]:
# For each interaction, we take the student_id, question_id, bundle_id, tags, elapsed_time,
# and whether they answered correctly; all students are then merged into one CSV.

# Start from an empty accumulator so that re-running this cell does not append
# every student's rows a second time.
dfs = []

for file in tqdm(student_files, desc="Progress"):
    # Take student_id from filename, remove 'u' prefix to make it int
    student_id = int(os.path.basename(file).replace("u", "").replace(".csv", ""))

    df = (
        pl.read_csv(file)
        .with_columns([
            pl.lit(student_id).alias("student_id"),
            pl.col("question_id").str.replace("q", "").cast(pl.Int32),  # Remove 'q' prefix to make question_id int
        ])
        # Left join keeps every interaction even if its question_id is not in the lookup table
        .join(questions, on="question_id", how="left")
        .with_columns([
            # Adds 'correct' column: case- and whitespace-insensitive comparison of the
            # student's answer with the answer key, stored as Int8 (1 = match, 0 = mismatch)
            (pl.col("user_answer").str.strip_chars().str.to_lowercase() == pl.col("correct_answer").str.strip_chars().str.to_lowercase())
            .cast(pl.Int8)
            .alias("correct")
        ])
        .select(["student_id", "question_id", "bundle_id", "tags", "elapsed_time", "correct"])
    )

    dfs.append(df)

# The tag clean-up is the same for every row, so it is applied once to the merged
# frame instead of once per student file: brackets and whitespace are removed and
# ',' separators become ';' so the value does not clash with the CSV delimiter.
# NOTE(review): sorting by question_id discards each student's original row order;
# confirm the downstream model does not rely on it.
final_df = (
    pl.concat(dfs, how="vertical")
    .with_columns(
        pl.col("tags")
        .cast(pl.Utf8)
        .str.replace_all(r"\[|\]|\s", "")
        .str.replace_all(",", ";")
        .alias("tags")
    )
    # Sort by student_id, then question_id
    .sort(["student_id", "question_id"])
)

fname = "combined_dataset.csv"
final_df.write_csv(fname)
print(f"Saved {fname}")