In [96]:
import os
import polars as pl

In [2]:
# Pick the data directory: the Kaggle input folder when the
# KAGGLE_KERNEL_RUN_TYPE environment variable holds a non-empty value,
# otherwise the repository-relative local data folder.
kaggle_run_type = os.getenv('KAGGLE_KERNEL_RUN_TYPE')
DATA_PATH = (
    "/kaggle/input/linking-writing-processes-to-writing-quality"
    if kaggle_run_type
    else "../../data"
)

In [3]:
# Read train_logs.csv from DATA_PATH into a polars DataFrame.
logs = pl.read_csv(f"{DATA_PATH}/train_logs.csv")

In [4]:
# Peek at three randomly chosen rows; no seed is passed, so the rows shown
# can differ from run to run.
logs.sample(3)

id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
str,i64,i64,i64,i64,str,str,str,str,i64,i64
"""be43ca0e""",1753,1307425,1307498,73,"""Input""","""Space""","""Space""",""" """,1305,239
"""fc76950f""",2840,751285,751343,58,"""Input""","""q""","""q""","""q""",1320,326
"""f95ee168""",2515,1650719,1650779,60,"""Input""","""q""","""q""","""q""",2334,425


In [217]:
def reconstruct_essay_text(single_log, strict_checks=False):
    """Rebuild an essay's text by replaying the events of its keystroke log.

    Rows whose ``activity`` is ``"Nonproduction"`` are dropped; the remaining
    rows are applied one after another in the order they appear in
    ``single_log`` (the caller is assumed to pass the events of a single
    essay in chronological order).

    Handled activities (any other activity leaves the text untouched):

    * ``"Input"`` / ``"Paste"`` -- insert ``text_change`` so that it ends at
      ``cursor_position``.
    * ``"Remove/Cut"`` -- delete ``len(text_change)`` characters starting at
      ``cursor_position``.
    * ``"Replace"`` -- ``text_change`` has the form
      ``"<removed> => <inserted>"``; ``len(<removed>)`` characters are
      swapped for ``<inserted>``, which ends at ``cursor_position``.
    * ``"Move From [a, b] To [c, d]"`` -- cut ``essay[a:b]`` and insert
      ``text_change`` at index ``c`` of the shortened text.

    Args:
        single_log: polars DataFrame with at least the columns ``activity``,
            ``text_change`` and ``cursor_position``.
        strict_checks: Currently not read by the function; kept so existing
            callers that pass it keep working.

    Returns:
        The reconstructed essay text as a ``str``.

    Raises:
        ValueError: If a ``"Replace"`` row's ``text_change`` contains no
            ``" => "`` separator, or a ``"Move From ..."`` activity does not
            follow the ``[a, b] To [c, d]`` layout.
    """
    move_prefix = "Move From "
    events = (
        single_log
        .filter(pl.col("activity").ne("Nonproduction"))
        .select("activity", "text_change", "cursor_position")
    )

    essay_text = ""
    for activity, text_change, pos in events.iter_rows():
        # A null text_change is treated as "no text" instead of crashing in len().
        text_change = text_change or ""

        if activity in ("Input", "Paste"):
            # The cursor sits just after the inserted text, so the insertion
            # starts len(text_change) characters earlier. Clamp at 0: a
            # negative start would make the slices wrap around to the end of
            # the string.
            start = max(pos - len(text_change), 0)
            essay_text = f"{essay_text[:start]}{text_change}{essay_text[start:]}"
        elif activity == "Remove/Cut":
            essay_text = f"{essay_text[:pos]}{essay_text[pos + len(text_change):]}"
        elif activity == "Replace":
            # Split on the first separator only, so replacement text that
            # itself contains " => " no longer breaks the unpacking.
            removed_text, input_text = text_change.split(" => ", 1)
            start = max(pos - len(input_text), 0)
            essay_text = (
                f"{essay_text[:start]}{input_text}"
                f"{essay_text[start + len(removed_text):]}"
            )
        elif activity.startswith(move_prefix):
            move_from, move_to = activity[len(move_prefix):].split(" To ")
            start_pos, end_pos = (int(x) for x in move_from[1:-1].split(", "))
            target_pos, _ = (int(x) for x in move_to[1:-1].split(", "))
            # Cut the source span first; target_pos then indexes into the
            # shortened text.
            essay_text = f"{essay_text[:start_pos]}{essay_text[end_pos:]}"
            essay_text = f"{essay_text[:target_pos]}{text_change}{essay_text[target_pos:]}"
    return essay_text

In [218]:
# Map each essay id to its reconstructed text.
essays = {}

# Split the log into one frame per essay in a single pass instead of
# re-filtering the whole log once per id. maintain_order=True keeps the essays
# in order of first appearance, so the dict order is stable between runs.
for single_log in logs.partition_by("id", maintain_order=True):
    essay_id = single_log["id"][0]
    essay_text = reconstruct_essay_text(single_log)
    essays[essay_id] = essay_text

essays

{'840c0b4c': 'qqqq qq qqq qqqqq qq qqqq qqqqqqqqq qq qqqqqqq qqqqqqq qqqqqqq qqqqqqqqqqq qq  qqqqqqqqqqq? qq qq qqqqqqq, q qqqqq qqqq qqqqq qqqqqqqqqqq qq q qqqq qqq qq qqqq qqqqqqq, qqq qqqq qqq qq qqqqqqq qqqqqqqqqqq. qq qqqq qqqq, qqqqqqqqqqq qq qqqq qqqq qq qqqq qq qq qq qqqqqqq, qqq qqq qq qqq qqqq qq qqqqqqq. qq qqq qqqqq qqqqqq, qqqq qqq qqqq qqqq qq qqqqq qqqq qqqq qqqq qqq qqqqqq qq qqqq qqq qqqq qqq, qq qqqqqq qqq qqqq qqqq qqq qq qqq qqqqq. qq, qq qqq qqqq qqq qq qq q qqq, qq qq q qqqqq qqq qqqq qqqqqqq qqqq qqqqqqq qq qqqqq qqqqqqqqqqq.\n\nqqq qqqq qqqqqqq qqqq qqqqqqq qq qqqqq qqqqqqqqqqq, qqqqqqq qq qqqqq qqqq qqqq qqqq qqq qqqqqq qq qq q qqq, qq qqqq qq q qqqqq. qqqq, qq qqqqq qqqq qqq qqq qqq qqq qqqqq qqqqqqq qqqqq, qqq qqqq qqqqq qqqq qqq qqqqq. qq qq qqq qqqqqq qq qqq qqqqqqqqqqq qqqqqqq qqq qqq qqq qqqqqq qqqq qqqqqq, qq qqq qqq qqqqqqq qqqqqqq qqqqqq.\n\nqq qq qqqqqqq, q qqqqq qqq qqq qqqqqqq qqqq qqqqqqq qq qqqqq qqqqqqqqqqq, qqq qqqqqqq, qqq qqq qq q qqqq qq q qq

In [250]:
# Delete every "q" and every space in one pass, leaving only the remaining
# characters (punctuation and line breaks) of this essay.
essays["e58d9b29"].translate(str.maketrans("", "", "q "))

',,.,,,-,.\n\n,...,,.\n\n,,...\n\n-.,-.-.-.\n\n..,,,,.\n\n....,-,-.\n\n,,,.,,,-,.,.'

In [251]:
# Delete every "q" and every space in one pass, leaving only the remaining
# characters (punctuation and line breaks) of this essay.
essays["5b3282ec"].translate(str.maketrans("", "", "q "))

",.,,,.?.\n\n,.,.,.'.\n\n,..,,,.\n\n,...,,.\n"