In [1]:
import os
import polars as pl

In [2]:
# The KAGGLE_KERNEL_RUN_TYPE environment variable is used as the "running on Kaggle"
# signal; when it is unset or empty we fall back to the repo-relative data folder.
kaggle_run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE')
DATA_PATH = (
    "/kaggle/input/linking-writing-processes-to-writing-quality"
    if kaggle_run_type
    else "../../data"
)

In [3]:
# Load the keystroke logs and the per-essay scores from DATA_PATH.
logs, scores = (
    pl.read_csv(f"{DATA_PATH}/{stem}.csv")
    for stem in ("train_logs", "train_scores")
)

In [4]:
# Peek at a few random log rows (unseeded, so the rows differ between runs).
logs.sample(n=4)

id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
str,i64,i64,i64,i64,str,str,str,str,i64,i64
"""81a86064""",1140,422210,422287,77,"""Input""","""q""","""q""","""q""",377,74
"""568ceeb6""",2086,793330,793428,98,"""Input""","""Space""","""Space""",""" """,1765,313
"""54e4b9ea""",1084,1420986,1421062,76,"""Input""","""q""","""q""","""q""",649,146
"""e4898343""",30,166587,166671,84,"""Remove/Cut""","""Backspace""","""Backspace""","""q""",3,1


In [5]:
# Peek at a few random score rows (unseeded, so the rows differ between runs).
scores.sample(n=4)

id,score
str,f64
"""2d8a6af2""",4.0
"""62ff8c30""",4.5
"""6f54beab""",3.5
"""3938f736""",5.0


In [7]:
# One row per essay: last word count, span between the first key-down and the last
# key-up, and the number of logged events, joined with the essay's score.
# The frame is sorted by event_id first so that first()/last() inside each group
# refer to the lowest / highest event_id of that essay.
essays = (
    logs
    .sort("event_id")
    .group_by("id")
    .agg(
        pl.col("word_count").last(),
        duration=pl.col("up_time").last() - pl.col("down_time").first(),
        event_count=pl.count(),
    )
    .join(scores, on="id")
)
essays.sample(n=4)

id,word_count,duration,event_count,score
str,i64,i64,u32,f64
"""d3a0c370""",630,1694295,5101,5.0
"""143c8cd1""",477,1683095,3605,5.0
"""77f85738""",482,1765079,4643,4.5
"""c238b433""",517,1790266,4717,4.5


In [9]:
# Quick check of Expr.shift() with its default argument: every value moves one row
# down and the first row becomes null.
pl.DataFrame({"x": [1, 2, 3, 4]}).select(
    "x",
    shifted=pl.col("x").shift(),
)

x,shifted
i64,i64
1,
2,1.0
3,2.0
4,3.0


In [26]:
# Sort essays by score, word count and event count so that similar essays end up
# on adjacent rows. shift() then exposes the *previous* row's id and score on each
# row; a pair is kept only when both neighbours share the same score.
# The pair gets a new id (both ids concatenated), while the original id is kept
# as id_left and the neighbour's id as id_right.
same_score_pairs = (
    essays
    .sort(["score", "word_count", "event_count"])
    .with_columns(
        id_right=pl.col("id").shift(),
        score_right=pl.col("score").shift(),
    )
    # the first row has a null score_right, so the comparison is null and the row is dropped
    .filter(pl.col("score") == pl.col("score_right"))
    .select(
        (pl.col("id") + pl.col("id_right")).alias("id"),
        "id_right",
        "score",
        pl.col("id").alias("id_left"),
    )
)

print(f"Additional training samples with same scores: {len(same_score_pairs)*2}")
same_score_pairs.head(5)

Additional training samples with same scores: 4918


id,id_right,score,id_left
str,str,f64,str
"""c3663a2d315bda…","""315bdafd""",0.5,"""c3663a2d"""
"""3bda31e6c3663a…","""c3663a2d""",0.5,"""3bda31e6"""
"""1ebb9b743bda31…","""3bda31e6""",0.5,"""1ebb9b74"""
"""40b285081ebb9b…","""1ebb9b74""",0.5,"""40b28508"""
"""95acfe175a3f0d…","""5a3f0d07""",1.0,"""95acfe17"""


In [27]:
# Pair essays whose scores differ by 0.5 and whose lengths are as close as possible.
#
# worse_essays:  score is shifted up by 0.25 (original kept in original_score).
# better_essays: the same frame with score shifted down by 0.5, i.e. original - 0.25.
# An essay with original score s in worse_essays therefore shares its "score" key
# (s + 0.25) with essays of original score s + 0.5 in better_essays, and that shared
# key becomes the score of the pair.
#
# word_event_count is word_count plus event_count scaled by its maximum, so the
# event count only contributes a fractional part to an otherwise word-count ordering.
worse_essays = (
    essays
    .with_columns(
        pl.col("score").alias("original_score"),
        pl.col("score").add(0.25),
        pl.col("word_count")
        .add(pl.col("event_count") / pl.col("event_count").max())
        .alias("word_event_count"),
    )
    .sort("word_event_count")
)

better_essays = worse_essays.with_columns(pl.col("score").sub(0.5))

mixed_score_pairs = (
    worse_essays
    # within each score key, take the better essay with the nearest word_event_count
    .join_asof(better_essays, on="word_event_count", by="score", strategy="nearest")
    # essays without any partner in their score key come back with a null id_right
    .drop_nulls("id_right")
    .select(
        (pl.col("id") + pl.col("id_right")).alias("id"),
        "id_right",
        "score",
        pl.col("id").alias("id_left"),
    )
)

print(f"Additional training samples with mixed scores: {len(mixed_score_pairs)*2}")
mixed_score_pairs.sample(n=5)

Additional training samples with mixed scores: 4868


id,id_right,score,id_left
str,str,f64,str
"""361ba892118357…","""1183579c""",3.25,"""361ba892"""
"""473d1159793e50…","""793e5089""",3.25,"""473d1159"""
"""f3fcf7ad5327ff…","""5327ff63""",4.75,"""f3fcf7ad"""
"""8cfca5271acb2b…","""1acb2b61""",4.75,"""8cfca527"""
"""6b2a19374e21e1…","""4e21e149""",5.25,"""6b2a1937"""


In [66]:
# Left half of every pair: the log rows of the id_left essay whose event_id lies
# strictly below half of that essay's highest event_id. After the join, "id" is the
# pair id, so the window maximum is taken per pair.
same_score_pairs_left_logs = (
    same_score_pairs
    .join(logs, left_on="id_left", right_on="id")
    .filter(pl.col("event_id") < pl.col("event_id").max().over("id") / 2)
    .with_columns(side=pl.lit("left"))
)
same_score_pairs_left_logs.height

4190760

In [68]:
# Right half of every pair: the log rows of the id_right essay whose event_id lies
# strictly above half of that essay's highest event_id. After the join, "id" is the
# pair id, so the window maximum is taken per pair.
same_score_pairs_right_logs = (
    same_score_pairs
    .join(logs, left_on="id_right", right_on="id")
    .filter(pl.col("event_id") > pl.col("event_id").max().over("id") / 2)
    .with_columns(side=pl.lit("right"))
)
same_score_pairs_right_logs.height

4164102

In [118]:
# Stitch the "left" half of one essay and the "right" half of another into a single
# synthetic log per pair id:
#   1. renumber the right-hand event_ids so they continue directly after the left ones,
#   2. rebuild down_time and word_count on the right side as running sums of
#      row-to-row deltas, offset by the left side's maxima,
#   3. recompute up_time from the new down_time and drop all helper columns.
new_logs = (
    pl.concat([same_score_pairs_left_logs, same_score_pairs_right_logs])
    .with_columns(
        # per pair: highest left event_id and lowest right event_id
        pl.col("event_id").filter(pl.col("side").eq("left")).max().over("id").alias("max_left_event_id"),
        pl.col("event_id").filter(pl.col("side").eq("right")).min().over("id").alias("min_right_event_id"),
    )
    .with_columns(
        # left rows keep their event_id; right rows are re-based to start at max_left + 1
        pl.when(pl.col("side").eq("left"))
        .then(pl.col("event_id"))
        .otherwise(pl.col("event_id")-pl.col("min_right_event_id")+pl.col("max_left_event_id")+1).alias("event_id")
    )
    .sort("id", "event_id")
    .with_columns(
        # NOTE: shift() runs over the whole frame, not per pair, so the first row of a
        # pair is compared with the last row of the previous pair. Only right-side
        # deltas are consumed below; this assumes every pair has at least one left row,
        # so that the first row of each pair is a left row.
        (pl.col("down_time")-pl.col("down_time").shift()).alias("time_delta"),
        (pl.col("word_count")-pl.col("word_count").shift()).alias("word_delta")
    )
    .with_columns(
        # negative gaps are replaced by the mean of all non-negative gaps in the frame
        # (a global mean, truncated to an integer by the cast)
        pl.when(pl.col("time_delta").lt(0))
        .then(pl.col("time_delta").filter(pl.col("time_delta").ge(0)).mean())
        .otherwise(pl.col("time_delta")).cast(pl.Int64).alias("time_delta"),
        # word_delta is floored at zero before it is accumulated
        pl.col("word_delta").clip(lower_bound=0)
    )
    .with_columns(
        # running totals per (pair, side) plus the left-side maxima used as offsets
        pl.col("time_delta").cumsum().over("id", "side").alias("side_time"),
        pl.col("down_time").filter(pl.col("side").eq("left")).max().over("id").alias("max_left_down_time"),
        pl.col("word_delta").cumsum().over("id", "side").alias("side_word"),
        pl.col("word_count").filter(pl.col("side").eq("left")).max().over("id").alias("max_left_word_count"),
    )
    .with_columns(
        # right rows continue from where the left side ended; left rows are unchanged
        pl.when(pl.col("side").eq("right"))
        .then(pl.col("side_time")+pl.col("max_left_down_time"))
        .otherwise(pl.col("down_time"))
        .alias("down_time"),
        pl.when(pl.col("side").eq("right"))
        .then(pl.col("side_word")+pl.col("max_left_word_count"))
        .otherwise(pl.col("word_count"))
        .alias("word_count")
    )
    .with_columns(
        # keep up_time consistent with the rebuilt down_time
        (pl.col("down_time") + pl.col("action_time")).alias("up_time")
    )
    # Only helper columns are removed. cursor_position and the other log columns are
    # passed through unchanged.
    .drop("max_left_event_id", "min_right_event_id", "side_time", "max_left_down_time", "time_delta", "side", "id_right", "id_left", "word_delta", "side_word", "max_left_word_count")
)

In [119]:
# Inspect a small event_id window of one stitched pair.
new_logs.filter(
    pl.col("id").str.starts_with("95acfe175a")
    & pl.col("event_id").is_between(210, 220)
)

id,score,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
str,f64,i64,i64,i64,i64,str,str,str,str,i64,i64
"""95acfe175a3f0d…",1.0,210,277844,277899,55,"""Input""","""q""","""q""","""q""",180,35
"""95acfe175a3f0d…",1.0,211,278020,278061,41,"""Input""","""q""","""q""","""q""",181,35
"""95acfe175a3f0d…",1.0,212,278204,278267,63,"""Input""","""Space""","""Space""",""" """,182,35
"""95acfe175a3f0d…",1.0,213,279461,279523,62,"""Input""","""q""","""q""","""q""",183,36
"""95acfe175a3f0d…",1.0,214,280012,280091,79,"""Input""","""q""","""q""","""q""",127,36
"""95acfe175a3f0d…",1.0,215,280140,280236,96,"""Input""","""q""","""q""","""q""",128,36
"""95acfe175a3f0d…",1.0,216,280324,280412,88,"""Input""","""q""","""q""","""q""",129,36
"""95acfe175a3f0d…",1.0,217,283173,283229,56,"""Input""",""".""",""".""",""".""",130,36
"""95acfe175a3f0d…",1.0,218,283349,283405,56,"""Input""","""Space""","""Space""",""" """,131,36
"""95acfe175a3f0d…",1.0,219,288199,288294,95,"""Nonproduction""","""CapsLock""","""CapsLock""","""NoChange""",131,36
