In [None]:
import os
import polars as pl
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Pick the data directory: the Kaggle input mount when KAGGLE_KERNEL_RUN_TYPE
# is set to a non-empty value (expected inside Kaggle kernels), otherwise the
# repository-relative data folder.
kaggle_run_type = os.environ.get("KAGGLE_KERNEL_RUN_TYPE")
DATA_PATH = (
    "/kaggle/input/linking-writing-processes-to-writing-quality"
    if kaggle_run_type
    else "../../data"
)

In [None]:
# Lazy scans: these only build query plans; the CSVs are read when a
# downstream query is collected.
# Training inputs: the event logs and the per-essay scores.
logs, scores = (
    pl.scan_csv(f"{DATA_PATH}/train_logs.csv"),
    pl.scan_csv(f"{DATA_PATH}/train_scores.csv"),
)

# Event logs of the essays to predict.
test_logs = pl.scan_csv(f"{DATA_PATH}/test_logs.csv")

In [None]:
def _safe_ratio(numerator, denominator):
    """Build the expression ``numerator / denominator``, yielding 0.0 where the denominator is 0 or null.

    Plain division would produce inf/NaN for zero denominators, and
    scikit-learn estimators reject non-finite inputs.
    """
    return (
        pl.when(denominator != 0)
        .then(numerator / denominator)
        .otherwise(0.0)
    )


def generate_features(logs: "pl.LazyFrame") -> "pl.LazyFrame":
    """Build one row of features per essay from its event log.

    Args:
        logs: Lazy frame with (at least) the columns ``id``, ``event_id``,
            ``activity``, ``text_change``, ``word_count``, ``down_time`` and
            ``up_time``.

    Returns:
        A lazy frame with one row per ``id`` (every id present in ``logs``)
        and the columns, in this order:

        * ``word_count``: word count of the event with the highest ``event_id``
        * ``write_duration``: ``max(up_time) - min(down_time)``
        * ``event_count``: number of logged events
        * ``frequency``: ``write_duration / event_count`` (mean time per
          event, despite the name)
        * ``total_chars`` and one column per counted character class: net
          counts, i.e. occurrences in inserted text minus occurrences in
          removed text
        * ``avg_word_length``, ``avg_sentence_length``, ``non_word_char_pct``:
          ratios of the above, 0.0 where the denominator is 0
    """
    # Regex pattern per counted character class, in output-column order.
    # Raw strings keep the regex escapes from being read as (invalid) Python
    # string escapes.
    patterns = {
        # assumes the logs anonymise every word character as "q" -- TODO confirm
        "word_chars": "q",
        "full_stops": r"\.",
        "commas": ",",
        "line_breaks": "\n",
        "hyphens": "-",
        "question_marks": r"\?",
        "semicolons": ";",
        "colons": ":",
        "exclamation_marks": "!",
    }
    count_cols = ["total_chars", *patterns]

    activity = pl.col("activity")
    text_change = pl.col("text_change")
    is_replace = activity.eq("Replace")
    # A Replace event logs "<removed> => <inserted>". splitn always yields a
    # struct with exactly two fields (field_0, field_1), so the schema is
    # known lazily and does not depend on which rows the data contains.
    replace_parts = text_change.str.splitn(" => ", 2)

    counts = (
        logs
        # only consider text changing activities
        .filter(activity.ne("Nonproduction"))
        .with_columns(
            # inserted text: Input, Paste and the inserted half of Replace
            pl.when(activity.eq("Input") | activity.eq("Paste"))
            .then(text_change)
            .when(is_replace)
            .then(replace_parts.struct.field("field_1"))
            .alias("input"),
            # removed text: Remove/Cut and the removed half of Replace
            pl.when(activity.eq("Remove/Cut"))
            .then(text_change)
            .when(is_replace)
            .then(replace_parts.struct.field("field_0"))
            .alias("remove"),
        )
        # concat all text changes for each essay
        .group_by("id")
        .agg(
            pl.col("input").drop_nulls().str.concat(""),
            pl.col("remove").drop_nulls().str.concat(""),
        )
        # one row per (id, "input" | "remove") with the text in "value"
        .melt(id_vars="id")
        # count characters and punctuation marks
        .with_columns(
            # len_chars counts characters; len_bytes would over-count
            # multi-byte (non-ASCII) text
            pl.col("value").str.len_chars().alias("total_chars"),
            *[
                pl.col("value").str.count_matches(pattern).alias(name)
                for name, pattern in patterns.items()
            ],
        )
        # subtract counts of removed text from counts of input text; the
        # counts are cast to a signed type first so negating them cannot wrap
        .with_columns(
            pl.col(count_cols).cast(pl.Int64)
            * pl.when(pl.col("variable").eq("remove")).then(-1).otherwise(1)
        )
        .group_by("id")
        .agg(pl.col(count_cols).sum())
    )

    word_chars = pl.col("word_chars")
    total_chars = pl.col("total_chars")
    # characters treated as sentence terminators: . ? ! :
    sentence_ends = (
        pl.col("full_stops")
        + pl.col("question_marks")
        + pl.col("exclamation_marks")
        + pl.col("colons")
    )

    features = (
        logs
        # rows keep their relative order inside each group, so last() below
        # is the value at the highest event_id
        .sort("event_id")
        .group_by("id")
        .agg(
            pl.col("word_count").last(),
            (pl.col("up_time").max() - pl.col("down_time").min()).alias("write_duration"),
            pl.col("event_id").count().alias("event_count"),
        )
        .with_columns(
            (pl.col("write_duration") / pl.col("event_count")).alias("frequency")
        )
        # left join: an essay with only Nonproduction events has no row in
        # `counts` but must still get a feature row (and hence a prediction)
        .join(counts, on="id", how="left")
        .with_columns(pl.col(count_cols).fill_null(0))
        .with_columns(
            _safe_ratio(word_chars, pl.col("word_count")).alias("avg_word_length"),
            _safe_ratio(word_chars, sentence_ends).alias("avg_sentence_length"),
            _safe_ratio(total_chars - word_chars, total_chars).alias("non_word_char_pct"),
        )
    )

    return features
    

In [None]:
# Training table: per-essay features joined to their scores on "id", then
# materialized. The default (inner) join keeps only ids present on both sides.
train_features_lazy = generate_features(logs)
train_features = train_features_lazy.join(scores, on="id").collect()

In [None]:
# Fit a random forest regressor on every column except "id" and "score".
# A fixed random_state makes the fitted forest, and therefore the
# predictions, reproducible across runs.
model = RandomForestRegressor(random_state=42)
X_train = train_features.select(pl.exclude("id", "score")).to_numpy()
# scikit-learn expects a 1-D target; selecting a one-column frame would give
# an (n, 1) column vector and trigger a DataConversionWarning.
y_train = train_features.get_column("score").to_numpy()
model.fit(X_train, y_train)

In [None]:
# Build the feature table for the test logs with the same function used for
# training, then materialize it.
test_features = (
    generate_features(test_logs)
    .collect()
)

In [None]:
# Predict a score for every test essay. The feature matrix is built with the
# same column selection as for training (everything except "id" and "score").
X_test = (
    test_features
    .select(pl.exclude("id", "score"))
    .to_numpy()
)
test_predictions = model.predict(X_test)
y_test = test_predictions.flatten()

In [None]:
# Assemble the submission table: one row per test essay holding its id and
# the predicted score (predictions are in the same row order as test_features).
predicted_score = pl.Series(y_test).alias("score")
submission = test_features.with_columns(predicted_score).select("id", "score")

In [None]:
# Write the submission table (columns "id" and "score") as CSV to
# "submission.csv" in the current working directory.
submission.write_csv("submission.csv")