In [None]:
import os
import polars as pl
from sklearn.linear_model import LinearRegression

In [None]:
# Choose where the competition CSVs live: the Kaggle input directory when the
# KAGGLE_KERNEL_RUN_TYPE environment variable is set to a non-empty value,
# otherwise a data folder relative to this notebook.
kaggle_run_type = os.environ.get("KAGGLE_KERNEL_RUN_TYPE")
DATA_PATH = (
    "/kaggle/input/linking-writing-processes-to-writing-quality"
    if kaggle_run_type
    else "../../data"
)

In [None]:
# Build the training table: one row of aggregate features per `id` taken from
# the training log, joined with that id's score.
train_logs = pl.scan_csv(f"{DATA_PATH}/train_logs.csv")
train_scores = pl.scan_csv(f"{DATA_PATH}/train_scores.csv")

# Per-id aggregates. The log is sorted by event_id before grouping, so
# `last()` takes the word_count from the final row of each group in that order.
last_word_count_expr = pl.col("word_count").last()
write_duration_expr = (
    pl.col("up_time").max() - pl.col("down_time").min()
).alias("write_duration")
event_count_expr = pl.col("event_id").count().alias("event_count")

# NOTE: despite the column name, "frequency" is write_duration divided by
# event_count (i.e. duration per event), not events per unit of time.
frequency_expr = (
    pl.col("write_duration") / pl.col("event_count")
).alias("frequency")

train_features = (
    train_logs
    .sort("event_id")
    .group_by("id")
    .agg(last_word_count_expr, write_duration_expr, event_count_expr)
    .with_columns(frequency_expr)
    .join(train_scores, on="id")
    .collect()
)

In [None]:
# Fit a linear regression baseline using every column of `train_features`
# except the identifier ("id") and the target ("score") as inputs.
feature_columns = [
    name for name in train_features.columns if name not in ("id", "score")
]
X_train = train_features.select(feature_columns).to_numpy()
# Selecting from the frame (rather than pulling out the Series) keeps the
# target as a 2-D column array.
y_train = train_features.select(pl.col("score")).to_numpy()

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Build the test table with the same per-`id` aggregates as the training
# features (there are no scores to join here).
test_logs = pl.scan_csv(f"{DATA_PATH}/test_logs.csv")

# Per-id aggregates. The log is sorted by event_id before grouping, so
# `last()` takes the word_count from the final row of each group in that order.
last_word_count_expr = pl.col("word_count").last()
write_duration_expr = (
    pl.col("up_time").max() - pl.col("down_time").min()
).alias("write_duration")
event_count_expr = pl.col("event_id").count().alias("event_count")

# NOTE: despite the column name, "frequency" is write_duration divided by
# event_count (i.e. duration per event), not events per unit of time.
frequency_expr = (
    pl.col("write_duration") / pl.col("event_count")
).alias("frequency")

test_features = (
    test_logs
    .sort("event_id")
    .group_by("id")
    .agg(last_word_count_expr, write_duration_expr, event_count_expr)
    .with_columns(frequency_expr)
    .collect()
)

In [None]:
# Score the test rows with the fitted model. Inputs are every column of
# `test_features` other than "id" (and "score", should it be present).
test_feature_columns = [
    name for name in test_features.columns if name not in ("id", "score")
]
X_test = test_features.select(test_feature_columns).to_numpy()
raw_predictions = model.predict(X_test)
# Collapse the model output to a flat 1-D array of predictions.
y_test = raw_predictions.flatten()

In [None]:
# Assemble the submission frame: the test ids alongside their predicted score.
# The predictions are row-aligned with `test_features`, so they can be attached
# positionally as a new column.
predicted_scores = pl.Series("score", y_test)
submission = test_features.select("id").with_columns(predicted_scores)

In [None]:
# Persist the predictions as CSV in the current working directory.
submission.write_csv(file="submission.csv")