In [None]:
import os
import random
import polars as pl

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

import lightgbm as lgb

In [None]:
# Pick the data directory based on where the notebook runs: when the
# KAGGLE_KERNEL_RUN_TYPE environment variable is set to a non-empty value the
# competition input folder is used, otherwise a path relative to the notebook.
kaggle_run_type = os.getenv("KAGGLE_KERNEL_RUN_TYPE")
DATA_PATH = (
    "/kaggle/input/linking-writing-processes-to-writing-quality"
    if kaggle_run_type
    else "../../data"
)

In [None]:
# Load the two training tables from DATA_PATH, in order: the event log
# (train_logs.csv) first, then the per-essay scores (train_scores.csv).
logs, scores = (
    pl.read_csv(f"{DATA_PATH}/{table}.csv")
    for table in ("train_logs", "train_scores")
)

In [None]:
def safediv(dividend, divisor, fill_expr=pl.lit(0)):
    """Build a division expression that avoids dividing by zero.

    Args:
        dividend: Expression providing the numerator.
        divisor: Expression providing the denominator.
        fill_expr: Expression used wherever the "divisor != 0" check does not
            hold. Defaults to the literal 0.

    Returns:
        A when/then/otherwise expression yielding ``dividend / divisor`` (true
        division) where the divisor is non-zero and ``fill_expr`` elsewhere.
    """
    divisor_is_nonzero = divisor.ne(0)
    quotient = dividend.truediv(divisor)
    return pl.when(divisor_is_nonzero).then(quotient).otherwise(fill_expr)

In [None]:
# Per-essay net character/punctuation counts: counts over all text that was
# typed or pasted, minus the same counts over all text that was removed.
counts = (
    logs
    # only consider text changing activities
    .filter(pl.col("activity").ne("Nonproduction"))
    # split text changes of Replace activity into replace_remove and replace_input
    # (for non-Replace rows the when/then has no otherwise, so both fields are null)
    .with_columns(
        pl.when(pl.col("activity").eq("Replace"))
        .then(pl.col("text_change").str.split(" => "))
        .list.to_struct(fields=["replace_remove", "replace_input"])
        .alias("Replace")
    )
    .unnest("Replace")
    .with_columns(
        # merge text changes of Input, Paste + Replace(Input)
        pl.when(pl.col("activity").eq("Input") | pl.col("activity").eq("Paste"))
        .then(pl.col("text_change"))
        .otherwise(pl.col("replace_input"))
        .alias("input"),
        # merge text changes of Remove/Cut + Replace(Remove)
        pl.when(pl.col("activity").eq("Remove/Cut"))
        .then(pl.col("text_change"))
        .otherwise(pl.col("replace_remove"))
        .alias("remove")
    )
    # concat all text changes for each essay
    .group_by("id")
    .agg(
        pl.col("input").filter(pl.col("input").is_not_null()).str.concat(""),
        pl.col("remove").filter(pl.col("remove").is_not_null()).str.concat("")
    )
    # long format: one row per (id, "input" | "remove") with the text in "value"
    .melt(id_vars="id")
    # count characters and punctuation marks
    .with_columns(
        # len_chars counts characters; len_bytes would count every non-ASCII
        # character several times and inflate total_chars
        pl.col("value").str.len_chars().alias("total_chars"),
        # NOTE(review): presumably letters/digits are anonymised to "q" in
        # text_change, which is why "q" stands for word characters - confirm
        pl.col("value").str.count_matches("q").alias("word_chars"),
        # raw strings keep the regex escapes without invalid-escape warnings
        pl.col("value").str.count_matches(r"\.").alias("full_stops"),
        pl.col("value").str.count_matches(",").alias("commas"),
        pl.col("value").str.count_matches("\n").alias("line_breaks"),
        pl.col("value").str.count_matches("-").alias("hyphens"),
        pl.col("value").str.count_matches(r"\?").alias("question_marks"),
        pl.col("value").str.count_matches(";").alias("semicolons"),
        pl.col("value").str.count_matches(":").alias("colons"),
        pl.col("value").str.count_matches("!").alias("exclamation_marks"),
    )
    # subtract counts of removed text from counts of input text
    .with_columns(
        pl.when(pl.col("variable").eq("remove"))
        .then(pl.exclude("id", "variable", "value").mul(-1))
        .otherwise(pl.exclude("id", "variable", "value"))
    )
    .group_by("id")
    .agg(pl.exclude("variable", "value").sum())
)

In [None]:
# One row per essay: log-level aggregates, the target score, the net text
# counts from `counts`, and ratio features derived from them.
essay_stats = (
    logs
    # sort first so that .last() below picks the value of the final event
    .sort("event_id")
    .group_by("id")
    .agg(
        pl.col("word_count").last(),
        (pl.col("up_time").max() - pl.col("down_time").min()).alias("write_duration"),
        pl.col("event_id").count().alias("event_count"),
    )
    .join(scores, on="id")
    .join(counts, on="id")
    .with_columns(
        # sentence terminators counted here: . ? ! and :
        (pl.col("full_stops")+pl.col("question_marks")+pl.col("exclamation_marks")+pl.col("colons")).alias("sentence_count")
    )
    .with_columns(
        safediv(pl.col("word_chars"), pl.col("word_count")).alias("avg_word_length"),
        safediv(pl.col("word_chars"), pl.col("sentence_count")).alias("avg_chars_per_sentence"),
        safediv(pl.col("word_count"), pl.col("sentence_count")).alias("avg_words_per_sentence"),
        # goes through safediv like the other ratios so that a net total_chars
        # of 0 gives 0 instead of NaN/inf in the feature matrix
        safediv(pl.col("total_chars") - pl.col("word_chars"), pl.col("total_chars")).alias("non_word_char_pct"),
        safediv(pl.col("word_count"), pl.col("line_breaks")).alias("avg_words_per_paragraph"),
        safediv(pl.col("total_chars"), pl.col("line_breaks")).alias("avg_chars_per_paragraph"),
        safediv(pl.col("commas"), pl.col("sentence_count")).alias("avg_commas_per_sentence"),
    )
)

In [None]:
# Eyeball five randomly drawn rows of the feature table (unseeded, so the
# rows shown differ between runs).
essay_stats.sample(n=5)

In [None]:
# Keyword arguments for lgb.LGBMRegressor; only referenced by the
# commented-out "lgb" pipeline step in cv().
lightgbm_params = dict(
    objective="regression",
    metric="rmse",
    n_estimators=100,
    boosting_type="gbdt",
)

In [None]:
def cv(features, random_state=None):
    """Cross-validate a random forest on the given columns of ``essay_stats``.

    Args:
        features: Column names of ``essay_stats`` to use as model inputs.
        random_state: Seed forwarded to ``RandomForestRegressor``. The default
            ``None`` keeps the forest unseeded, so repeated calls with the
            same features can return slightly different scores.

    Returns:
        Mean of the 4-fold ``neg_root_mean_squared_error`` scores, i.e. the
        negated RMSE - values closer to 0 are better.
    """
    # Define the pipeline (alternative steps that were tried are kept commented out)
    pipeline = Pipeline([
        #("scaler", StandardScaler()),  # Normalize the features
        #("knn", KNeighborsRegressor())  # Use KNN model
        #("gb", HistGradientBoostingRegressor())  # Use Gradient Boosting
        ("rf", RandomForestRegressor(random_state=random_state))
        #("lm", LinearRegression())
        #("lgb", lgb.LGBMRegressor(**lightgbm_params, verbose=-1))
    ])

    # Split into features and target
    X = essay_stats.select(features).to_numpy()
    # Flatten the (n, 1) column to shape (n,): a column-vector target is what
    # makes scikit-learn emit DataConversionWarning.
    y = essay_stats.select("score").to_numpy().ravel()

    # Cross validate the pipeline using 4 folds
    cv_scores = cross_val_score(pipeline, X, y, cv=4, scoring="neg_root_mean_squared_error")

    # Return the mean score across the folds
    return cv_scores.mean()

In [None]:
def get_next_best_feature(current_features, current_score):
    """Find the single extra feature that improves the CV score the most.

    Every column of ``essay_stats`` other than "id", "score" and those already
    in ``current_features`` is tried in turn. Each candidate is scored as the
    average of two ``cv`` runs on ``current_features`` plus that candidate.

    Args:
        current_features: List of feature names already selected.
        current_score: Score a candidate has to beat strictly to be chosen.

    Returns:
        Tuple ``(feature, score)`` for the best candidate, or
        ``(None, current_score)`` when no candidate beats ``current_score``.
    """
    candidates = [
        name
        for name in essay_stats.columns
        if name not in current_features and name != "id" and name != "score"
    ]
    best_feature, best_score = None, current_score
    for name in candidates:
        trial_features = current_features + [name]
        # average two evaluations of the same feature set
        mean_score = (cv(trial_features) + cv(trial_features)) / 2
        if mean_score > best_score:
            best_feature, best_score = name, mean_score
    return best_feature, best_score

In [None]:
def try_features(current_features=None, current_score=-1.0247):
    """Greedy forward feature selection driven by ``get_next_best_feature``.

    Repeatedly adds the feature that improves the score the most and stops as
    soon as no remaining feature beats the current score. One line is printed
    per added feature.

    Args:
        current_features: Feature names to start from. ``None`` (the default)
            starts from an empty selection. The argument is copied, so the
            caller's list is never modified.
        current_score: Score the first added feature has to beat.
            NOTE(review): the default -1.0247 is presumably a baseline score
            from an earlier run - confirm.

    Returns:
        A new list holding the starting features followed by every feature
        added, in the order they were added.
    """
    # Work on a copy: a mutable default (or the caller's list) appended to in
    # place would leak selected features from one call into the next.
    selected = [] if current_features is None else list(current_features)
    while True:
        next_best_feature, next_best_score = get_next_best_feature(selected, current_score)
        if next_best_feature is None:
            break
        selected.append(next_best_feature)
        # relative change of the score with respect to the previous score
        improvement = (current_score - next_best_score)/current_score
        print(f"Added '{next_best_feature}' with score {next_best_score:.4f} improving by {improvement:.2%}")
        current_score = next_best_score
    return selected

In [None]:
# Run the greedy selection with try_features' defaults: an empty starting
# feature list and a starting score of -1.0247.
try_features()

In [None]:
# Continue the greedy selection from a fixed starting feature set.
base_features = ["commas", "word_chars", "avg_chars_per_sentence", "non_word_char_pct", "event_count", "avg_word_length", "line_breaks"]
# NOTE(review): presumably the CV score reached with base_features in an
# earlier run - confirm.
base_score = -0.6400
# Hand over a copy so base_features itself is left untouched and re-running
# this cell always starts from the same seven features.
try_features(list(base_features), base_score)