In [1]:
from google.colab import drive
# drive.mount('/content/drive')

# Init

In [2]:
import re

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
# Single place to change when the raw competition files move.
RAW_DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Data/Kaggle/LinkWrite/raw"

train = pd.read_csv(f"{RAW_DATA_DIR}/train_logs.csv.gz")          # keystroke event log
train_score = pd.read_csv(f"{RAW_DATA_DIR}/train_scores.csv.gz")  # joined to the features on `id` later
test_logs = pd.read_csv(f"{RAW_DATA_DIR}/test_logs.csv.gz")

In [4]:
# Row count, column dtypes and memory footprint of the raw training log.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8405898 entries, 0 to 8405897
Data columns (total 11 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   id               object
 1   event_id         int64 
 2   down_time        int64 
 3   up_time          int64 
 4   action_time      int64 
 5   activity         object
 6   down_event       object
 7   up_event         object
 8   text_change      object
 9   cursor_position  int64 
 10  word_count       int64 
dtypes: int64(6), object(5)
memory usage: 705.5+ MB


In [5]:
# Number of missing values per column.
train.isnull().sum()

id                 0
event_id           0
down_time          0
up_time            0
action_time        0
activity           0
down_event         0
up_event           0
text_change        0
cursor_position    0
word_count         0
dtype: int64

In [6]:
# Preview of the first events in the log.
train.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


## Simple Features

In [7]:
# Values of the previous event within the same `id` (NaN on each id's first row).
# NOTE: relies on rows already being in event order within each id.
previous_event = train.groupby("id")[["word_count", "up_time"]].shift(1)

# Collapse every "Move From ..." activity into one generic label.
train["activity_general"] = train["activity"].str.replace(r"Move From .*", "Move From", regex=True)

# Change in word count caused by each event.
train["word_count-1"] = previous_event["word_count"]
train["word_count_delta"] = train["word_count"] - previous_event["word_count"]

# Gap between the previous key release and this key press.
train["up_time-1"] = previous_event["up_time"]
train["in_between_event_pause_ms"] = train["down_time"] - previous_event["up_time"]

# Feature Engineering

## Compile

In [8]:
class FeatureEngineering:
    """Aggregate a keystroke event log into one feature row per ``id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with at least the columns ``id``, ``event_id``,
        ``down_time``, ``up_time``, ``action_time``, ``activity``,
        ``cursor_position`` and ``word_count``. The helper columns
        ``activity_general``, ``word_count_delta`` and
        ``in_between_event_pause_ms`` are used when present and derived on a
        copy otherwise, so the class also works on a freshly loaded log.
        Time columns are treated as milliseconds (they are divided by 1000
        and by 60 to obtain minutes).

    Attributes
    ----------
    data : pandas.DataFrame
        The frame passed in; it is never modified by this class.
    derived : pandas.DataFrame or None
        Per-``id`` feature table, ``None`` until :meth:`feature_l1` has run.
    """

    def __init__(self, data):
        self.data = data
        self.derived = None

    @staticmethod
    def _with_helper_columns(data):
        """Return ``data`` with the three helper columns, adding missing ones on a copy."""
        missing = {}
        if "activity_general" not in data.columns:
            missing["activity_general"] = data["activity"].str.replace(r"Move From .*", "Move From", regex=True)
        if "word_count_delta" not in data.columns:
            # NOTE: the per-id shift assumes rows are in event order within each id.
            missing["word_count_delta"] = data["word_count"] - data.groupby("id")["word_count"].shift(1)
        if "in_between_event_pause_ms" not in data.columns:
            missing["in_between_event_pause_ms"] = data["down_time"] - data.groupby("id")["up_time"].shift(1)
        # Only copy the (potentially very large) frame when something is actually missing.
        return data.assign(**missing) if missing else data

    def feature_l1(self):
        """Compute the first-level features and store them in ``self.derived``.

        The result is indexed by ``id`` and every column name starts with
        ``_``: overall timing and pause statistics, final word count, event /
        word / character rates, per-activity counts, durations and
        proportions, the number of word-erasing events, and the min / max
        distance between the two bracketed coordinate pairs of "Move From"
        activities.

        Count-like features are 0 (not NaN) for ids that have no matching
        events, and logs without any "Move From" event are handled.
        ``self.derived`` is only assigned once every feature was computed.
        """
        data = self._with_helper_columns(self.data)
        # Captures the activity name, i.e. everything after the third underscore.
        colname_pat = re.compile(r"_.*_.*_(.*)")

        # total time to complete essay
        per_id = data.groupby("id").agg({
            "down_time": "min",
            "up_time": "max",
            "action_time": "sum",
            "cursor_position": "std",
            "event_id": "count",
            "in_between_event_pause_ms": "mean"
        }).rename(columns={
            "cursor_position": "_cursor_position_std",
            "event_id": "_total_event_count"
        })
        per_id["_total_time_m"] = (per_id["up_time"] - per_id["down_time"]) / 1000 / 60
        per_id["_total_action_time_m"] = per_id["action_time"] / 1000 / 60
        per_id["_proportion_action_time"] = per_id["_total_action_time_m"] / per_id["_total_time_m"]
        per_id["_proportion_pause_time"] = 1 - per_id["_proportion_action_time"]
        per_id["_total_pause_time_m"] = per_id["_total_time_m"] - per_id["_total_action_time_m"]
        per_id["_mean_pause_time_m"] = per_id["in_between_event_pause_ms"] / 1000 / 60
        # Keep only the finished features; the raw aggregates have no "_" prefix.
        derived = per_id[per_id.columns[per_id.columns.str.startswith("_")]]

        # number of words in essay (word_count of the event with the highest event_id)
        last_event = data.loc[data.groupby("id")["event_id"].idxmax(), ["id", "word_count"]]
        derived = derived.join(last_event.set_index("id").rename(columns={"word_count": "_word_count"}))
        # event per minute
        derived["_event_per_m"] = derived["_total_event_count"] / derived["_total_time_m"]
        # word per mins
        derived["_word_per_m"] = derived["_word_count"] / derived["_total_time_m"]

        is_input = data["activity"] == "Input"
        # time spent in removing or nonproduction; 0 for ids with only "Input" events
        unproductive = data.loc[~is_input].groupby("id")["action_time"].sum() / 1000 / 60
        derived["_unproductive_m"] = unproductive.reindex(derived.index, fill_value=0)
        # productive time
        derived["_productive_m"] = derived["_total_time_m"] - derived["_unproductive_m"]
        # input per minute; ids without any "Input" event count as 0 inputs
        input_events = data.loc[is_input].groupby("id")["event_id"].count().reindex(derived.index, fill_value=0)
        derived["_char_per_m"] = input_events / derived["_productive_m"]

        # count of activity
        activity_counts = pd.crosstab(data["id"], data["activity_general"], data["event_id"], aggfunc="count")
        derived = derived.join(
            activity_counts.fillna(0).rename(columns=lambda x: "_activity_count_" + x.lower()).astype(int)
        )
        # proportion of activity
        for c in derived.columns[derived.columns.str.contains("activity_count")]:
            derived["_activity_proportion_count_" + colname_pat.findall(c)[0]] = derived[c] / derived["_total_event_count"]

        # time of each activity
        activity_time = (
            pd.crosstab(data["id"], data["activity_general"], data["action_time"], aggfunc=["sum", "mean"]) / 1000 / 60
        ).fillna(0)
        # Flatten the (aggfunc, activity) column pairs into single names.
        activity_time.columns = ["_activity_" + c[0] + "_" + c[1].lower() for c in activity_time.columns.values]
        for c in activity_time.columns[activity_time.columns.str.contains("activity_sum")]:
            activity_time["_activity_proportion_sum_" + colname_pat.findall(c)[0]] = activity_time[c] / derived["_total_action_time_m"]
        derived = derived.join(activity_time)

        # number of erased words; 0 for ids whose word count never decreased
        erased = data.loc[data["word_count_delta"] < 0].groupby("id")["event_id"].count()
        derived["_count_erased_words"] = erased.reindex(derived.index, fill_value=0)

        # euclidean distance of moved segments
        moves = data.loc[data["activity"].str.contains("Move From", na=False), ["id", "activity"]]
        # One row per move event: the first two "[a, b]" pairs of the activity text
        # as four columns. Extracting per row avoids aligning Series that carry
        # duplicated id labels.
        coords = moves["activity"].str.extract(r"\[([0-9]+),\s([0-9]+)\].*?\[([0-9]+),\s([0-9]+)\]")
        valid = coords.notna().all(axis=1).to_numpy()
        derived["_min_euclid_distance"] = 0.0
        derived["_max_euclid_distance"] = 0.0
        if valid.any():
            coords = coords[valid]
            x0, y0, x1, y1 = (pd.to_numeric(coords[i]) for i in range(4))
            distance = np.sqrt((x0 - x1) ** 2 + (y0 - y1) ** 2)
            # Group by the raw id values so the grouping is purely positional.
            by_id = distance.groupby(moves["id"][valid].to_numpy())
            derived["_min_euclid_distance"] = by_id.min().reindex(derived.index, fill_value=0)
            derived["_max_euclid_distance"] = by_id.max().reindex(derived.index, fill_value=0)

        self.derived = derived

    def to_modelling_data(self, dec: int = 4):
        """Return the ``_``-prefixed feature columns rounded to ``dec`` decimals.

        Runs :meth:`feature_l1` first if the features have not been computed
        yet, so the call order no longer matters.
        """
        if self.derived is None:
            self.feature_l1()
        return self.derived[self.derived.columns[self.derived.columns.str.startswith("_")]].round(dec)

In [9]:
# Build the per-id feature table from the training log and show its shape
# (rows, feature columns).
fe = FeatureEngineering(train)
fe.feature_l1()
df_model = fe.to_modelling_data()
df_model.shape

(2471, 47)

In [12]:
# Rebuild from `fe` instead of joining onto the previous `df_model`: the cell is
# then safe to re-run, whereas a second self-join would fail because `score`
# is already present.
df_model = fe.to_modelling_data().join(train_score.set_index("id"), how="left")
df_model.shape

(2471, 48)

# Analysis

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, OrthogonalMatchingPursuitCV
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [25]:
# Correlation of every column with `score`, sorted ascending.
df_model.corr()["score"].sort_values()

_mean_pause_time_m                         -0.585539
_proportion_pause_time                     -0.521020
_total_pause_time_m                        -0.257949
_activity_mean_remove/cut                  -0.157697
_activity_proportion_sum_paste             -0.052672
_activity_proportion_sum_nonproduction     -0.047315
_activity_mean_input                       -0.042464
_activity_proportion_count_nonproduction   -0.034668
_activity_proportion_count_move from       -0.029077
_activity_proportion_count_paste           -0.021478
_activity_count_move from                  -0.019384
_max_euclid_distance                       -0.016995
_activity_sum_move from                    -0.016052
_activity_mean_move from                   -0.013703
_activity_proportion_sum_move from         -0.012916
_activity_proportion_sum_remove/cut        -0.012708
_min_euclid_distance                       -0.000762
_activity_sum_paste                         0.011823
_activity_proportion_count_input            0.

## Feature Selection

In [70]:
# Fixed seed so the 80/20 split is reproducible when this cell is re-run.
rng = np.random.RandomState(1234567890)
# Every column except the last one (`score`) is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df_model.iloc[:, :-1],
    df_model["score"],
    test_size=0.2,
    random_state=rng,
    shuffle=True,
)

In [72]:
# Baseline that always predicts the mean training score.
dummy_pipe = make_pipeline(StandardScaler(), DummyRegressor(strategy="mean")).fit(X_train, y_train)
# (train RMSE, test RMSE). The square root is taken explicitly because the
# `squared=` argument of mean_squared_error is deprecated in newer scikit-learn.
(
    np.sqrt(mean_squared_error(y_train, dummy_pipe.predict(X_train))),
    np.sqrt(mean_squared_error(y_test, dummy_pipe.predict(X_test))),
)

(1.0228942188611492, 1.032051742017164)

In [60]:
# Train RMSE of the baseline. The square root is explicit because the `squared=`
# argument of mean_squared_error is deprecated in newer scikit-learn.
# NOTE(review): the saved output of this cell (execution count 60) predates the
# `dummy_pipe` fit in cell 72 and does not match it; re-run to refresh.
np.sqrt(mean_squared_error(y_train, dummy_pipe.predict(X_train)))

0.16773716685838302

# Modelling

In [39]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 404.2/404.2 kB 5.9 MB/s eta 0:00:00
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 226.0/226.0 kB 22.8 MB/s eta 0:00:00
Collecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.7/78.7 kB 9.8 MB/s eta 0:00:00
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [40]:
import optuna