In [4]:
# Set to True when running inside a Kaggle kernel, False for a local checkout.
IS_KAGGLE = False

# Directory holding the competition CSV files for the selected environment.
INPUT_FOLDER = (
    '/kaggle/input/linking-writing-processes-to-writing-quality'
    if IS_KAGGLE
    else '../data'
)

In [7]:
!pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.1.0.tar.gz (1.7 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (pyproject.toml) ... [?25l/^C
[?25canceled
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [6]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, mean_squared_error

ModuleNotFoundError: No module named 'lightgbm'

**id** - The unique ID of the essay

**event_id** - The index of the event, ordered chronologically

**down_time** - The time of the down event in milliseconds

**up_time** - The time of the up event in milliseconds

**action_time** - The duration of the event (the difference between down_time and up_time)

**activity** - The category of activity which the event belongs to

- **Nonproduction** - The event does not alter the text in any way

- **Input** - The event adds text to the essay

- **Remove/Cut** - The event removes text from the essay

- **Paste** - The event changes the text through a paste input

- **Replace** - The event replaces a section of text with another string

- **Move From [x1, y1] To [x2, y2]** - The event moves a section of text spanning character index x1, y1 to a new location x2, y2

**down_event** - The name of the event when the key/mouse is pressed

**up_event** - The name of the event when the key/mouse is released

**text_change** - The text that changed as a result of the event (if any)

**cursor_position** - The character index of the text cursor after the event

**word_count** - The word count of the essay after the event

In [None]:
def preprocess():
    """Build one row of engineered features per essay from the keystroke logs.

    Reads ``train_logs.csv`` and ``test_logs.csv`` from ``INPUT_FOLDER``,
    stacks them, and aggregates the events of every ``id`` into counts,
    per-minute rates, pause statistics and timing features.

    Returns:
        pandas.DataFrame: one row per unique ``id`` (in order of first
        appearance, train logs first) with an ``is_test`` flag marking ids
        found in the test logs, followed by the feature columns.
    """
    df_raw_train = pd.read_csv(f'{INPUT_FOLDER}/train_logs.csv')
    df_raw_test = pd.read_csv(f'{INPUT_FOLDER}/test_logs.csv')
    # drop=True discards the per-file row labels instead of keeping them as a
    # stray "index" column that no feature uses.
    df_raw = pd.concat((df_raw_train, df_raw_test)).reset_index(drop=True)

    df = pd.DataFrame({'id': df_raw["id"].unique()})
    df['is_test'] = df['id'].isin(df_raw_test['id'].unique())

    # Largest word count reached while writing.
    df_word = df_raw.groupby("id")['word_count'].max()
    df = pd.merge(df, df_word, on="id", how="left")

    # Minute index (floor) of every key-down event.
    df_raw['current_min'] = df_raw["down_time"]//60000

    # Number of logged events.
    df_event = df_raw.groupby("id")['event_id'].count()
    df = pd.merge(df, df_event, on="id", how="left")
    df = df.rename(columns={"event_id": "event_count"})

    # Writing time in whole minutes = minute index of the last key-down event.
    # NOTE: this is 0 when every down_time is below 60000 ms, in which case the
    # per-minute ratios derived from it become inf/NaN.
    df_time = df_raw.groupby("id")['current_min'].max()
    df = pd.merge(df, df_time, on="id", how="left")
    df = df.rename(columns={"current_min": "writing_time_min"})
    df['events_per_min'] = df["event_count"] / df['writing_time_min']

    # Events that changed the text.
    filter_text_change = df_raw["text_change"]!="NoChange"
    df_filtered = df_raw[filter_text_change]
    df_text_change = df_filtered.groupby("id")['text_change'].count()
    df = pd.merge(df, df_text_change, on="id", how="left")
    df = df.rename(columns={"text_change": "text_change_count"})
    df['text_changes_per_min'] = df["text_change_count"] / df['writing_time_min']

    # Sentences are approximated by events whose released key is a period.
    sentence_filter = df_raw[df_raw["up_event"] == '.']
    df_sentence_count = sentence_filter.groupby("id")['text_change'].count()
    df = pd.merge(df, df_sentence_count, on="id", how="left")
    df = df.rename(columns={"text_change": "sentence_count"})

    # Event count per activity category; ids without that activity get 0.
    for activity in ['Input', 'Remove/Cut', 'Paste', 'Replace', 'Nonproduction']:
        df_activity_count = df_raw[df_raw['activity'] == activity].groupby("id")['activity'].count()
        column_name = f'{activity}_count'
        df_activity_count.name = column_name
        df = pd.merge(df, df_activity_count, on="id", how="left")
        df[column_name] = df[column_name].fillna(0)

    # Inter-key interval: gap between this key-down and the previous key-up of
    # the same essay. The first event of an essay has no predecessor (-> 0) and
    # overlapping key presses give negative gaps, which are clipped to 0.
    df_raw['IKI'] = df_raw['down_time'] - df_raw.groupby('id')['up_time'].shift(1)
    df_raw['IKI'] = df_raw['IKI'].fillna(0).clip(0)

    PAUSE_THRESHOLD = 2000

    # Pauses are intervals longer than PAUSE_THRESHOLD ms; select them once.
    pauses_by_id = df_raw[df_raw['IKI'] > PAUSE_THRESHOLD].groupby('id')['IKI']

    # Total number of pauses (over 2000 ms) during writing process
    pauses_count = pauses_by_id.count()
    pauses_count.name = 'pauses_count'
    df = pd.merge(df, pauses_count, on="id", how="left")

    pauses_mean_duration = pauses_by_id.mean().round(0)
    pauses_mean_duration.name = 'pause_mean_duration'
    df = pd.merge(df, pauses_mean_duration, on="id", how="left")

    pauses_time_sum = pauses_by_id.sum()
    pauses_time_sum.name = 'pauses_time_sum'
    df = pd.merge(df, pauses_time_sum, on="id", how="left")

    # Time of the last key-up event in milliseconds.
    writing_time_ms = df_raw.groupby('id')['up_time'].max()
    writing_time_ms.name = 'writing_time_ms'
    df = pd.merge(df, writing_time_ms, on="id", how="left")

    # Average number of pauses per minute
    df['pause_per_min'] = (df["pauses_count"] / df['writing_time_min']).round(2)

    # Proportion of pauses during the writing process
    df['pauses_share'] = (df["pauses_time_sum"] / df['writing_time_ms']).round(2)

    # First/last key-down time and the most common activity per essay.
    df_freq = df_raw.groupby("id").agg({
        'down_time': ['min', 'max'],
        'activity': lambda x: x.value_counts().index[0]
    }).reset_index()
    df_freq.columns = ['id', 'start_time', 'end_time', 'most_frequent_activity']
    df = pd.merge(df, df_freq, on="id", how='left')

    # Span between first and last key-down, and the mean spacing of events.
    df['total_writing_time'] = df['end_time'] - df['start_time']
    df['Avg_time_between_events'] = df['total_writing_time'] / df['event_count']

    # If overtime writing (if the writing time exceeded 30 minutes)
    df['overtime_writing'] = df['writing_time_min'] > 30

    # Average action time: summed action_time of an essay divided by its number
    # of events. Kept in milliseconds because single actions are very short.
    total_action_time = df_raw.groupby('id')['action_time'].sum()
    df = pd.merge(df, total_action_time, on='id', how='left')
    df['average_action_time'] = df['action_time'] / df['event_count']
    df = df.drop(columns=['action_time'])

    # NOTE: these flags count events (1 per Input/Paste or Remove/Cut event),
    # not the number of characters the event actually added or removed.
    df_raw['chars_added'] = df_raw['activity'].isin(['Input', 'Paste']).astype(int)
    df_raw['chars_removed'] = (df_raw['activity'] == 'Remove/Cut').astype(int)

    total_chars_added = df_raw.groupby('id')['chars_added'].sum()
    total_chars_removed = df_raw.groupby('id')['chars_removed'].sum()

    df = pd.merge(df, total_chars_added, on='id', how='left')
    df = pd.merge(df, total_chars_removed, on='id', how='left')

    df['total_characters'] = df['chars_added'] - df['chars_removed']
    df = df.drop(columns=['chars_added', 'chars_removed'])
    df['avg_characters_per_min'] = df['total_characters'] / df['writing_time_min']
    return df

In [None]:
# Build the per-essay feature table (train and test ids together) and preview it.
df = preprocess()
df.head()

# Multicollinearity

In [None]:
# Attach the essay scores by id. The left join keeps every feature row; ids
# without a score (presumably the test essays) get NaN.
df_labels = pd.read_csv(f"{INPUT_FOLDER}/train_scores.csv")
df = df.merge(df_labels, how='left', on='id')
df.head()

In [None]:
# Pearson correlation matrix over the numeric columns only
# (boolean flags and string columns are excluded by the dtype filter).
numeric_df = df.select_dtypes(include='number')

# DataFrame.corr() defaults to the Pearson coefficient.
matrix = numeric_df.corr()
matrix

Multicollinearity is indicated when some pairs of features correlate more strongly with each other than either of them does with the dependent variable (score). For example, the strongest correlation in the matrix is between total_writing_time and writing_time_min (0.99), which is greater than the correlation of either feature with score. We can therefore conclude that these two features are too strongly related to each other and should not both appear in the same model.

To simplify the task we will choose a threshold of 0.9, considering that to be a very strong correlation.

We will start with writing down pairs of features that are highly correlated.

#total_characters - word_count

events_per_min - event_count

text_change_count - event_count

Input_count - event_count

#end_time - writing_time_min

#writing_time_ms - writing_time_min

text_changes_per_min - events_per_min

total_characters - text_change_count

text_changes_per_min - text_change_count

Input_count - text_change_count

text_changes_per_min - Input_count

text_changes_per_min - avg_characters_per_min

#Input_count - word_count

Input_count - total_characters

#writing_time_ms - end_time

#total_characters - avg_characters_per_min

#pause_per_min - pauses_count


In [None]:
# Independent deep copy of the feature table for modelling, so the column
# drops and imputations below never touch `df`.
# DataFrame.copy() copies data and index by default (deep=True).
df_model = df.copy()

In [None]:
# Display the modelling copy before any columns are removed.
df_model

In [None]:
# Drop one feature from each highly correlated pair (threshold 0.9):
# Input_count, writing_time_min, end_time, avg_characters_per_min,
# pause_per_min, events_per_min, total_characters, text_change_count.
columns_to_drop = ['Input_count', 'writing_time_min', 'end_time', 'avg_characters_per_min',  'pause_per_min', 'events_per_min', 'total_characters', 'text_change_count']

# Derive the frame from `df` (df_model is an untouched copy of it at this
# point) instead of dropping in place: an in-place drop raises KeyError when
# the cell is run a second time because the columns are already gone.
df_model = df.drop(columns=columns_to_drop)
df_model

In [None]:
# 'score' is the prediction target, not a feature: remove it from the feature frame.
df_model = df_model.drop('score', axis=1)
df_model.head()

## Preprocessing to be done:

- Handle any missing values.
- Encode categorical features if present.
- Splitting the Data
- Dealing with the imbalanced data
- Normalize or standardize numerical features.

In [None]:
# Number of missing values in each feature column.
missing_values_features = df_model.isna().sum()
missing_values_features

In [None]:
# Impute the columns that come out of the left merges with gaps.
# Assigning the result back (instead of `df_model[col].fillna(..., inplace=True)`)
# is required: the in-place call works on a temporary column object, which is
# deprecated and has no effect on df_model under pandas Copy-on-Write.
# NOTE: the pause columns are NaN for essays with no inter-key interval above
# the pause threshold, and the statistics are computed over train and test rows.
df_model = df_model.fillna({
    'pauses_count': df_model['pauses_count'].mean(),
    'pause_mean_duration': df_model['pause_mean_duration'].median(),
    'pauses_time_sum': df_model['pauses_time_sum'].mean(),
    'sentence_count': df_model['sentence_count'].mean(),
    'pauses_share': df_model['pauses_share'].mean(),
})

In [None]:
# Re-count the missing values to verify the imputation.
missing_values_features_imputated = df_model.isna().sum()
missing_values_features_imputated

In [None]:
# Inspect column dtypes to spot the non-numeric features that still need encoding.
df_model.dtypes

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Encoding 'most_frequent_activity' using one-hot encoding.
# The encoder keeps its default sparse output and is densified with .toarray():
# the `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so neither spelling works on every version.
encoder = OneHotEncoder()
most_frequent_activity_encoded = encoder.fit_transform(df_model[['most_frequent_activity']]).toarray()

# Converting encoded data into a DataFrame. It reuses df_model's index so the
# column-wise concat below pairs rows by label rather than by accident.
columns = encoder.get_feature_names_out(['most_frequent_activity'])
df_most_frequent_activity_encoded = pd.DataFrame(
    most_frequent_activity_encoded, columns=columns, index=df_model.index
)

# Converting 'overtime_writing' from boolean to numeric (0 and 1)
df_model['overtime_writing'] = df_model['overtime_writing'].astype(int)

# Dropping the original 'most_frequent_activity' column and adding the encoded columns
df_features_encoded = df_model.drop(['most_frequent_activity'], axis=1)
df_features_encoded = pd.concat([df_features_encoded, df_most_frequent_activity_encoded], axis=1)

df_features_encoded

In [None]:
# Integer class label = score * 2, so a classifier and the resampler can
# treat the scores as discrete classes.
df_labels['class'] = df_labels['score'].mul(2).astype(int)
df_labels

# Preparations for model training

In [None]:
from sklearn.model_selection import train_test_split

features = df_features_encoded.drop(['id'], axis=1).fillna(0)
is_test_row = features['is_test'] == True

X_test = features[is_test_row].drop(['is_test'], axis=1)
X = features[~is_test_row].drop(['is_test'], axis=1)

# Look the labels up by essay id instead of relying on row position: the
# feature rows follow the order in which ids first appear in the logs, while
# df_labels follows the row order of train_scores.csv. A purely positional
# pairing would silently mislabel essays if those orders ever differ; a train
# id without a score now fails loudly with a KeyError.
train_ids = df_features_encoded.loc[~is_test_row, 'id']
labels_by_id = df_labels.set_index('id')
y_class = pd.Series(labels_by_id.loc[train_ids, 'class'].to_numpy(), index=X.index, name='class')
y_score = pd.Series(labels_by_id.loc[train_ids, 'score'].to_numpy(), index=X.index, name='score')  # Keep for regression

# Stratify on the class so the validation split keeps the class proportions.
X_train, X_val, y_train_class, y_val_class = train_test_split(X, y_class, random_state=0, test_size=0.20, stratify=y_class)

In [None]:
# Sanity check: (rows, columns) of the train, validation and test feature frames.
X_train.shape, X_val.shape, X_test.shape

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

# Balance the training classes: SMOTE over-sampling followed by Tomek-link
# cleaning. SMOTETomek clones a user-supplied SMOTE as-is and does not pass
# its own random_state on to it, so the seed is also set on SMOTE itself;
# otherwise the synthetic samples differ on every run.
smote_tomek = SMOTETomek(random_state=27, smote=SMOTE(k_neighbors=3, random_state=27))
X_overunder, y_overunder = smote_tomek.fit_resample(X_train, y_train_class)

In [None]:
# Class distribution of the training labels after resampling.
y_overunder.hist()

In [None]:
# Convert the resampled class labels (score * 2) back to scores for regression.
# These targets belong to the rows of X_overunder.
y_train_resampled = y_overunder / 2.0

In [None]:
# Normalising the data
# More info: https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe

from sklearn import preprocessing

# Learn the min/max of every feature from the training split only; the same
# fitted scaler is then reused to transform the other splits.
scaler = preprocessing.MinMaxScaler()
X_train_normalized = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_train_normalized.head()

In [None]:
# Reuse the scaler fitted on the training split (transform only, no refit).
X_val_normalized = scaler.transform(X_val)
X_val_normalized = pd.DataFrame(X_val_normalized, columns=X_val.columns)

# The prediction cells below read X_test_normalized, which was never created
# anywhere in the notebook; build it here with the same fitted scaler.
X_test_normalized = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Model Training

## Logistic Regression

In [None]:
# Logistic-regression classifier with default settings, fitted on the
# unscaled training split; the target is the integer class (score * 2).
l_reg = LogisticRegression()
l_reg.fit(X_train, y_train_class)

In [None]:
y_pred = l_reg.predict(X_val)
# Classes are score * 2, so both sides are halved to report the RMSE in score
# units (comparable with the regressors below). np.sqrt(MSE) replaces
# `squared=False`, which scikit-learn deprecated in 1.4 and later removed.
np.sqrt(mean_squared_error(y_val_class / 2.0, y_pred / 2.0))

In [None]:
# l_reg was fitted on the unscaled X_train, so it has to score the unscaled
# X_test; min-max scaled inputs would be on a different feature scale.
test_pred_lr = l_reg.predict(X_test)
# Predicted classes are score * 2: convert back to scores.
test_pred_lr = test_pred_lr/2.0
test_pred_lr

In [None]:
# Submission frame: ids of the test essays next to their predicted scores.
df_result_lr = pd.DataFrame({
    "id": df.loc[df['is_test'] == True, 'id'],
    "score": test_pred_lr,
})
df_result_lr

In [None]:
# NOTE: the LGBMRegressor and RandomForest sections write to this same path,
# so whichever of these cells runs last determines the final submission file.
df_result_lr.to_csv("submission.csv", index=False)

## LGBMRegressor

In [None]:
# Hyper-parameters for the LightGBM regressor (the alternative learning rate
# that was tried is kept in the trailing comment).
best_params = dict(
    reg_alpha=0.007678095440286993,
    reg_lambda=0.34230534302168353,
    colsample_bytree=0.627061253588415,
    subsample=0.854942238828458,
    learning_rate=0.04,   # 0.038697981947473245
    num_leaves=22,
    max_depth=37,
    min_child_samples=18,
    n_jobs=4,
)

# Fixed settings first, tuned values merged in last.
params = {
    "objective": "regression",
    "metric": "rmse",
    "random_state": 42,
    "n_estimators": 12001,
    "verbosity": -1,
    **best_params,
}

model = lgb.LGBMRegressor(**params)
# Stop once the first evaluation metric has not improved for 100 rounds.
early_stopping_callback = lgb.early_stopping(stopping_rounds=100, first_metric_only=True, verbose=False)

In [None]:
# y_train_resampled holds the targets of the SMOTETomek-resampled set, so the
# features must be the matching X_overunder rows (scaled with the scaler that
# was fitted on X_train) rather than X_train_normalized, which has a row per
# original training sample and does not line up with the resampled targets.
X_overunder_normalized = pd.DataFrame(scaler.transform(X_overunder), columns=X_overunder.columns)

# Early stopping has to monitor the RMSE on the same scale as the training
# target: y_val_class is score * 2, so it is halved back to scores.
model.fit(X_overunder_normalized, y_train_resampled,
          eval_set=[(X_val_normalized, y_val_class / 2.0)],
          callbacks=[early_stopping_callback],
          )

In [None]:
# Validation predictions of the LightGBM regressor. The name val_pred is
# reassigned in the RandomForest section, so execution order matters.
val_pred = model.predict(X_val_normalized)

In [None]:
# The regressor is trained on scores while y_val_class is score * 2, so the
# truth is halved before comparing. np.sqrt(MSE) replaces `squared=False`,
# which scikit-learn deprecated in 1.4 and later removed.
np.sqrt(mean_squared_error(y_val_class / 2.0, val_pred))

In [None]:
# Scale the test features with the scaler fitted on the training split
# (X_test_normalized was never created in the notebook), then predict.
# The regressor is trained on scores, so no rescaling of the output is needed.
test_pred_lgb = model.predict(pd.DataFrame(scaler.transform(X_test), columns=X_test.columns))
test_pred_lgb

In [None]:
# Submission frame: ids of the test essays next to the LightGBM predictions.
df_result_lgb = pd.DataFrame({
    "id": df.loc[df['is_test'] == True, 'id'],
    "score": test_pred_lgb,
})
df_result_lgb

In [None]:
# NOTE: overwrites the submission.csv written by the logistic-regression
# section, and is itself overwritten by the RandomForest section if that runs later.
df_result_lgb.to_csv("submission.csv", index=False)

## RandomForest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seeded so the bootstrap samples and feature sub-sampling, and therefore the
# reported RMSE and the submission, are reproducible across runs.
rf_reg = RandomForestRegressor(random_state=42)

In [None]:
# Fitted on the unscaled training split; the target is the class (score * 2),
# so predictions from this model are on the class scale.
rf_reg.fit(X_train, y_train_class)

In [None]:
# Validation predictions of the random forest (class scale, i.e. score * 2).
# Reuses the name val_pred from the LGBMRegressor section.
val_pred = rf_reg.predict(X_val)

In [None]:
# The forest is trained on classes (score * 2), so both sides are halved to
# report the RMSE in score units. np.sqrt(MSE) replaces `squared=False`,
# which scikit-learn deprecated in 1.4 and later removed.
np.sqrt(mean_squared_error(y_val_class / 2.0, val_pred / 2.0))

In [None]:
# rf_reg was fitted on the unscaled X_train, so it scores the unscaled X_test.
test_pred_rf = rf_reg.predict(X_test)
# The forest predicts classes (score * 2): convert back to scores. The
# original line replaced these predictions with the LightGBM ones.
test_pred_rf = test_pred_rf / 2.0
test_pred_rf

In [None]:
# Submission frame: ids of the test essays next to the random-forest predictions.
df_result_rf = pd.DataFrame({
    "id": df.loc[df['is_test'] == True, 'id'],
    "score": test_pred_rf,
})
df_result_rf

In [None]:
# NOTE: same path as the two earlier submission cells; when the notebook is
# run top to bottom this random-forest file is the one left on disk.
df_result_rf.to_csv("submission.csv", index=False)