# Model Training Jupyter Notebook

### Importing packages and loading data

In [1]:
# Imports
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import uniform, randint


from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.inspection import permutation_importance




In [2]:
### Importing data
# Reading data
df_train = pd.read_csv('data/train_logs.csv', 
                 header=0)
df_test = pd.read_csv('data/test_logs.csv', 
                 header=0)
df_train_scores = pd.read_csv('data/train_scores.csv')

df_train.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


### Feature Engineering

In [3]:
def word_constructor(df):
    """
    Group words in a DataFrame based on the 'activity' column.

    Parameters:
    df (DataFrame): The input DataFrame containing the 'activity' column.

    Returns:
    DataFrame: The modified DataFrame with additional columns 'word_start' and 'word_end' indicating the boundaries of words.
    """
    
    # Initialize columns for word beginnings and endings
    df['word_start'] = 0
    df['word_end'] = 0
    
    # Shifting the activity columns up and down one for subsequent calculations
    shifted_activity_prev = df['activity'].shift(1)
    shifted_activity_next = df['activity'].shift(-1)
    
    # Identification of word boundaries
    df['word_start'] = ((df['activity'] == 'Input') & (shifted_activity_prev != 'Input')).astype(int)
    df['word_end'] = ((df['activity'] == 'Input') & (shifted_activity_next != 'Input')).astype(int)
    
    # Handling edge cases: adressing first and last column of datafraem
    df.at[0, 'word_start'] = int(df.iloc[0]['activity'] == 'Input')
    df.at[df.index[-1], 'word_end'] = int(df.iloc[-1]['activity'] == 'Input')
    
    return df

In [4]:
def sentence_constructor(df):
    """
    
    """
    
    

In [5]:
def features(df):
    
    df = word_constructor(df)
    #df = sentence_constructor(df)
    
    # Calculate IKI for all events
    df['iki'] = df['down_time'].diff().fillna(0)

    # Initialize columns for intra-word IKI and inter-word IKI with NaN
    df['intra_word_iki'] = np.nan
    df['inter_word_iki'] = np.nan

    # Identify the start and end of words
    word_starts = df['word_start'] == 1
    word_ends = df['word_end'] == 1

    # Compute intra-word and inter-word IKI
    df.loc[word_starts, 'inter_word_iki'] = df.loc[word_starts, 'iki']
    df.loc[~word_starts & ~word_ends, 'intra_word_iki'] = df.loc[~word_starts & ~word_ends, 'iki']
    
    # IKI FEATURES
    # Create a DataFrame to store the features with a single column of IDs
    features = pd.DataFrame({'id': df['id'].unique()})
    
    # Computing median, standard deviation, and maximum IKI, intra-word IKI, and inter-word IKI

    agg_functions = ['median', 'std', 'max']
    iki_basics = df.groupby('id')['iki'].agg(agg_functions).reset_index()
    intra_word_iki_basics = df.groupby('id')['intra_word_iki'].agg(agg_functions).reset_index()
    inter_word_iki_basics = df.groupby('id')['inter_word_iki'].agg(agg_functions).reset_index()

    # Renaming the columns
    iki_basics.columns = ['id'] + [f'iki_{f}' for f in agg_functions]
    intra_word_iki_basics.columns = ['id'] + [f'intra_word_iki_{f}' for f in agg_functions]
    inter_word_iki_basics.columns = ['id'] + [f'inter_word_iki_{f}' for f in agg_functions]

    # Computing number of IKIs within length intervals
    
    # Define the length intervals
    #intervals = [0.5, 1, 1.5, 2, 2.5, 3, np.inf]
    

    # Merging IKI features
    features = features.merge(iki_basics, on='id')
    features = features.merge(intra_word_iki_basics, on='id')
    features = features.merge(inter_word_iki_basics, on='id')


    # REVISION FEATURES
    
    return features

In [6]:
features_train = features(df_train)
features_test = features(df_test)

In [7]:
# Merging training features with training scores
df_train_merged = features_train.merge(df_train_scores, on='id')

# Splitting the merged data into features and target variable
X_train = df_train_merged.drop(['id', 'score'], axis=1)  # Dropping 'id' as it's not a feature
y_train = df_train_merged['score']

X_test = features_test.drop('id', axis=1)  # Dropping 'id' as it's not a feature


### Hyperparemter Tuning

In [8]:
# Define the parameter distribution
param_dist = {
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 10),
    'min_samples_leaf': randint(1, 4),
    'max_leaf_nodes': randint(31, 51),
    'max_iter': randint(100, 300),
    'l2_regularization': uniform(0, 1)
}

# Initialize the model
model = HistGradientBoostingRegressor()

# Initialize the random search model
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=5, scoring='neg_mean_squared_error', verbose=1, random_state=42)

# Fit the random search model
random_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", random_search.best_params_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'l2_regularization': 0.2962735057040824, 'learning_rate': 0.0430533878126005, 'max_depth': 3, 'max_iter': 170, 'max_leaf_nodes': 39, 'min_samples_leaf': 1}


### K-Fold CV, Model Training and Preliminary Evaluation

In [9]:
# Doing K-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

mse_scores = []

# Loop over each fold
for train_index, val_index in kf.split(X_train):
    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]


    # Create and train the model
    model = HistGradientBoostingRegressor(**random_search.best_params_)
    model.fit(X_train_fold, y_train_fold)

    # Make predictions on the validation set and calculate MSE
    y_val_pred = model.predict(X_val_fold)
    mse = mean_squared_error(y_val_fold, y_val_pred)
    mse_scores.append(mse)

# Calculate the average MSE across all folds
average_mse = np.mean(mse_scores)
median_mse = np.median(mse_scores)
print("Average Mean Squared Error across folds (training):", average_mse)
print("Median Mean Squared Error across folds (training):", median_mse)

Average Mean Squared Error across folds (training): 0.5806110502440186
Median Mean Squared Error across folds (training): 0.5972007786787631


In [10]:
# Predict
y_test_pred = model.predict(X_test)

### Test Prediction (for Kaggle)

In [11]:
submission = pd.DataFrame({
    'id': features_test['id'],
    'score': y_test_pred
})

# Export the submission DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

In [12]:
print(y_test_pred)

[3.22343428 3.03557762 3.59668367]


### Model Evaluation

##### Feature Importance

In [13]:
import plotly.express as px
import plotly.graph_objects as go

perm_importance = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=42)

feature_names = X_train.columns
importances_df = pd.DataFrame({'features': feature_names, 'importance': perm_importance.importances_mean})

# Create the initial figure
fig = px.bar(importances_df.sort_values('importance', ascending=False),
             x='importance', 
             y='features',
             title='Permutation Feature Importances from HistGradientBoostingRegressor',
             labels={'features': 'Features', 'importance': 'Importance'},
             orientation='h')

# Customize the colors of the bars
cm = px.colors.sequential.Rainbow  # Rainbow color map
n_colors = len(importances_df)
fig.for_each_trace(
    lambda t: t.update(
        marker=dict(
            color=[cm[i % len(cm)] for i in range(n_colors)],
            line=dict(color='black', width=1),
            opacity=0.8  # Adjust opacity for less saturation
        )
    )
)

# Update layout
fig.update_layout(
    yaxis={'categoryorder':'total ascending'},
    xaxis_title='Importance',
    yaxis_title='Feature',
    showlegend=False  # Hide the legend if not necessary
)

fig.show()
