# Model Training Jupyter Notebook

In [30]:
# Importing Packages
import numpy as np
import pandas as pd
#import polars as pl

from sklearn.ensemble import HistGradientBoostingClassifier

In [31]:
def word_constructor(df):
    """
    Group words in a DataFrame based on the 'activity' column.

    Parameters:
    df (DataFrame): The input DataFrame containing the 'activity' column.

    Returns:
    DataFrame: The modified DataFrame with additional columns 'word_start' and 'word_end' indicating the boundaries of words.
    """
    
    # Initialize columns for word beginnings and endings
    df['word_start'] = 0
    df['word_end'] = 0
    
    # Shifting the activity columns up and down one for subsequent calculations
    shifted_activity_prev = df['activity'].shift(1)
    shifted_activity_next = df['activity'].shift(-1)
    
    # Identification of word boundaries
    df['word_start'] = ((df['activity'] == 'Input') & (shifted_activity_prev != 'Input')).astype(int)
    df['word_end'] = ((df['activity'] == 'Input') & (shifted_activity_next != 'Input')).astype(int)
    
    # Handling edge cases: adressing first and last column of datafraem
    df.at[0, 'word_start'] = int(df.iloc[0]['activity'] == 'Input')
    df.at[df.index[-1], 'word_end'] = int(df.iloc[-1]['activity'] == 'Input')
    
    return df

In [None]:
def sentence_constructor(df):
    """
    
    """
    
    

In [37]:
def features(df):
    
    df = word_constructor(df)
    df = sentence_constructor(df)
    
    # Calculate IKI for all events
    df['iki'] = df['down_time'].diff().fillna(0)

    # Initialize columns for intra-word IKI and inter-word IKI with NaN
    df['intra_word_iki'] = np.nan
    df['inter_word_iki'] = np.nan

    # Identify the start and end of words
    word_starts = df['word_start'] == 1
    word_ends = df['word_end'] == 1

    # Compute intra-word and inter-word IKI
    df.loc[word_starts, 'inter_word_iki'] = df.loc[word_starts, 'iki']
    df.loc[~word_starts & ~word_ends, 'intra_word_iki'] = df.loc[~word_starts & ~word_ends, 'iki']
    
    # IKI FEATURES
    # Create a DataFrame to store the features with a single column of IDs
    features = pd.DataFrame({'id': df['id'].unique()})
    
    # Computing median, standard deviation, and maximum IKI, intra-word IKI, and inter-word IKI

    agg_functions = ['median', 'std', 'max']
    iki_basics = df.groupby('id')['iki'].agg(agg_functions).reset_index()
    intra_word_iki_basics = df.groupby('id')['intra_word_iki'].agg(agg_functions).reset_index()
    inter_word_iki_basics = df.groupby('id')['inter_word_iki'].agg(agg_functions).reset_index()

    # Renaming the columns
    iki_basics.columns = ['id'] + [f'iki_{f}' for f in agg_functions]
    intra_word_iki_basics.columns = ['id'] + [f'intra_word_iki_{f}' for f in agg_functions]
    inter_word_iki_basics.columns = ['id'] + [f'inter_word_iki_{f}' for f in agg_functions]

    # Computing number of IKIs within length intervals
    
    # Define the length intervals
    intervals = [0.5, 1, 1.5, 2, 2.5, 3, np.inf]
    









    # Merging IKI features
    features = features.merge(iki_basics, on='id')
    features = features.merge(intra_word_iki_basics, on='id')
    features = features.merge(inter_word_iki_basics, on='id')


    # REVISION FEATURES
    
    return features

In [39]:
# Reading data
df_train = pd.read_csv('data/train_logs.csv', 
                 header=0)
df_test = pd.read_csv('data/test_logs.csv', 
                 header=0)
df_train_scores = pd.read_csv('data/train_scores.csv')

In [40]:
features_train = features(df_train)
features_test = features(df_test)

In [41]:
# Merging training features with training scores
df_train_merged = features_train.merge(df_train_scores, on='id')

# Splitting the merged data into features and target variable
X_train = df_train_merged.drop(['id', 'score'], axis=1)  # Dropping 'id' as it's not a feature
y_train = df_train_merged['score']

# Map scores to integers
score_mapping = {0.5: 0, 1: 1, 1.5: 2, 2: 3, 2.5: 4, 3: 5, 3.5: 6, 4: 7, 4.5: 8, 5: 9, 5.5: 10, 6: 11}
reverse_mapping = {v: k for k, v in score_mapping.items()}
y_train_mapped = y_train.map(score_mapping)


X_test = features_test.drop('id', axis=1)  # Dropping 'id' as it's not a feature


In [42]:
df_train_merged.head()
print(type(y_train[4]))

<class 'numpy.float64'>


In [43]:
model = HistGradientBoostingClassifier(max_iter=100, max_leaf_nodes=31, early_stopping='auto', random_state=42, verbose=1, scoring='loss')
model.fit(X_train, y_train_mapped)


Binning 0.000 GB of training data: 0.015 s
Fitting gradient boosted rounds:
[1/100] 

12 trees, 336 leaves (28 on avg), max depth = 12, in 0.089s
[2/100] 12 trees, 372 leaves (31 on avg), max depth = 15, in 0.055s
[3/100] 12 trees, 372 leaves (31 on avg), max depth = 16, in 0.133s
[4/100] 12 trees, 372 leaves (31 on avg), max depth = 14, in 0.053s
[5/100] 12 trees, 372 leaves (31 on avg), max depth = 15, in 0.047s
[6/100] 12 trees, 372 leaves (31 on avg), max depth = 16, in 0.044s
[7/100] 12 trees, 372 leaves (31 on avg), max depth = 13, in 0.048s
[8/100] 12 trees, 372 leaves (31 on avg), max depth = 19, in 0.052s
[9/100] 12 trees, 372 leaves (31 on avg), max depth = 19, in 0.052s
[10/100] 12 trees, 372 leaves (31 on avg), max depth = 20, in 0.079s
[11/100] 12 trees, 372 leaves (31 on avg), max depth = 17, in 0.050s
[12/100] 12 trees, 372 leaves (31 on avg), max depth = 20, in 0.049s
[13/100] 12 trees, 372 leaves (31 on avg), max depth = 22, in 0.053s
[14/100] 12 trees, 372 leaves (31 on avg), max depth = 20, in 0.048s
[15/100] 12 trees, 372 leaves (31 on avg), max dept

In [None]:
# Predict and reverse map
y_pred_mapped = model.predict(X_test)
y_pred = pd.Series(y_pred_mapped).map(reverse_mapping)


In [20]:
print(y_pred)

0    3.5
1    3.0
2    4.0
dtype: float64
