# Feature Engineering Jupyter Notebook Testing

In [2]:
# Importing Packages
import numpy as np
import pandas as pd

In [3]:
# Reading data
df_train = pd.read_csv('data/train_logs.csv', 
                 header=0)
df_test = pd.read_csv('data/test_logs.csv', 
                 header=0)

In [4]:
def word_grouping(df):
    
    # Initialize columns for word beginnings and endings
    df['word_begin'] = 0
    df['word_end'] = 0

    # Vectorized identification of word boundaries
    df['word_begin'] = ((df['activity'] == 'Input') & (df['activity'].shift(1) != 'Input')).astype(int)
    df['word_end'] = ((df['activity'] == 'Input') & (df['activity'].shift(-1) != 'Input')).astype(int)

    # Handle edge case for the first input
    if df.iloc[0]['activity'] == 'Input':
        df.at[0, 'word_begin'] = 1

    # Handle edge case for the last input
    if df.iloc[-1]['activity'] == 'Input':
        df.at[df.index[-1], 'word_end'] = 1
        
    return df

In [5]:
def compute_iw_iki(df):
    # Calculate the IKI for all events
    df['iki'] = df['down_time'].diff().fillna(0)

    # Initialize columns for intra-word IKI and inter-word IKI with NaN
    df['intra_word_iki'] = np.nan
    df['inter_word_iki'] = np.nan
    
    # Track the state of being within a word
    within_word = False

    for i in range(1, len(df)):
        if df.at[i, 'word_begin'] == 1:
            within_word = True
            continue  # We do not consider the IKI at the start of a word as intra-word IKI
        
        if df.at[i-1, 'word_end'] == 1:
            within_word = False
            # The IKI at the end of a word is considered as inter-word IKI
            df.at[i, 'inter_word_iki'] = df.at[i, 'iki']
            continue

        if within_word:
            # We are within a word
            df.at[i, 'intra_word_iki'] = df.at[i, 'iki']
        else:
            # We are between words
            df.at[i, 'inter_word_iki'] = df.at[i, 'iki']

    return df


In [6]:
def iki_features(df):
    
    # Grouping the data into words
    df = word_grouping(df)
    
    # BASIC FEATURES
    # Create a DataFrame to store the features with a single column of IDs
    features = pd.DataFrame({'id': df['id'].unique()})

    # Computing the interkeystroke interval (IKI)
    iki = df.groupby('id')['down_time'].diff().fillna(0)

    # Computing mean, median, standard, and maximum IKI
    mean_iki = iki.groupby(df['id']).mean().reset_index()
    mean_iki.columns = ['id', 'mean_iki']

    median_iki = iki.groupby(df['id']).median().reset_index()
    median_iki.columns = ['id', 'median_iki']

    std_iki = iki.groupby(df['id']).std().reset_index()
    std_iki.columns = ['id', 'std_iki']

    max_iki = iki.groupby(df['id']).max().reset_index()
    max_iki.columns = ['id', 'max_iki']

    # Merge with features DataFrame
    features = features.merge(mean_iki, on='id', how='left')
    features = features.merge(median_iki, on='id', how='left')
    features = features.merge(std_iki, on='id', how='left')
    features = features.merge(max_iki, on='id', how='left')

    # ADVANCED FEATURES
    df = compute_iw_iki(df)
    
    # Computing mean, median, standard, and maximum intra-word and inter-word IKI
    mean_intra_word_iki = df.groupby('id')['intra_word_iki'].mean().reset_index()
    mean_intra_word_iki.columns = ['id', 'mean_intra_word_iki']

    mean_inter_word_iki = df.groupby('id')['inter_word_iki'].mean().reset_index()
    mean_inter_word_iki.columns = ['id', 'mean_inter_word_iki']
    
    median_intra_word_iki = df.groupby('id')['intra_word_iki'].median().reset_index()
    median_intra_word_iki.columns = ['id', 'median_intra_word_iki']

    median_inter_word_iki = df.groupby('id')['inter_word_iki'].median().reset_index()
    median_inter_word_iki.columns = ['id', 'median_inter_word_iki']
    
    std_intra_word_iki = df.groupby('id')['intra_word_iki'].std().reset_index()
    std_intra_word_iki.columns = ['id', 'std_intra_word_iki']

    std_inter_word_iki = df.groupby('id')['inter_word_iki'].std().reset_index()
    std_inter_word_iki.columns = ['id', 'std_inter_word_iki']
    
    max_intra_word_iki = df.groupby('id')['intra_word_iki'].max().reset_index()
    max_intra_word_iki.columns = ['id', 'max_intra_word_iki']

    max_inter_word_iki = df.groupby('id')['inter_word_iki'].max().reset_index()
    max_inter_word_iki.columns = ['id', 'max_inter_word_iki']
    
    # Merge the advanced features with the features DataFrame
    features = features.merge(mean_intra_word_iki, on='id', how='left')
    features = features.merge(mean_inter_word_iki, on='id', how='left')
    features = features.merge(median_intra_word_iki, on='id', how='left')
    features = features.merge(median_inter_word_iki, on='id', how='left')
    features = features.merge(std_intra_word_iki, on='id', how='left')
    features = features.merge(std_inter_word_iki, on='id', how='left')
    features = features.merge(max_intra_word_iki, on='id', how='left')
    features = features.merge(max_inter_word_iki, on='id', how='left')
    
    return features


In [7]:
features = iki_features(df_train)

In [8]:
features.head()

Unnamed: 0,id,mean_iki,median_iki,std_iki,max_iki,mean_intra_word_iki,mean_inter_word_iki,median_intra_word_iki,median_inter_word_iki,std_intra_word_iki,std_inter_word_iki,max_intra_word_iki,max_inter_word_iki
0,001519c8,702.913962,151.0,4295.447374,154173.0,342.365323,1934.520147,141.0,178.0,1204.527987,8897.245491,28662.0,154173.0
1,0022f953,716.47066,160.0,4894.385161,145968.0,265.497247,-1092.996124,160.0,192.0,710.316453,78771.950378,20306.0,145968.0
2,0042269b,427.170696,94.0,3939.226278,153955.0,173.878491,-1109.582931,89.0,157.0,1575.33512,72284.862646,52656.0,153955.0
3,0059420b,875.963368,256.0,4247.568454,101808.0,584.713235,-4996.90873,240.0,377.0,1346.288021,109488.969669,19733.0,101808.0
4,0075873a,625.807981,166.0,3896.405072,110824.0,351.062399,-773.704584,171.0,37.0,1750.918393,55188.898543,60072.0,110824.0
