# Feature Engineering Jupyter Notebook

In [39]:
# Importing Packages
import numpy as np
import pandas as pd

In [40]:
# Reading data
df_train = pd.read_csv('data/train_logs.csv', 
                 header=0)
df_test = pd.read_csv('data/test_logs.csv', 
                 header=0)

In [41]:
def word_grouping(df):
    
    # Initialize columns for word beginnings and endings
    df['word_begin'] = 0
    df['word_end'] = 0

    # The first input starts a word
    if df.iloc[0]['activity'] == 'Input':
        df.at[0, 'word_begin'] = 1

    # Vectorized identification of word boundaries
    df['word_begin'] = ((df['activity'] == 'Input') & (df['activity'].shift(1) == 'Space')).astype(int)
    df['word_end'] = ((df['activity'] == 'Space') & (df['activity'].shift(-1) == 'Input')).astype(int)

    # Handle edge case for the last word
    if df.iloc[-1]['activity'] == 'Input':
        df.at[df.index[-1], 'word_end'] = 1
        
    return df


In [34]:
def compute_iw_iki(df):
    # Calculate the IKI for all events
    df['iki'] = df['down_time'].diff().fillna(0)

    # Initialize columns for intra-word IKI and inter-word IKI with NaN
    df['intra_word_iki'] = np.nan
    df['inter_word_iki'] = np.nan
    
    # Track the state of being within a word
    within_word = False

    for i in range(1, len(df)):
        if df.at[i, 'word_begin'] == 1:
            within_word = True
            continue  # We do not consider the IKI at the start of a word as intra-word IKI
        
        if df.at[i-1, 'word_end'] == 1:
            within_word = False
            # The IKI at the end of a word is considered as inter-word IKI
            df.at[i, 'inter_word_iki'] = df.at[i, 'iki']
            continue

        if within_word:
            # We are within a word
            df.at[i, 'intra_word_iki'] = df.at[i, 'iki']
        else:
            # We are between words
            df.at[i, 'inter_word_iki'] = df.at[i, 'iki']

    return df


In [42]:
def iki_features(df):
    
    # Grouping the data into words
    df = word_grouping(df)
    
    # BASIC FEATURES
    # Create a DataFrame to store the features with a single column of IDs
    features = pd.DataFrame({'id': df['id'].unique()})

    # Computing the interkeystroke interval (IKI)
    iki = df.groupby('id')['down_time'].diff().fillna(0)

    # Computing mean, median, standard, and maximum IKI
    mean_iki = iki.groupby(df['id']).mean().reset_index()
    mean_iki.columns = ['id', 'mean_iki']

    median_iki = iki.groupby(df['id']).median().reset_index()
    median_iki.columns = ['id', 'median_iki']

    std_iki = iki.groupby(df['id']).std().reset_index()
    std_iki.columns = ['id', 'std_iki']

    max_iki = iki.groupby(df['id']).max().reset_index()
    max_iki.columns = ['id', 'max_iki']

    # Merge with features DataFrame
    features = features.merge(mean_iki, on='id', how='left')
    features = features.merge(median_iki, on='id', how='left')
    features = features.merge(std_iki, on='id', how='left')
    features = features.merge(max_iki, on='id', how='left')

    # ADVANCED FEATURES
    df = compute_iw_iki(df)
    
    # Computing mean, median, standard, and maximum intra-word and inter-word IKI
    mean_intra_word_iki = df.groupby('id')['intra_word_iki'].mean().reset_index()
    mean_intra_word_iki.columns = ['id', 'mean_intra_word_iki']

    mean_inter_word_iki = df.groupby('id')['inter_word_iki'].mean().reset_index()
    mean_inter_word_iki.columns = ['id', 'mean_inter_word_iki']
    
    median_intra_word_iki = df.groupby('id')['intra_word_iki'].median().reset_index()
    median_intra_word_iki.columns = ['id', 'median_intra_word_iki']

    median_inter_word_iki = df.groupby('id')['inter_word_iki'].median().reset_index()
    median_inter_word_iki.columns = ['id', 'median_inter_word_iki']
    
    std_intra_word_iki = df.groupby('id')['intra_word_iki'].std().reset_index()
    std_intra_word_iki.columns = ['id', 'std_intra_word_iki']

    std_inter_word_iki = df.groupby('id')['inter_word_iki'].std().reset_index()
    std_inter_word_iki.columns = ['id', 'std_inter_word_iki']
    
    max_intra_word_iki = df.groupby('id')['intra_word_iki'].max().reset_index()
    max_intra_word_iki.columns = ['id', 'max_intra_word_iki']

    max_inter_word_iki = df.groupby('id')['inter_word_iki'].max().reset_index()
    max_inter_word_iki.columns = ['id', 'max_inter_word_iki']
    
    # Merge the advanced features with the features DataFrame
    features = features.merge(mean_intra_word_iki, on='id', how='left')
    features = features.merge(mean_inter_word_iki, on='id', how='left')
    features = features.merge(median_intra_word_iki, on='id', how='left')
    features = features.merge(median_inter_word_iki, on='id', how='left')
    features = features.merge(std_intra_word_iki, on='id', how='left')
    features = features.merge(std_inter_word_iki, on='id', how='left')
    features = features.merge(max_intra_word_iki, on='id', how='left')
    features = features.merge(max_inter_word_iki, on='id', how='left')
    
    return features


In [43]:
features = iki_features(df_train)

KeyError: 'Column not found: intraw_iki'

In [None]:
print(features)

            id    mean_iki  median_iki      std_iki   max_iki  \
0     001519c8  702.913962       151.0  4295.447374  154173.0   
1     0022f953  716.470660       160.0  4894.385161  145968.0   
2     0042269b  427.170696        94.0  3939.226278  153955.0   
3     0059420b  875.963368       256.0  4247.568454  101808.0   
4     0075873a  625.807981       166.0  3896.405072  110824.0   
...        ...         ...         ...          ...       ...   
2466  ffb8c745  373.309559       118.0  3457.675123  128628.0   
2467  ffbef7e5  682.562212       258.0  5632.013483  268008.0   
2468  ffccd6fd  631.991838       207.0  5399.385611  229911.0   
2469  ffec5b38  459.114744       168.0  3460.439398  127799.0   
2470  fff05981  561.021829       169.0  2987.199508  137693.0   

      mean_intraw_iki  mean_interw_iki  median_intraw_iki  median_interw_iki  \
0                 NaN              NaN                NaN                NaN   
1                 NaN              NaN                NaN  

In [33]:
# Find where median_intraw_iki isnt null
print(features[features['median_intraw_iki'].notnull()])

Empty DataFrame
Columns: [id, mean_iki, median_iki, std_iki, max_iki, mean_intraw_iki, mean_interw_iki, median_intraw_iki, median_interw_iki, std_intraw_iki, std_interw_iki, max_intraw_iki, max_interw_iki]
Index: []
