In [1]:
import pandas as pd
import numpy as np

from functools import reduce
from typing import Tuple, List
from collections import Counter


import seaborn as sns
import matplotlib.colors
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib as mpl

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
train_logs = pd.read_csv('input/train_logs.csv')
train_scores = pd.read_csv('input/train_scores.csv')

In [3]:
essay_event_counts = train_logs.groupby("id")["event_id"].count().reset_index(name="event_count")

# Using Plotly for a violin plot
fig = px.violin(essay_event_counts, y='event_count', box=True, points="all", title='Event Count per Essay', color_discrete_sequence=["#c07156"])

# Update layout to customize
fig.update_layout(
    showlegend=False,
    plot_bgcolor='white',
    paper_bgcolor='white',
    title=f"<span style='font-size:26px; font-family:Times New Roman'>Number of Events per Essay</span>",
    font = dict(color = '#7b6b59'),
    xaxis_title='', 
    yaxis_title='Number of Events'
)

fig.show()

In [6]:
#train_logs["activity"].value_counts(dropna=False)
print(f"Total distinct activity types: {train_logs['activity'].nunique()}")
train_logs["activity_type"] = train_logs["activity"].apply(lambda x: "Move" if x.startswith('Move') else x)
print(f"Total general activity types: {train_logs['activity_type'].nunique()}")

Total distinct activity types: 50
Total general activity types: 6


In [7]:
# Let's assume 'activity_counts' is a dictionary with your activity types as keys and their counts as values.
# Example: activity_counts = {'Walking': 150, 'Running': 100, 'Swimming': 50, ...}
activity_counts = train_logs["activity_type"].value_counts(dropna=False).to_dict()

# Calculate total counts and percentages
total_counts = sum(activity_counts.values())
activity_percentages = {activity: f"{(count / total_counts) * 100:.2f}%" for activity, count in activity_counts.items()}

# Sort activities by counts
sorted_activities = sorted(activity_counts.items(), key=lambda item: item[1], reverse=True)

# Separate activities and their counts into two lists for plotting
activities, counts = zip(*sorted_activities)

# Create horizontal bar plot
fig = go.Figure(go.Bar(
    x=counts,
    y=activities,
    text=[f"{activity_percentages[activity]}" for activity, count in sorted_activities],
    textposition='auto',
    orientation='h',
    marker_color=["#E6b6a4"]*len(activity_counts)
))


# Update layout to customize
fig.update_layout(
    showlegend=False,
    plot_bgcolor='white',
    paper_bgcolor='white',
    title=f"<span style='font-size:26px; font-family:Times New Roman'>Most Common Activities by Count and Percentage</span>",
    font=dict(color='#7b6b59'),
    xaxis_title='Count',
    yaxis_title='Activity',
    bargap=0.2,  # Gap between bars can be adjusted as needed
    yaxis={'categoryorder':'total ascending'},  # This will order the bars by count
)

# Show the plot
fig.show()

In [8]:
# Count the frequency of each score
score_counts = Counter(train_scores["score"])

# Sorting the scores and counts so they are plotted in order
sorted_scores = sorted(score_counts.items())

# Separate the scores and their counts into two lists for plotting
x, y = zip(*sorted_scores)
total = sum(y)

# Calculate percentages after sorting
percentages = [f'{(count/total)*100:.2f}%' for count in y]

# Create bar plot with the sorted scores and their corresponding percentages
fig = go.Figure([go.Bar(x=x, y=y, text=percentages, textposition='outside', marker_color=["#c07156"]*len(x))])

# Update layout to customize
fig.update_layout(
    showlegend=False,
    plot_bgcolor='white',
    paper_bgcolor='white',
    title=f"<span style='font-size:26px; font-family:Times New Roman'>Essays Score Distribution</span>",
    font=dict(color='#7b6b59'),
    xaxis_title='Scores',
    yaxis_title='Frequency'
)

# Show the plot
fig.show()

In [9]:

# Group by essay id and calculate the start and end times
grouped_logs = train_logs.groupby("id").agg(start_time=("down_time", "min"), end_time=("up_time", "max"))

# Calculate the duration for each essay
grouped_logs["duration_ms"] = grouped_logs["end_time"] - grouped_logs["start_time"]

# Convert duration from milliseconds to a more readable format if desired, e.g., minutes
grouped_logs["duration_min"] = grouped_logs["duration_ms"] / 60000

# You can calculate summary statistics for these durations
duration_stats = grouped_logs["duration_min"].describe()
grouped_logs.head(10)

Unnamed: 0_level_0,start_time,end_time,duration_ms,duration_min
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
001519c8,4526,1801969,1797443,29.957383
0022f953,30623,1788969,1758346,29.305767
0042269b,4441,1771669,1767228,29.4538
0059420b,41395,1404469,1363074,22.7179
0075873a,78470,1662472,1584002,26.400033
0081af50,42636,1778916,1736280,28.938
0093f095,6572,1768197,1761625,29.360417
009e23ab,106549,1799303,1692754,28.212567
00e048f1,5273,1797498,1792225,29.870417
00e1f05a,10731,1853697,1842966,30.7161


In [10]:
# Assuming 'grouped_logs' is your DataFrame and it has a column named 'duration_min'.

# Create a subplot with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2, subplot_titles=('Boxplot of Essay Completion Times', 'Density Plot of Essay Completion Times'))

# Boxplot
box_trace = go.Box(y=grouped_logs['duration_min'], name='',
                   marker_color='#e5b01c', line_color='#e5b01c')

# Density Plot
density_trace = go.Histogram(x=grouped_logs['duration_min'], name='', 
                             histnorm='probability density', 
                             marker_color='#e5b01c')

# Adding box trace to the first column
fig.add_trace(box_trace, row=1, col=1)

# Adding density trace to the second column
fig.add_trace(density_trace, row=1, col=2)

# Update layout to customize
fig.update_layout(
    showlegend=False,
    plot_bgcolor='white',
    paper_bgcolor='white',
    title=f"<span style='font-size:26px; font-family:Times New Roman'>Essay Completion Time Analysis</span>",
    font = dict(color = '#7b6b59'),
)

# Customize x-axis and y-axis of the density plot
fig.update_xaxes(title_text="Duration (minutes)", row=1, col=2)

# Customize y-axis of the boxplot
fig.update_yaxes(title_text="Duration (minutes)", row=1, col=1)


# Remove gridlines
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# Show the figure
fig.show()

In [12]:
# Sort the DataFrame based on the essay 'id' and 'event_id'
df_sorted = train_logs.sort_values(by=['id', 'event_id'])

# Group by 'id' and get the last entry for each essay
df_last_event = df_sorted.groupby('id').last().reset_index()

# The 'word_count' column of this DataFrame now holds the word count after the last event for each essay
final_word_counts = df_last_event['word_count']

# If you need to calculate the average word count across all essays:
average_word_count = final_word_counts.mean()

print(f"The average word count per essay after the last event is: {average_word_count}")

The average word count per essay after the last event is: 386.112100364225


In [13]:
# Create the density plot
fig = ff.create_distplot([df_last_event['word_count']], ['Word Count'], bin_size=30, colors=['#e5b01c'])

# Update layout to customize
fig.update_layout(
    showlegend=False,
    plot_bgcolor='white',
    paper_bgcolor='white',
    title=f"<span style='font-size:26px; font-family:Times New Roman'>Density Plot of Essay Word Counts</span>",
    font = dict(color = '#7b6b59'),
    xaxis=dict(title='Word Count'),
    yaxis=dict(title='Density')
)

# Show the figure
fig.show()

In [14]:
train_logs = train_logs.sort_values(by=['id', 'event_id'])

# Calculate IKI with a groupby and transform, which will reset the diff() calculation for each group
train_logs["IKI"] = train_logs.groupby('id')['down_time'].diff().fillna(0)

# Define a pause threshold (in milliseconds)
pause_threshold = 2000  # This can be adjusted based on your needs

# Identify pauses
train_logs["is_pause"] = train_logs["IKI"] > pause_threshold

# Group data by essay ID
grouped_data = train_logs.groupby("id")

# Calculate features
pause_features = grouped_data.apply(lambda x: pd.Series({
    "number_of_pauses": x["is_pause"].sum(),
    "total_pause_time": x[x["is_pause"]]['IKI'].sum(),
    "mean_pause_length": x[x["is_pause"]]['IKI'].mean()
}))

# Calculate total writing time for each essay
pause_features["total_writing_time"] = grouped_data["action_time"].sum()

# Calculate the proportion of pause time
pause_features["proportion_of_pause_time"] = pause_features["total_pause_time"] / pause_features["total_writing_time"]

# Fill NaN values with appropriate values for mean_pause_length
pause_features['mean_pause_length'] = pause_features['mean_pause_length'].fillna(0)

# Reset index to flatten the DataFrame after groupby apply
pause_features.reset_index(inplace=True)

In [17]:
def plot_event_intervals(sample_data, essay_id, score):
    """Plots the intervals between events over time for a specified essay.
    
    Args:
        sample_data: A pandas DataFrame containing the event data.
        essay_id: A string representing the ID of the essay to plot.
        score: The score of the essay to include in the plot title.
    
    Returns:
        A Plotly graph object.
    """
    # Sort the sample data by event_id
    sample_data.sort_values('event_id', inplace=True)

    # Calculate the intervals (IKI)
    sample_data['IKI'] = sample_data['IKI'] / 1000  # Convert milliseconds to seconds

    # Convert 'down_time' to a datetime format for plotting
    sample_data['down_time'] = pd.to_datetime(sample_data['down_time'], unit='ms')

    # Create the plot using Plotly
    fig = px.line(sample_data, x='down_time', y='IKI', title=f"Time between events for Essay ID: {essay_id} with score {score}")

    # Format the x-axis to show time nicely
    fig.update_xaxes(
        tickformat='%H:%M:%S',
        dtick=60000  # 60-second interval; adjust as needed
    )
    fig.update_traces(line_color='#c07156')  # You can use any valid CSS color.

    # Label your axes and update the layout
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis_title='Time',
        yaxis_title='Interval between events (seconds)',
        xaxis_tickangle=-45,
        font=dict(color='#7b6b59'),
        title=f"<span style='font-size:26px; font-family:Times New Roman'>Time between events for Essay ID: {essay_id} with score {score}</span>",
    )

    # Show the plot
    fig.show()

In [18]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

def plot_essay_activities(dataframe, essay_id, title):
    """
    Creates a Plotly subplot visualization for different activity types over time for a given essay.
    
    Args:
        dataframe (pd.DataFrame): The DataFrame containing the essay activities data.
        essay_id (str): The ID of the essay to visualize.
        title (str): The title for the plot.
    
    Returns:
        None: Displays a Plotly figure.
    """
    # Define specific colors for each activity type
    activity_colors = {
        'Nonproduction': '#edc860',
        'Input': '#c07156',
        'Remove/Cut': '#beb29e',
        'Replace': '#b39a74',
        'Move': '#beb29e',
        'Paste': '#a43725'
    }

    # Select the data for a single essay
    single_essay_df = dataframe[dataframe['id'] == essay_id]
    # Convert 'down_time' to a datetime format for plotting
    single_essay_df['down_time'] = pd.to_datetime(single_essay_df['down_time'], unit='ms')

    # Create a subplot for each activity type
    activities = single_essay_df['activity_type'].unique()
    subplot_titles = [f"Activity: {activity}" for activity in activities]
    fig = make_subplots(rows=len(activities), cols=1, subplot_titles=subplot_titles)

    # Add a scatter plot for each activity type with its specific color
    for i, activity in enumerate(activities, start=1):
        activity_df = single_essay_df[single_essay_df['activity_type'] == activity]
        # Use the specific color for the activity if it's defined, else default to black
        color = activity_colors.get(activity, '#000000')
        fig.add_trace(
            go.Scatter(x=activity_df['down_time'], y=activity_df['word_count'],
                       mode='lines+markers', name=activity, line=dict(color=color)),
            row=i, col=1
        )
    
    # Format the x-axis to show time nicely
    fig.update_xaxes(
        tickformat='%H:%M:%S',
        dtick=60000  # 60-second interval; adjust as needed
    )
    # Update layout
    fig.update_layout(
        showlegend=True,
        plot_bgcolor='white',
        paper_bgcolor='white',
        height=300*len(activities),
        title_text=title,
        font=dict(color='#7b6b59'),
        xaxis_tickangle=-45
    )

    # Show the plot
    fig.show()

In [20]:
# Filter the data for the chosen essay ID
#essay_id = "40b28508"
essay_id = "c3663a2d"
#essay_id = train_scores[train_scores["score"] == 0.5]["id"].sample(1).iloc[0]
sample_data = train_logs[
    train_logs["id"] == essay_id
]
plot_event_intervals(sample_data, "c3663a2d", "0.5")

In [21]:
# Normalize 'down_time' to the start of each essay
train_logs['time_since_start'] = train_logs.groupby('id')['down_time'].transform(lambda x: x - x.min())

# Convert 'time_since_start' to seconds for plotting
train_logs['time_since_start'] = train_logs['time_since_start'] / 1000  # convert milliseconds to seconds

In [22]:

essay_id = "c3663a2d"
plot_essay_activities(train_logs, essay_id, f"Word Count and Activities Over Time for Essay ID {essay_id} with score 0.5")

In [23]:

# Filter the data for the chosen essay ID
#essay_id = "e01ec054"
#essay_id = "f9fd3268"
#print(essay_id)
essay_id = "1343a4d2"
#essay_id = train_scores[train_scores["score"] == 6.0]["id"].sample(1).iloc[0]
sample_data = train_logs[
    train_logs["id"] == essay_id
]
plot_event_intervals(sample_data, essay_id, "6.0")

In [24]:

essay_id = "1343a4d2"
plot_essay_activities(train_logs, essay_id, f"Word Count and Activities Over Time for Essay ID {essay_id} with score 6.0")

In [26]:
# Define a global list of all possible activity types
ALL_ACTIVITY_TYPES = ["Nonproduction", "Input", "Remove/Cut", "Paste", "Replace", "Move"]

def generate_behavioral_features(df):
    """Generates aggregated features from a log dataset of essay compositions.

    Args:
        df (pandas.DataFrame): A dataframe containing keystroke and mouse click logs of essay compositions.

    Returns:
        pandas.DataFrame: A dataframe with aggregated features for each essay.
    """
    df = df.sort_values(by=["id", "event_id"])
    
    # Calculate the number of activities per category for each essay
    events_per_category = df.groupby(["id", "activity_type"]).size().unstack(fill_value=0)
    # Ensure all activity types are represented
    events_per_category = events_per_category.reindex(columns=ALL_ACTIVITY_TYPES, fill_value=0)

    # Calculate the total number of activities for each essay
    total_events = df.groupby("id").size().rename("total_activities")

    # Merge the DataFrames
    final_df = pd.merge(events_per_category, total_events.to_frame(), left_index=True, right_index=True)

    # Calculate the average time per event for each essay and activity
    average_time_per_activity = df.groupby(["id", "activity_type"])["action_time"].mean().unstack(fill_value=0)
    average_time_per_activity = average_time_per_activity.reindex(columns=ALL_ACTIVITY_TYPES, fill_value=0)
    final_df = pd.merge(final_df, average_time_per_activity, left_index=True, right_index=True, suffixes=('', '_avg_time'))

    # Group by essay id and calculate the start and end times
    grouped_logs = df.groupby("id").agg(start_time=("down_time", "min"), end_time=("up_time", "max"))

    # Calculate the duration for each essay
    grouped_logs["duration_ms"] = grouped_logs["end_time"] - grouped_logs["start_time"]

    # Convert duration from milliseconds to a more readable format if desired, e.g., minutes
    grouped_logs["duration_min"] = grouped_logs["duration_ms"] / 60000

    final_df = pd.merge(final_df, grouped_logs, left_index=True, right_index=True)

    # Get the final word count for each essay
    final_word_count = df.groupby("id")["word_count"].last()
    final_df = pd.merge(final_df, final_word_count, left_index=True, right_index=True)

    # Calculate the deletion to insertion ratio
    deletion_to_insertion_ratio = final_df["Remove/Cut"] / final_df["Input"]

    # Handle division by zero if there are no 'Input' events
    deletion_to_insertion_ratio = deletion_to_insertion_ratio.fillna(0)

    # Add this as a new column to your DataFrame
    final_df["deletion_to_insertion_ratio"] = deletion_to_insertion_ratio
    
    # Calculate basic statistics for action_time for each essay
    action_time_stats = df.groupby("id")["action_time"].agg(["mean", "median", "std", "min", "max", "skew"])

    # Rename the columns to include '_action_time' suffix
    action_time_stats.columns = [f"{col}_action_time" for col in action_time_stats.columns]

    # Merge these statistics with your existing DataFrame
    # Make sure that 'id' column exists in both DataFrames
    final_df = pd.merge(final_df, action_time_stats, on="id")

    # Calculate words per minute for each essay
    final_df["WPM"] = final_df["word_count"] / final_df["duration_min"]
    final_df = final_df.reset_index()
    final_df = final_df[[
        "id",
        "Nonproduction",
        "Input",
        "Remove/Cut",
        "Paste",
        "Replace",
        "Move",
        "total_activities",
        "duration_min",
        "Nonproduction_avg_time",
        "Input_avg_time",
        "Remove/Cut_avg_time",
        "Paste_avg_time",
        "Replace_avg_time",
        "Move_avg_time",
        "word_count",
        "deletion_to_insertion_ratio",
        "WPM",
        "mean_action_time", 
        "median_action_time", 
        "std_action_time",
        "min_action_time",
        "max_action_time",
        "skew_action_time"
    ]]

    return final_df

In [27]:
train_features = generate_behavioral_features(train_logs)

In [29]:
def calculate_pause_features(essay_df, pause_threshold=2000):
    """Calculate various pause-related features for an essay.

    This function computes the Inter-Keystroke Intervals (IKIs) for each keystroke
    in the given dataframe, identifies pauses based on a specified threshold,
    calculates features such as the number of pauses, total pause time, mean pause length,
    and the proportion of pause time.

    Args:
        essay_df (pd.DataFrame): A dataframe representing the keystroke log of a single essay. 
                                 It should contain at least 'down_time' and 'up_time' columns.
        pause_threshold (int, optional): The threshold (in milliseconds) to define a pause. 
                                         A pause is considered when the IKI is greater than this value. 
                                         Defaults to 2000 milliseconds.

    Returns:
        pd.Series: A Series containing the calculated pause features.
    """
    # Calculate IKIs
    # This line shifts the down_time column up by one row, so that each down_time aligns with the up_time of the previous keystroke.
    # The subtraction then gives the IKI for each pair of consecutive keystrokes.
    #essay_df["IKI"] = essay_df["down_time"].shift(-1) - essay_df["up_time"]    
    # Calculate IKI with a groupby and transform, which will reset the diff() calculation for each group
    essay_df["IKI"] = essay_df["down_time"].diff().fillna(0)

    # Identify pauses (IKI > 2000 milliseconds)
    pauses = essay_df[essay_df["IKI"] > pause_threshold]

    # Calculate pause features
    num_pauses = len(pauses)

    total_pause_time = pauses["IKI"].sum()
    total_writing_time = essay_df["up_time"].max() - essay_df["down_time"].min()
    proportion_pause_time = (total_pause_time / total_writing_time) * 100 if total_writing_time != 0 else 0
    mean_pause_length = pauses["IKI"].mean() if num_pauses != 0 else 0
    mean_pause_length = mean_pause_length / 60000
    total_pause_time = total_pause_time / 60000

    return pd.Series({
        "num_pauses": num_pauses,
        "total_pause_time": total_pause_time,
        "proportion_pause_time": proportion_pause_time,
        "mean_pause_length": mean_pause_length
    })

In [30]:
# Sort the DataFrame based on the essay 'id' and 'event_id'
train_logs = train_logs.sort_values(by=["id", "event_id"])

# Group by essay ID and calculate features for each essay
# This calculation is done for each essay separately to ensure
# that IKIs are only calculated within the boundaries of a single essay.
pause_features = train_logs.groupby("id").apply(calculate_pause_features).reset_index()

# Display the results
pause_features.head()

Unnamed: 0,id,num_pauses,total_pause_time,proportion_pause_time,mean_pause_length
0,001519c8,135.0,20.384133,68.043771,0.150994
1,0022f953,85.0,19.905383,67.923094,0.234181
2,0042269b,84.0,18.99445,64.488962,0.226124
3,0059420b,93.0,13.226567,58.220904,0.142221
4,0075873a,92.0,17.758617,67.267402,0.193028


In [31]:
final_df = pd.merge(train_features, pause_features, on="id")

In [32]:
final_df = final_df.merge(train_scores, on="id")

# Separate features and target variable
X = final_df.drop(["id", "score"], axis=1)
y = final_df["score"]

In [33]:
def round_scores(scores):
    scores = ((scores * 2) // 1) * .5
    scores = np.where(scores < 0, 0.0, scores)
    scores = np.where(scores > 6, 6.0, scores)
    return scores

In [36]:
# Initialize the XGBoost regressor
from xgboost import XGBRegressor

def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((y_true-y_pred)**2))

y=final_df["score"].values
X=final_df.drop(["id", "score"], axis=1).values
# model=XGBRegressor(base_score=None, booster=None, callbacks=[],
#              colsample_bylevel=0.4893144868569437, colsample_bynode=None,
#              colsample_bytree=1.0, early_stopping_rounds=None,
#              enable_categorical=False, eval_metric=None, feature_types=None,
#              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
#              interaction_constraints=None, learning_rate=0.030367573025864663,
#              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
#              max_delta_step=None, max_depth=2, max_leaves=None,
#              min_child_weight=4.085961297024359,
#              monotone_constraints=None, n_estimators=739, n_jobs=-1,
#              num_parallel_tree=None, predictor=None, random_state=2023)
# model = CatBoostRegressor(
#     iterations=200,         
#     learning_rate=0.05,       
#     depth=6,                
#     loss_function='RMSE',    
#     logging_level='Silent', 
#     random_seed=42
# )

best_params = {'boosting_type': 'gbdt', 
               'metric': 'rmse',
              # 'reg_alpha': 0.003188447814669599, 
              # 'reg_lambda': 0.0010228604507564066, 
               #'colsample_bytree': 0.5420247656839267, 
               #'subsample': 0.9778252382803456, 
               #'feature_fraction': 0.8,
               'bagging_freq': 1,
               'bagging_fraction': 0.75,
               'learning_rate': 0.05, 
               'num_leaves': 19, 
               'min_child_samples': 46,
               'verbosity': -1,
               'random_state': 42,
               'n_estimators': 500,
               'device_type': 'cpu'}

params = {
            "objective": "regression",
            "metric": "rmse",
            'random_state': 42,
            "n_estimators" : 12001,
            "verbosity": -1,
            "device_type": "cpu",
            **best_params
}

model = lgb.LGBMRegressor(**params)
model.fit(X,y)
train_pred=model.predict(X)
train_pred=np.where(train_pred<=0,0,train_pred)
train_pred=np.where(train_pred>=6,6,train_pred)
print(f"RMSE:{RMSE(y,train_pred)}")

RMSE:0.3002744171013128
