In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#training_data scores
data_scores = pd.read_csv("../train_scores.csv")
data_scores.head()

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [3]:
# training_data features
data_features = pd.read_csv("../train_logs.csv")
data_features.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [4]:
# Define feature extraction functions
def total_writing_time(df):
    return df['action_time'].sum()

def average_writing_speed(df):
    # Assuming each Input action corresponds to one character
    total_characters = df[df['activity'] == 'Input'].shape[0]
    total_time = total_writing_time(df)
    return total_characters / total_time if total_time > 0 else 0

def total_edits(df):
    return df[df['activity'].isin(['Remove/Cut', 'Paste', 'Replace'])].shape[0]

def text_changes_frequency(df):
    return df[df['text_change'] != 'NoChange'].shape[0]

def final_word_count(df):
    return df.iloc[-1]['word_count']

def cursor_movement_frequency(df):
    # Assuming cursor movements are captured in 'Move From [x1, y1] To [x2, y2]' events
    return df[df['activity'].str.contains("Move")].shape[0]

def error_correction_rate(df):
    return df[df['activity'] == 'Remove/Cut'].shape[0]

def use_of_special_characters(df):
    # Assuming special characters are any non-'q' characters in 'text_change'
    return df[df['text_change'].str.contains(r'[^q]', regex=True)].shape[0]

In [6]:
# Group the data by essay id
grouped_data = data_features.groupby('id')

# Create a new DataFrame for the features
features_df = grouped_data.apply(lambda df: pd.Series({
    'Total Writing Time': total_writing_time(df),
    'Average Writing Speed': average_writing_speed(df),
    'Total Edits': total_edits(df),
    'Text Changes Frequency': text_changes_frequency(df),
    'Final Word Count': final_word_count(df),
    'Cursor Movement Frequency': cursor_movement_frequency(df),
    'Error Correction Rate': error_correction_rate(df),
    'Use of Special Characters': use_of_special_characters(df)
}))

In [7]:
features_df

Unnamed: 0_level_0,Total Writing Time,Average Writing Speed,Total Edits,Text Changes Frequency,Final Word Count,Cursor Movement Frequency,Error Correction Rate,Use of Special Characters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
001519c8,297243.0,0.006762,424.0,2437.0,255.0,3.0,417.0,616.0
0022f953,275391.0,0.007037,262.0,2200.0,320.0,0.0,260.0,756.0
0042269b,421201.0,0.008345,446.0,3961.0,404.0,0.0,439.0,879.0
0059420b,189596.0,0.006878,153.0,1457.0,206.0,0.0,151.0,410.0
0075873a,313702.0,0.006191,517.0,2459.0,252.0,0.0,517.0,567.0
...,...,...,...,...,...,...,...,...
ffb8c745,499670.0,0.007181,962.0,4550.0,273.0,0.0,960.0,1133.0
ffbef7e5,214221.0,0.011180,61.0,2456.0,438.0,0.0,60.0,684.0
ffccd6fd,231580.0,0.012302,88.0,2937.0,201.0,0.0,88.0,2032.0
ffec5b38,289439.0,0.010002,276.0,3171.0,413.0,0.0,276.0,649.0


In [9]:
# Merge the features with the scores
features_with_scores_df = pd.merge(features_df, data_scores, on='id')
