In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
#training_data scores
data_scores = pd.read_csv("/content/drive/MyDrive/linking-writing-processes-to-writing-quality-local/train_scores.csv")
data_scores.head()

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [None]:
# training_data features
data_features = pd.read_csv("/content/drive/MyDrive/linking-writing-processes-to-writing-quality-local/train_logs.csv")
data_features.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [None]:
# Define feature extraction functions
def total_writing_time(df):
    return df['action_time'].sum()

def average_writing_speed(df):
    # Assuming each Input action corresponds to one character
    total_characters = df[df['activity'] == 'Input'].shape[0]
    total_time = total_writing_time(df)
    return total_characters / total_time if total_time > 0 else 0

def total_edits(df):
    return df[df['activity'].isin(['Remove/Cut', 'Paste', 'Replace'])].shape[0]

def count_remove_cut(df):
    """Counts occurrences of 'Remove/Cut' activity."""
    return df[df['activity'] == 'Remove/Cut'].shape[0]

def count_paste(df):
    """Counts occurrences of 'Paste' activity."""
    return df[df['activity'] == 'Paste'].shape[0]

def count_replace(df):
    """Counts occurrences of 'Replace' activity."""
    return df[df['activity'] == 'Replace'].shape[0]

def text_changes_frequency(df):
    return df[df['text_change'] != 'NoChange'].shape[0]

def final_word_count(df):
    return df.iloc[-1]['word_count']

def cursor_movement_frequency(df):
    # Assuming cursor movements are captured in 'Move From [x1, y1] To [x2, y2]' events
    return df[df['activity'].str.contains("Move")].shape[0]

def error_correction_rate(df):
    return df[df['activity'] == 'Remove/Cut'].shape[0]

def use_of_special_characters(df):
    # Assuming special characters are any non-'q' characters in 'text_change'
    return df[df['text_change'].str.contains(r'[^q]', regex=True)].shape[0]

def average_sentence_length(df):
    # Assuming that 'text_change' column contains sentence delimiters to calculate sentence length
    # This is a placeholder; you would need to replace it with the actual logic for sentence detection
    df['sentence_count'] = df['text_change'].apply(lambda x: x.count('.') + x.count('!') + x.count('?'))
    total_sentences = df['sentence_count'].sum()
    total_words = df['word_count'].sum()
    return total_words / total_sentences if total_sentences else 0

# Function to analyze keystroke dynamics
def keystroke_dynamics(df):
    # Assuming 'down_time' and 'up_time' are in milliseconds
    df['keystroke_interval'] = df['up_time'] - df['down_time']
    average_time_between_keystrokes = df['keystroke_interval'].mean()
    return average_time_between_keystrokes

# Function to count the frequency of different actions
def frequency_of_actions(df):
    action_counts = df['activity'].value_counts().to_dict()
    return action_counts

# Function to analyze text structure changes
def text_structure_changes(df):
    # Assuming that large text changes can be inferred from 'text_change' events
    large_changes = df[df['text_change'].str.len() > 50]  # Placeholder threshold for large text blocks
    return large_changes.shape[0]

# Function to measure the frequency and duration of pauses
def frequency_of_pauses(df):
    # Assuming that a pause is when there is no 'activity'
    pauses = df[df['activity'] == 'Nonproduction']
    average_pause_duration = pauses['action_time'].mean()
    frequency_of_pauses = pauses.shape[0]
    return frequency_of_pauses, average_pause_duration

In [None]:
# Group the data by essay id
grouped_data = data_features.groupby('id')

# Create a new DataFrame for the features
features_df = grouped_data.apply(lambda df: pd.Series({
    'Total Writing Time': total_writing_time(df),
    'Average Writing Speed': average_writing_speed(df),
    'Total Edits': total_edits(df),
    'Text Changes Frequency': text_changes_frequency(df),
    'Final Word Count': final_word_count(df),
    'Cursor Movement Frequency': cursor_movement_frequency(df),
    'Error Correction Rate': error_correction_rate(df),
    'Use of Special Characters': use_of_special_characters(df),
    'Number of Remove/Cut': count_remove_cut(df),
    'Number of Paste': count_paste(df),
    'Number of Replace': count_replace(df),
    'Average Sentence Length': average_sentence_length(df),
    'Keystroke Dynamics': keystroke_dynamics(df),
    # 'Frequency of Actions': frequency_of_actions(df),  # This should be a scalar value or a series that can be summarized in a single cell
    'Text Structure Changes': text_structure_changes(df),
    'Frequency of Pauses': frequency_of_pauses(df)[0],  # Assuming this function returns a tuple, take the first element
    'Average Pause Duration': frequency_of_pauses(df)[1]
}))

In [None]:
# Merge the features and scores on 'id'
data = pd.merge(data_features, data_scores, on='id', how='left')

def aggregate_data(df, agg_methods):
    # Perform the aggregation using the provided methods
    data_aggregated = df.groupby('id').agg(agg_methods).reset_index()

    # Flatten the MultiIndex columns and add the aggregation methods to the column names
    if isinstance(data_aggregated.columns, pd.MultiIndex):
        # Use a conditional expression to leave 'score' as is
        data_aggregated.columns = [f"{col[0]}_{col[1]}" if col[0] != 'score' else col[0] for col in data_aggregated.columns.values]

    return data_aggregated

# Define the aggregation methods outside of the function
aggregation_methods = {
    'down_time': ['sum'],
    'up_time': ['sum'],
    'action_time': ['sum'],
    'cursor_position': ['max'],
    'word_count': ['max'],
    'score': ['mean'],  # This will be left as 'score' in the column name
}

# Assume 'data' is a DataFrame already loaded
# Call the function with the data and the aggregation methods
data_aggregated = aggregate_data(data, aggregation_methods)
data_aggregated.rename(columns={'id_': 'id'}, inplace=True)
print(data_aggregated)

            id  down_time_sum  up_time_sum  action_time_sum  \
0     001519c8     2168798234   2169095477           297243   
1     0022f953     1273271023   1273546414           275391   
2     0042269b     3426641982   3427063183           421201   
3     0059420b     1222211589   1222401185           189596   
4     0075873a     1805499474   1805813176           313702   
...        ...            ...          ...              ...   
2466  ffb8c745     3488386746   3488886416           499670   
2467  ffbef7e5     2192480040   2192694261           214221   
2468  ffccd6fd     3764472937   3764704517           231580   
2469  ffec5b38     1869073112   1869362551           289439   
2470  fff05981     3897099261   3897400496           301235   

      cursor_position_max  word_count_max  score  
0                    1539             256   3.50  
1                    1676             323   3.50  
2                    2291             404   6.00  
3                    1047             2

In [None]:
features_df.head()

Unnamed: 0_level_0,Total Writing Time,Average Writing Speed,Total Edits,Text Changes Frequency,Final Word Count,Cursor Movement Frequency,Error Correction Rate,Use of Special Characters,Number of Remove/Cut,Number of Paste,Number of Replace,Average Sentence Length,Keystroke Dynamics,Text Structure Changes,Frequency of Pauses,Average Pause Duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
001519c8,297243.0,0.01,424.0,2437.0,255.0,3.0,417.0,616.0,417.0,0.0,7.0,11699.75,116.25,0.0,120.0,154.22
0022f953,275391.0,0.01,262.0,2200.0,320.0,0.0,260.0,756.0,260.0,1.0,1.0,21351.52,112.22,0.0,254.0,54.26
0042269b,421201.0,0.01,446.0,3961.0,404.0,0.0,439.0,879.0,439.0,0.0,7.0,35025.22,101.84,5.0,175.0,194.01
0059420b,189596.0,0.01,153.0,1457.0,206.0,0.0,151.0,410.0,151.0,1.0,1.0,12402.38,121.85,0.0,99.0,30.93
0075873a,313702.0,0.01,517.0,2459.0,252.0,0.0,517.0,567.0,517.0,0.0,0.0,9311.32,123.94,0.0,72.0,97.06


In [None]:
data_aggregated

Unnamed: 0,id,down_time_sum,up_time_sum,action_time_sum,cursor_position_max,word_count_max,score
0,001519c8,2168798234,2169095477,297243,1539,256,3.50
1,0022f953,1273271023,1273546414,275391,1676,323,3.50
2,0042269b,3426641982,3427063183,421201,2291,404,6.00
3,0059420b,1222211589,1222401185,189596,1047,206,2.00
4,0075873a,1805499474,1805813176,313702,1402,252,4.00
...,...,...,...,...,...,...,...
2466,ffb8c745,3488386746,3488886416,499670,1634,461,3.50
2467,ffbef7e5,2192480040,2192694261,214221,1877,438,4.00
2468,ffccd6fd,3764472937,3764704517,231580,2761,201,1.50
2469,ffec5b38,1869073112,1869362551,289439,2133,413,5.00


In [None]:
# Merge the features with the scores
merged_df = pd.merge(features_df, data_aggregated, on='id')

In [None]:
# Convert scores to string categories
y = merged_df['score'].apply(lambda x: '{:.1f}'.format(x))

# Define features
X = merged_df.drop(['id', 'score'], axis=1)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the model
dt_classifier.fit(X_train, y_train)

# Predict on training data for the training report and accuracy
y_train_pred = dt_classifier.predict(X_train)
training_accuracy = accuracy_score(y_train, y_train_pred)
training_report = classification_report(y_train, y_train_pred)

# Predict on validation set for the validation report and accuracy
y_val_pred = dt_classifier.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)
validation_report = classification_report(y_val, y_val_pred)

# Print training and validation accuracies
print(f"Training Accuracy: {training_accuracy}")
print(f"Validation Accuracy: {validation_accuracy}")

# Print classification reports
print("\nTraining Classification Report:\n", training_report)
print("Validation Classification Report:\n", validation_report)

Training Accuracy: 1.0
Validation Accuracy: 0.24646464646464647

Training Classification Report:
               precision    recall  f1-score   support

         0.5       1.00      1.00      1.00         5
         1.0       1.00      1.00      1.00        31
         1.5       1.00      1.00      1.00        56
         2.0       1.00      1.00      1.00        73
         2.5       1.00      1.00      1.00       160
         3.0       1.00      1.00      1.00       267
         3.5       1.00      1.00      1.00       384
         4.0       1.00      1.00      1.00       395
         4.5       1.00      1.00      1.00       331
         5.0       1.00      1.00      1.00       142
         5.5       1.00      1.00      1.00       102
         6.0       1.00      1.00      1.00        30

    accuracy                           1.00      1976
   macro avg       1.00      1.00      1.00      1976
weighted avg       1.00      1.00      1.00      1976

Validation Classification Report:
 

In [None]:
from lazypredict.Supervised import LazyClassifier

# Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit models and retrieve performance metrics
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Print model performance
print(models)

 90%|████████▉ | 26/29 [00:15<00:00,  3.69it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4388
[LightGBM] [Info] Number of data points in the train set: 1976, number of used features: 21
[LightGBM] [Info] Start training from score -5.979392
[LightGBM] [Info] Start training from score -4.154843
[LightGBM] [Info] Start training from score -3.563478
[LightGBM] [Info] Start training from score -3.298370
[LightGBM] [Info] Start training from score -2.513656
[LightGBM] [Info] Start training from score -2.001581
[LightGBM] [Info] Start training from score -1.638187
[LightGBM] [Info] Start training from score -1.609944
[LightGBM] [Info] Start training from score -1.786712
[LightGBM] [Info] Start training from score -2.633003
[LightGBM] [Info] Start training from score -2.963857
[LightGBM] [Info] Start training from score -4.187632


100%|██████████| 29/29 [00:18<00:00,  1.59it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
ExtraTreesClassifier               0.31               0.22    None      0.29   
BaggingClassifier                  0.30               0.22    None      0.30   
KNeighborsClassifier               0.26               0.22    None      0.26   
LGBMClassifier                     0.33               0.22    None      0.31   
LogisticRegression                 0.34               0.21    None      0.31   
ExtraTreeClassifier                0.25               0.21    None      0.26   
RandomForestClassifier             0.29               0.20    None      0.28   
SVC                                0.35               0.19    None      0.30   
NearestCentroid                    0.18               0.19    None      0.20   
LinearDiscriminantAnalysis         0.34               0.19    None      0.30   
CalibratedClassifierCV             0.34 




In [None]:
merged_df

Unnamed: 0,id,Total Writing Time,Average Writing Speed,Total Edits,Text Changes Frequency,Final Word Count,Cursor Movement Frequency,Error Correction Rate,Use of Special Characters,Number of Remove/Cut,...,Keystroke Dynamics,Text Structure Changes,Frequency of Pauses,Average Pause Duration,down_time_sum,up_time_sum,action_time_sum,cursor_position_max,word_count_max,score
0,001519c8,297243.00,0.01,424.00,2437.00,255.00,3.00,417.00,616.00,417.00,...,116.25,0.00,120.00,154.22,2168798234,2169095477,297243,1539,256,3.50
1,0022f953,275391.00,0.01,262.00,2200.00,320.00,0.00,260.00,756.00,260.00,...,112.22,0.00,254.00,54.26,1273271023,1273546414,275391,1676,323,3.50
2,0042269b,421201.00,0.01,446.00,3961.00,404.00,0.00,439.00,879.00,439.00,...,101.84,5.00,175.00,194.01,3426641982,3427063183,421201,2291,404,6.00
3,0059420b,189596.00,0.01,153.00,1457.00,206.00,0.00,151.00,410.00,151.00,...,121.85,0.00,99.00,30.93,1222211589,1222401185,189596,1047,206,2.00
4,0075873a,313702.00,0.01,517.00,2459.00,252.00,0.00,517.00,567.00,517.00,...,123.94,0.00,72.00,97.06,1805499474,1805813176,313702,1402,252,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,499670.00,0.01,962.00,4550.00,273.00,0.00,960.00,1133.00,960.00,...,105.44,1.00,189.00,27.53,3488386746,3488886416,499670,1634,461,3.50
2467,ffbef7e5,214221.00,0.01,61.00,2456.00,438.00,0.00,60.00,684.00,60.00,...,82.27,0.00,148.00,44.48,2192480040,2192694261,214221,1877,438,4.00
2468,ffccd6fd,231580.00,0.01,88.00,2937.00,201.00,0.00,88.00,2032.00,88.00,...,75.61,0.00,126.00,81.21,3764472937,3764704517,231580,2761,201,1.50
2469,ffec5b38,289439.00,0.01,276.00,3171.00,413.00,0.00,276.00,649.00,276.00,...,89.28,1.00,71.00,79.21,1869073112,1869362551,289439,2133,413,5.00


In [None]:
data.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5
