In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
#training_data scores
data_scores = pd.read_csv("/content/drive/MyDrive/linking-writing-processes-to-writing-quality-local/train_scores.csv")
data_scores.head()

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [6]:
# training_data features
data_features = pd.read_csv("/content/drive/MyDrive/linking-writing-processes-to-writing-quality-local/train_logs.csv")
data_features.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [7]:
# Merge the features and scores on 'id'
data = pd.merge(data_features, data_scores, on='id', how='left')

def aggregate_data(df, agg_methods):
    # Perform the aggregation using the provided methods
    data_aggregated = df.groupby('id').agg(agg_methods).reset_index()

    # Flatten the MultiIndex columns and add the aggregation methods to the column names
    if isinstance(data_aggregated.columns, pd.MultiIndex):
        # Use a conditional expression to leave 'score' as is
        data_aggregated.columns = [f"{col[0]}_{col[1]}" if col[0] != 'score' else col[0] for col in data_aggregated.columns.values]

    return data_aggregated

# Define the aggregation methods outside of the function
aggregation_methods = {
    'down_time': ['sum'],
    'up_time': ['sum'],
    'action_time': ['sum'],
    'cursor_position': ['max'],
    'word_count': ['max'],
    'score': ['mean'],  # This will be left as 'score' in the column name
}

# Assume 'data' is a DataFrame already loaded
# Call the function with the data and the aggregation methods
data_aggregated = aggregate_data(data, aggregation_methods)
data_aggregated.rename(columns={'id_': 'id'}, inplace=True)
print(data_aggregated)

            id  down_time_sum  up_time_sum  action_time_sum  \
0     001519c8     2168798234   2169095477           297243   
1     0022f953     1273271023   1273546414           275391   
2     0042269b     3426641982   3427063183           421201   
3     0059420b     1222211589   1222401185           189596   
4     0075873a     1805499474   1805813176           313702   
...        ...            ...          ...              ...   
2466  ffb8c745     3488386746   3488886416           499670   
2467  ffbef7e5     2192480040   2192694261           214221   
2468  ffccd6fd     3764472937   3764704517           231580   
2469  ffec5b38     1869073112   1869362551           289439   
2470  fff05981     3897099261   3897400496           301235   

      cursor_position_max  word_count_max  score  
0                    1539             256    3.5  
1                    1676             323    3.5  
2                    2291             404    6.0  
3                    1047             2

In [8]:
# Define feature extraction functions
def total_writing_time(df):
    return df['action_time'].sum()

def average_writing_speed(df):
    # Assuming each Input action corresponds to one character
    total_characters = df[df['activity'] == 'Input'].shape[0]
    total_time = total_writing_time(df)
    return total_characters / total_time if total_time > 0 else 0

def total_edits(df):
    return df[df['activity'].isin(['Remove/Cut', 'Paste', 'Replace'])].shape[0]

def count_remove_cut(df):
    """Counts occurrences of 'Remove/Cut' activity."""
    return df[df['activity'] == 'Remove/Cut'].shape[0]

def count_paste(df):
    """Counts occurrences of 'Paste' activity."""
    return df[df['activity'] == 'Paste'].shape[0]

def count_replace(df):
    """Counts occurrences of 'Replace' activity."""
    return df[df['activity'] == 'Replace'].shape[0]

def text_changes_frequency(df):
    return df[df['text_change'] != 'NoChange'].shape[0]

def final_word_count(df):
    return df.iloc[-1]['word_count']

def cursor_movement_frequency(df):
    # Assuming cursor movements are captured in 'Move From [x1, y1] To [x2, y2]' events
    return df[df['activity'].str.contains("Move")].shape[0]

def error_correction_rate(df):
    return df[df['activity'] == 'Remove/Cut'].shape[0]

def use_of_special_characters(df):
    # Assuming special characters are any non-'q' characters in 'text_change'
    return df[df['text_change'].str.contains(r'[^q]', regex=True)].shape[0]

def average_sentence_length(df):
    # Assuming that 'text_change' column contains sentence delimiters to calculate sentence length
    # This is a placeholder; you would need to replace it with the actual logic for sentence detection
    df['sentence_count'] = df['text_change'].apply(lambda x: x.count('.') + x.count('!') + x.count('?'))
    total_sentences = df['sentence_count'].sum()
    total_words = df['word_count'].sum()
    return total_words / total_sentences if total_sentences else 0

# Function to analyze keystroke dynamics
def keystroke_dynamics(df):
    # Assuming 'down_time' and 'up_time' are in milliseconds
    df['keystroke_interval'] = df['up_time'] - df['down_time']
    average_time_between_keystrokes = df['keystroke_interval'].mean()
    return average_time_between_keystrokes

# Function to count the frequency of different actions
def frequency_of_actions(df):
    action_counts = df['activity'].value_counts().to_dict()
    return action_counts

# Function to analyze text structure changes
def text_structure_changes(df):
    # Assuming that large text changes can be inferred from 'text_change' events
    large_changes = df[df['text_change'].str.len() > 50]  # Placeholder threshold for large text blocks
    return large_changes.shape[0]

# Function to measure the frequency and duration of pauses
def frequency_of_pauses(df):
    # Assuming that a pause is when there is no 'activity'
    pauses = df[df['activity'] == 'Nonproduction']
    average_pause_duration = pauses['action_time'].mean()
    frequency_of_pauses = pauses.shape[0]
    return frequency_of_pauses, average_pause_duration

In [9]:
# Group the data by essay id
grouped_data = data_features.groupby('id')

# Create a new DataFrame for the features
features_df = grouped_data.apply(lambda df: pd.Series({
    'Total Writing Time': total_writing_time(df),
    'Average Writing Speed': average_writing_speed(df),
    'Total Edits': total_edits(df),
    'Text Changes Frequency': text_changes_frequency(df),
    'Final Word Count': final_word_count(df),
    'Cursor Movement Frequency': cursor_movement_frequency(df),
    'Error Correction Rate': error_correction_rate(df),
    'Use of Special Characters': use_of_special_characters(df),
    'Number of Remove/Cut': count_remove_cut(df),
    'Number of Paste': count_paste(df),
    'Number of Replace': count_replace(df),
    'Average Sentence Length': average_sentence_length(df),
    'Keystroke Dynamics': keystroke_dynamics(df),
    # 'Frequency of Actions': frequency_of_actions(df),  # This should be a scalar value or a series that can be summarized in a single cell
    'Text Structure Changes': text_structure_changes(df),
    'Frequency of Pauses': frequency_of_pauses(df)[0],  # Assuming this function returns a tuple, take the first element
    'Average Pause Duration': frequency_of_pauses(df)[1]
}))

In [10]:
# Merge the features with the scores
merged_df = pd.merge(features_df, data_aggregated, on='id')

In [11]:
merged_df

Unnamed: 0,id,Total Writing Time,Average Writing Speed,Total Edits,Text Changes Frequency,Final Word Count,Cursor Movement Frequency,Error Correction Rate,Use of Special Characters,Number of Remove/Cut,...,Keystroke Dynamics,Text Structure Changes,Frequency of Pauses,Average Pause Duration,down_time_sum,up_time_sum,action_time_sum,cursor_position_max,word_count_max,score
0,001519c8,297243.0,0.006762,424.0,2437.0,255.0,3.0,417.0,616.0,417.0,...,116.246774,0.0,120.0,154.216667,2168798234,2169095477,297243,1539,256,3.5
1,0022f953,275391.0,0.007037,262.0,2200.0,320.0,0.0,260.0,756.0,260.0,...,112.221271,0.0,254.0,54.255906,1273271023,1273546414,275391,1676,323,3.5
2,0042269b,421201.0,0.008345,446.0,3961.0,404.0,0.0,439.0,879.0,439.0,...,101.837766,5.0,175.0,194.005714,3426641982,3427063183,421201,2291,404,6.0
3,0059420b,189596.0,0.006878,153.0,1457.0,206.0,0.0,151.0,410.0,151.0,...,121.848329,0.0,99.0,30.929293,1222211589,1222401185,189596,1047,206,2.0
4,0075873a,313702.0,0.006191,517.0,2459.0,252.0,0.0,517.0,567.0,517.0,...,123.943896,0.0,72.0,97.055556,1805499474,1805813176,313702,1402,252,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,499670.0,0.007181,962.0,4550.0,273.0,0.0,960.0,1133.0,960.0,...,105.437856,1.0,189.0,27.529101,3488386746,3488886416,499670,1634,461,3.5
2467,ffbef7e5,214221.0,0.011180,61.0,2456.0,438.0,0.0,60.0,684.0,60.0,...,82.266129,0.0,148.0,44.479730,2192480040,2192694261,214221,1877,438,4.0
2468,ffccd6fd,231580.0,0.012302,88.0,2937.0,201.0,0.0,88.0,2032.0,88.0,...,75.605615,0.0,126.0,81.206349,3764472937,3764704517,231580,2761,201,1.5
2469,ffec5b38,289439.0,0.010002,276.0,3171.0,413.0,0.0,276.0,649.0,276.0,...,89.277915,1.0,71.0,79.211268,1869073112,1869362551,289439,2133,413,5.0


In [12]:
df = merged_df

In [19]:
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Assuming 'df' is your DataFrame and 'score' is the target column

# Prepare feature matrix X and target vector y
X = df.drop(['id', 'score'], axis=1)

# Convert score to numeric labels if it's not already numeric
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['score'])

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=0.95) # Keep 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, stratify=y, random_state=42)

# Initialize base classifiers
base_classifiers = [
    ('lr', LogisticRegression(max_iter=1000)), # Increase max_iter if convergence issues occur
    ('svc', SVC(probability=True)),
    ('dt', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier())
]

# Initialize stacking classifier with a logistic regression as the final estimator
stacking_clf = StackingClassifier(estimators=base_classifiers, final_estimator=LogisticRegression(max_iter=1000))

# Fit the model
stacking_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = stacking_clf.predict(X_test)

# Evaluate the model with a classification report
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

# Evaluate the model with a confusion matrix
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# Stratified K-Fold for cross-validation
strat_k_fold = StratifiedKFold(n_splits=5)

# Cross-validation to check model reliability
cross_val_scores = cross_val_score(stacking_clf, X_pca, y, cv=strat_k_fold, scoring='accuracy')
print(f"Cross-validated accuracy: {np.mean(cross_val_scores)}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        19
           4       0.36      0.10      0.16        40
           5       0.31      0.39      0.34        67
           6       0.32      0.40      0.36        97
           7       0.33      0.44      0.38       100
           8       0.38      0.51      0.44        81
           9       0.21      0.08      0.12        36
          10       0.21      0.19      0.20        26
          11       0.00      0.00      0.00         7

    accuracy                           0.33       495
   macro avg       0.18      0.18      0.17       495
weighted avg       0.29      0.33      0.30       495

Confusion Matrix:
[[ 0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  1  0  1  3  0  1  1  0  0  0]
 [ 0  0  0  0 



Cross-validated accuracy: 0.3124132008342535
