In [1]:
import joblib
import pandas as pd
from quadratic_weighted_kappa import quadratic_weighted_kappa
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [2]:
# Rater-score table: tab-separated text, ISO-8859-1 encoded.
scores = pd.read_csv(
    '../training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Reference scores that every model's predictions are evaluated against below.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
y = joblib.load('score_asap7')

In [3]:
# Keep only the essay-set-7 rows and the three domain-1 rater columns, in a
# single selection.
# NOTE: this rebinds `scores` and drops the `essay_set` column, so the cell
# raises KeyError if it is run a second time without re-running the load cell.
scores = scores.loc[
    scores['essay_set'] == 7,
    ['rater1_domain1', 'rater2_domain1', 'rater3_domain1'],
]
len(scores)

1569

### Gradient Boosting : Quantitative Assessment of AES Model

In [5]:
# Load the stored model predictions that the next cell evaluates against `y`.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): the file name suggests gradient-boosting output on normalized
# float n-gram features -- confirm against the training notebook.
model_score = joblib.load('score_model_gb_normalized_float_ngram')

In [6]:
# ---- Agreement: quadratic weighted kappa ----
# Model vs. reference score.
# NOTE(review): model_score is passed in before the rounding step further down;
# how quadratic_weighted_kappa treats non-integer ratings is not visible in
# this notebook -- confirm.
qwk_model = quadratic_weighted_kappa(y, model_score)
print("QWK Score: ", qwk_model)

# Human baseline: first rater vs. second rater.
rater1_score, rater2_score = scores['rater1_domain1'], scores['rater2_domain1']
qwk_human = quadratic_weighted_kappa(rater1_score, rater2_score)
print("Human Agreement: ", qwk_human)

# Shortfall of the model relative to the human baseline.
print("Degradation: ", qwk_human - qwk_model)

# Plain-array copies of the two rater columns.
human1_score, human2_score = np.array(rater1_score), np.array(rater2_score)

# ---- Standardised mean difference ----
# |mean(model) - mean(reference)| divided by the root of the averaged
# (population) variances.
model_mean, y_mean = np.mean(model_score), np.mean(y)
mean_diff = abs(model_mean - y_mean)
model_variance, y_variance = np.var(model_score), np.var(y)
z = mean_diff / np.sqrt((model_variance + y_variance) / 2)
print("Z : ", z)

# ---- Linear association ----
corr, p_value = pearsonr(y, model_score)
print(f"Pearson correlation: {corr:.3f}")
print(f"P-value: {p_value:.3f}")

# ---- Accuracy on integer-rounded predictions ----
model_score_rounded = np.rint(model_score).astype(int)

# "Adjacent" counts a prediction as a hit when it is within 3 score points.
# NOTE(review): confirm that a tolerance of 3 is the intended definition.
errors = np.abs(y - model_score_rounded)
adjacent = (errors <= 3).sum()
adjacent_accuracy = adjacent / len(y)
print(f'Adjacent accuracy: {adjacent_accuracy:.2%}')

# Exact hits: rounded prediction equals the reference score.
exact_matches = (y == model_score_rounded).sum()
exact_accuracy = exact_matches / len(y)
print(f'Exact accuracy: {exact_accuracy:.2%}')

# ---- Error magnitude (on the unrounded predictions) ----
mse = mean_squared_error(y, model_score)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.3f}')

QWK Score:  0.7075644334445189
Human Agreement:  0.7214784742548883
Degradation:  0.013914040810369399
Z :  0.006302297994221158
Pearson correlation: 0.733
P-value: 0.000
Adjacent accuracy: 75.65%
Exact accuracy: 13.89%
RMSE: 3.129


### Random Forest : Quantitative Assessment of AES Model

In [7]:
# Load the stored model predictions that the next cell evaluates against `y`;
# this rebinds `model_score`, replacing whatever was loaded before.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): the file name suggests random-forest output on normalized
# float n-gram features -- confirm against the training notebook.
model_score = joblib.load('score_model_rf_normalized_float_ngram')

In [8]:
# ---- Agreement: quadratic weighted kappa ----
# Model vs. reference score.
# NOTE(review): model_score is passed in before the rounding step further down;
# how quadratic_weighted_kappa treats non-integer ratings is not visible in
# this notebook -- confirm.
qwk_model = quadratic_weighted_kappa(y, model_score)
print("QWK Score: ", qwk_model)

# Human baseline: first rater vs. second rater.
rater1_score, rater2_score = scores['rater1_domain1'], scores['rater2_domain1']
qwk_human = quadratic_weighted_kappa(rater1_score, rater2_score)
print("Human Agreement: ", qwk_human)

# Shortfall of the model relative to the human baseline.
print("Degradation: ", qwk_human - qwk_model)

# Plain-array copies of the two rater columns.
human1_score, human2_score = np.array(rater1_score), np.array(rater2_score)

# ---- Standardised mean difference ----
# |mean(model) - mean(reference)| divided by the root of the averaged
# (population) variances.
model_mean, y_mean = np.mean(model_score), np.mean(y)
mean_diff = abs(model_mean - y_mean)
model_variance, y_variance = np.var(model_score), np.var(y)
z = mean_diff / np.sqrt((model_variance + y_variance) / 2)
print("Z : ", z)

# ---- Linear association ----
corr, p_value = pearsonr(y, model_score)
print(f"Pearson correlation: {corr:.3f}")
print(f"P-value: {p_value:.3f}")

# ---- Accuracy on integer-rounded predictions ----
model_score_rounded = np.rint(model_score).astype(int)

# "Adjacent" counts a prediction as a hit when it is within 3 score points.
# NOTE(review): confirm that a tolerance of 3 is the intended definition.
errors = np.abs(y - model_score_rounded)
adjacent = (errors <= 3).sum()
adjacent_accuracy = adjacent / len(y)
print(f'Adjacent accuracy: {adjacent_accuracy:.2%}')

# Exact hits: rounded prediction equals the reference score.
exact_matches = (y == model_score_rounded).sum()
exact_accuracy = exact_matches / len(y)
print(f'Exact accuracy: {exact_accuracy:.2%}')

# ---- Error magnitude (on the unrounded predictions) ----
mse = mean_squared_error(y, model_score)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.3f}')

QWK Score:  0.6371129009157637
Human Agreement:  0.7214784742548883
Degradation:  0.08436557333912464
Z :  0.02334240053554678
Pearson correlation: 0.706
P-value: 0.000
Adjacent accuracy: 73.04%
Exact accuracy: 13.19%
RMSE: 3.258


### Ridge Regression : Quantitative Assessment of AES Model

In [9]:
# Load the stored model predictions that the next cell evaluates against `y`;
# this rebinds `model_score`, replacing whatever was loaded before.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): the file name suggests ridge-regression output on normalized
# float n-gram features -- confirm against the training notebook.
model_score = joblib.load('score_model_rr_normalized_float_ngram')

In [10]:
# ---- Agreement: quadratic weighted kappa ----
# Model vs. reference score.
# NOTE(review): model_score is passed in before the rounding step further down;
# how quadratic_weighted_kappa treats non-integer ratings is not visible in
# this notebook -- confirm.
qwk_model = quadratic_weighted_kappa(y, model_score)
print("QWK Score: ", qwk_model)

# Human baseline: first rater vs. second rater.
rater1_score, rater2_score = scores['rater1_domain1'], scores['rater2_domain1']
qwk_human = quadratic_weighted_kappa(rater1_score, rater2_score)
print("Human Agreement: ", qwk_human)

# Shortfall of the model relative to the human baseline.
print("Degradation: ", qwk_human - qwk_model)

# Plain-array copies of the two rater columns.
human1_score, human2_score = np.array(rater1_score), np.array(rater2_score)

# ---- Standardised mean difference ----
# |mean(model) - mean(reference)| divided by the root of the averaged
# (population) variances.
model_mean, y_mean = np.mean(model_score), np.mean(y)
mean_diff = abs(model_mean - y_mean)
model_variance, y_variance = np.var(model_score), np.var(y)
z = mean_diff / np.sqrt((model_variance + y_variance) / 2)
print("Z : ", z)

# ---- Linear association ----
corr, p_value = pearsonr(y, model_score)
print(f"Pearson correlation: {corr:.3f}")
print(f"P-value: {p_value:.3f}")

# ---- Accuracy on integer-rounded predictions ----
model_score_rounded = np.rint(model_score).astype(int)

# "Adjacent" counts a prediction as a hit when it is within 3 score points.
# NOTE(review): confirm that a tolerance of 3 is the intended definition.
errors = np.abs(y - model_score_rounded)
adjacent = (errors <= 3).sum()
adjacent_accuracy = adjacent / len(y)
print(f'Adjacent accuracy: {adjacent_accuracy:.2%}')

# Exact hits: rounded prediction equals the reference score.
exact_matches = (y == model_score_rounded).sum()
exact_accuracy = exact_matches / len(y)
print(f'Exact accuracy: {exact_accuracy:.2%}')

# ---- Error magnitude (on the unrounded predictions) ----
mse = mean_squared_error(y, model_score)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.3f}')

QWK Score:  0.6575605058513458
Human Agreement:  0.7214784742548883
Degradation:  0.06391796840354247
Z :  0.011892040653394859
Pearson correlation: 0.679
P-value: 0.000
Adjacent accuracy: 70.62%
Exact accuracy: 12.75%
RMSE: 3.419
