In [1]:
import joblib
import pandas as pd
from quadratic_weighted_kappa import quadratic_weighted_kappa
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [2]:
# Full training table (tab-separated, ISO-8859-1 encoded); the next cell narrows it
# to a single essay set and the per-rater score columns.
scores = pd.read_csv(
    '../training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Reference scores that each model's predictions are compared against further down.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so only
# load artefacts that come from a trusted source.
y = joblib.load('score_asap7')

In [3]:
# Keep only the rows of essay set 7 and the three per-rater domain-1 score columns.
#
# The selection is guarded so this cell is safe to re-run: after the first run
# `scores` no longer carries an 'essay_set' column, and filtering on it a second
# time would raise a KeyError. When the column is absent the frame is treated as
# already filtered and left untouched.
if 'essay_set' in scores.columns:
    scores = scores.loc[
        scores['essay_set'] == 7,
        ['rater1_domain1', 'rater2_domain1', 'rater3_domain1'],
    ]

# Number of essays retained (displayed as the cell output).
len(scores)

1569

### Gradient Boosting: Quantitative Assessment of AES Model

In [4]:
# Cached predictions to evaluate in the next cell (presumably from the
# gradient-boosting model, going by the file name and the section heading).
# joblib.load unpickles the file, so only load artefacts from a trusted source.
gb_predictions_file = 'score_model_gb_normalized_float_tfidf'
model_score = joblib.load(gb_predictions_file)

In [5]:
# Quantitative comparison of `model_score` against the reference scores `y`, using
# the rater-1 / rater-2 agreement as the human baseline. Prints nine statistics.

# Per-rater score columns; plain NumPy copies are kept alongside the Series.
rater1_score = scores['rater1_domain1']
rater2_score = scores['rater2_domain1']
human1_score = np.array(rater1_score)
human2_score = np.array(rater2_score)

# Quadratic weighted kappa: model vs. reference, then rater 1 vs. rater 2.
# NOTE(review): QWK receives the raw `model_score`; the rounded copy is only built
# further down for the accuracy metrics. Confirm the helper handles non-integer
# predictions as intended.
qwk_model = quadratic_weighted_kappa(y, model_score)
print("QWK Score: ", qwk_model)
qwk_human = quadratic_weighted_kappa(rater1_score, rater2_score)
print("Human Agreement: ", qwk_human)
print("Degradation: ", qwk_human - qwk_model)

# Standardised mean difference: |mean(model) - mean(y)| divided by the square root
# of the average of the two variances.
model_mean, y_mean = np.mean(model_score), np.mean(y)
mean_diff = abs(model_mean - y_mean)
model_variance, y_variance = np.var(model_score), np.var(y)
z = mean_diff / np.sqrt((model_variance + y_variance) / 2)
print("Z : ", z)

# Linear association between reference and predicted scores.
corr, p_value = pearsonr(y, model_score)
print(f"Pearson correlation: {corr:.3f}")
print(f"P-value: {p_value:.3f}")

# The accuracy metrics work on predictions rounded to the nearest integer.
model_score_rounded = np.rint(model_score).astype(int)
n_scored = len(y)

# "Adjacent" here means within 3 score points of the reference.
# NOTE(review): the tolerance is hard-coded; confirm 3 is the intended width.
errors = np.abs(y - model_score_rounded)
adjacent = np.sum(errors <= 3)
adjacent_accuracy = adjacent / n_scored
print(f'Adjacent accuracy: {adjacent_accuracy:.2%}')

# Share of essays where the rounded prediction equals the reference exactly.
exact_matches = np.sum(y == model_score_rounded)
exact_accuracy = exact_matches / n_scored
print(f'Exact accuracy: {exact_accuracy:.2%}')

# Root-mean-squared error, computed on the unrounded predictions.
mse = mean_squared_error(y, model_score)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.3f}')

QWK Score:  0.7031758641855228
Human Agreement:  0.7214784742548883
Degradation:  0.01830261006936551
Z :  0.0038715551792620874
Pearson correlation: 0.736
P-value: 0.000
Adjacent accuracy: 75.08%
Exact accuracy: 13.00%
RMSE: 3.106


### Random Forest: Quantitative Assessment of AES Model

In [6]:
# Cached predictions to evaluate in the next cell (presumably from the
# random-forest model, going by the file name and the section heading).
# joblib.load unpickles the file, so only load artefacts from a trusted source.
rf_predictions_file = 'score_model_rf_normalized_float_tfidf'
model_score = joblib.load(rf_predictions_file)

In [7]:
# Quantitative comparison of `model_score` against the reference scores `y`, using
# the rater-1 / rater-2 agreement as the human baseline. Prints nine statistics.

# Per-rater score columns; plain NumPy copies are kept alongside the Series.
rater1_score = scores['rater1_domain1']
rater2_score = scores['rater2_domain1']
human1_score = np.array(rater1_score)
human2_score = np.array(rater2_score)

# Quadratic weighted kappa: model vs. reference, then rater 1 vs. rater 2.
# NOTE(review): QWK receives the raw `model_score`; the rounded copy is only built
# further down for the accuracy metrics. Confirm the helper handles non-integer
# predictions as intended.
qwk_model = quadratic_weighted_kappa(y, model_score)
print("QWK Score: ", qwk_model)
qwk_human = quadratic_weighted_kappa(rater1_score, rater2_score)
print("Human Agreement: ", qwk_human)
print("Degradation: ", qwk_human - qwk_model)

# Standardised mean difference: |mean(model) - mean(y)| divided by the square root
# of the average of the two variances.
model_mean, y_mean = np.mean(model_score), np.mean(y)
mean_diff = abs(model_mean - y_mean)
model_variance, y_variance = np.var(model_score), np.var(y)
z = mean_diff / np.sqrt((model_variance + y_variance) / 2)
print("Z : ", z)

# Linear association between reference and predicted scores.
corr, p_value = pearsonr(y, model_score)
print(f"Pearson correlation: {corr:.3f}")
print(f"P-value: {p_value:.3f}")

# The accuracy metrics work on predictions rounded to the nearest integer.
model_score_rounded = np.rint(model_score).astype(int)
n_scored = len(y)

# "Adjacent" here means within 3 score points of the reference.
# NOTE(review): the tolerance is hard-coded; confirm 3 is the intended width.
errors = np.abs(y - model_score_rounded)
adjacent = np.sum(errors <= 3)
adjacent_accuracy = adjacent / n_scored
print(f'Adjacent accuracy: {adjacent_accuracy:.2%}')

# Share of essays where the rounded prediction equals the reference exactly.
exact_matches = np.sum(y == model_score_rounded)
exact_accuracy = exact_matches / n_scored
print(f'Exact accuracy: {exact_accuracy:.2%}')

# Root-mean-squared error, computed on the unrounded predictions.
mse = mean_squared_error(y, model_score)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.3f}')

QWK Score:  0.6017753749611858
Human Agreement:  0.7214784742548883
Degradation:  0.11970309929370249
Z :  0.0005954065465389005
Pearson correlation: 0.701
P-value: 0.000
Adjacent accuracy: 70.55%
Exact accuracy: 12.17%
RMSE: 3.315


### Ridge Regression: Quantitative Assessment of AES Model

In [8]:
# Cached predictions to evaluate in the next cell (presumably from the
# ridge-regression model, going by the file name and the section heading).
# joblib.load unpickles the file, so only load artefacts from a trusted source.
rr_predictions_file = 'score_model_rr_normalized_float_tfidf'
model_score = joblib.load(rr_predictions_file)

In [9]:
# Quantitative comparison of `model_score` against the reference scores `y`, using
# the rater-1 / rater-2 agreement as the human baseline. Prints nine statistics.

# Per-rater score columns; plain NumPy copies are kept alongside the Series.
rater1_score = scores['rater1_domain1']
rater2_score = scores['rater2_domain1']
human1_score = np.array(rater1_score)
human2_score = np.array(rater2_score)

# Quadratic weighted kappa: model vs. reference, then rater 1 vs. rater 2.
# NOTE(review): QWK receives the raw `model_score`; the rounded copy is only built
# further down for the accuracy metrics. Confirm the helper handles non-integer
# predictions as intended.
qwk_model = quadratic_weighted_kappa(y, model_score)
print("QWK Score: ", qwk_model)
qwk_human = quadratic_weighted_kappa(rater1_score, rater2_score)
print("Human Agreement: ", qwk_human)
print("Degradation: ", qwk_human - qwk_model)

# Standardised mean difference: |mean(model) - mean(y)| divided by the square root
# of the average of the two variances.
model_mean, y_mean = np.mean(model_score), np.mean(y)
mean_diff = abs(model_mean - y_mean)
model_variance, y_variance = np.var(model_score), np.var(y)
z = mean_diff / np.sqrt((model_variance + y_variance) / 2)
print("Z : ", z)

# Linear association between reference and predicted scores.
corr, p_value = pearsonr(y, model_score)
print(f"Pearson correlation: {corr:.3f}")
print(f"P-value: {p_value:.3f}")

# The accuracy metrics work on predictions rounded to the nearest integer.
model_score_rounded = np.rint(model_score).astype(int)
n_scored = len(y)

# "Adjacent" here means within 3 score points of the reference.
# NOTE(review): the tolerance is hard-coded; confirm 3 is the intended width.
errors = np.abs(y - model_score_rounded)
adjacent = np.sum(errors <= 3)
adjacent_accuracy = adjacent / n_scored
print(f'Adjacent accuracy: {adjacent_accuracy:.2%}')

# Share of essays where the rounded prediction equals the reference exactly.
exact_matches = np.sum(y == model_score_rounded)
exact_accuracy = exact_matches / n_scored
print(f'Exact accuracy: {exact_accuracy:.2%}')

# Root-mean-squared error, computed on the unrounded predictions.
mse = mean_squared_error(y, model_score)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.3f}')

QWK Score:  0.684814317483551
Human Agreement:  0.7214784742548883
Degradation:  0.036664156771337275
Z :  0.0023977072063504417
Pearson correlation: 0.730
P-value: 0.000
Adjacent accuracy: 73.49%
Exact accuracy: 12.49%
RMSE: 3.132
