In [1]:
import xgboost
import joblib
import numpy as np
from quadratic_weighted_kappa import quadratic_weighted_kappa
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split

In [17]:
x = joblib.load('essay_ease10_sbert768_simbow_langerr_780_normalized_asap1')
x_gib = joblib.load('essay_asap1_780_with200gibberish')
y = joblib.load("score_asap1")
y_gib = joblib.load('score_asap1_with200gibberish')
gib = joblib.load('gibberish_200text_780')

In [19]:
x_gib = x_gib[:1883]
y_gib = y_gib[:1883]

In [20]:
print(x.shape)
print(x_gib.shape)
print(gib.shape)

(1783, 780)
(1883, 780)
(200, 780)


In [4]:
def get_feature_names_extended():
    ease_feats = ['Answer Length', 'Word Counts', 'Average Word Length', 'Good n-gram', 'Prompt Overlap',
              'Prompt Overlap (synonyms)', 'Punctuation Counts', 'Spelling Error', 'Unique Words', 'Prompt Similarity SBert']

    sbert_feats = []
    sbert_dim = 768

    for i in range(0, sbert_dim):
    	fname = "sbert_" + str(i) 
    	sbert_feats.append(fname)
    
    prompt_similarity_bow = ["Prompt Similarity BOW"]
    lang_error = ["Language Error"]
    
    feature_names = ease_feats + prompt_similarity_bow + lang_error + sbert_feats 

    print("len feature names: ", len(feature_names))
    
    return feature_names

In [5]:
feature_names = get_feature_names_extended()

len feature names:  780


### create model

In [73]:
indices = np.arange(2000)
X_train, X_test, Y_train, Y_test, idx1, idx2 = train_test_split(x_gib, y_gib, indices, test_size=0.2, random_state=42)
X_train.shape

(1600, 780)

In [74]:
d_train = xgboost.DMatrix(X_train, label=Y_train, feature_names=feature_names)
d_test = xgboost.DMatrix(X_test, label=Y_test, feature_names=feature_names)

In [75]:
model = xgboost.train({"learning_rate": 0.05, "max_depth":3}, d_train, 200, evals = [(d_test, "test")], early_stopping_rounds=20)

[0]	test-rmse:2.13962
Will train until test-rmse hasn't improved in 20 rounds.
[1]	test-rmse:2.03985
[2]	test-rmse:1.94737
[3]	test-rmse:1.85841
[4]	test-rmse:1.77496
[5]	test-rmse:1.69600
[6]	test-rmse:1.62066
[7]	test-rmse:1.54949
[8]	test-rmse:1.48256
[9]	test-rmse:1.41866
[10]	test-rmse:1.35860
[11]	test-rmse:1.30172
[12]	test-rmse:1.24795
[13]	test-rmse:1.19829
[14]	test-rmse:1.15120
[15]	test-rmse:1.10686
[16]	test-rmse:1.06420
[17]	test-rmse:1.02468
[18]	test-rmse:0.98770
[19]	test-rmse:0.95350
[20]	test-rmse:0.92035
[21]	test-rmse:0.88962
[22]	test-rmse:0.86178
[23]	test-rmse:0.83444
[24]	test-rmse:0.80985
[25]	test-rmse:0.78634
[26]	test-rmse:0.76407
[27]	test-rmse:0.74371
[28]	test-rmse:0.72492
[29]	test-rmse:0.70738
[30]	test-rmse:0.69159
[31]	test-rmse:0.67605
[32]	test-rmse:0.66101
[33]	test-rmse:0.64775
[34]	test-rmse:0.63582
[35]	test-rmse:0.62449
[36]	test-rmse:0.61338
[37]	test-rmse:0.60433
[38]	test-rmse:0.59550
[39]	test-rmse:0.58647
[40]	test-rmse:0.57874
[41]	test-

In [76]:
yxgb_pred = model.predict(d_test)
yxgb_pred = np.round(yxgb_pred)
print("accuracy: ", accuracy_score(yxgb_pred, Y_test))
print("qwk: ", quadratic_weighted_kappa(yxgb_pred, Y_test))

accuracy:  0.6975
qwk:  0.8908181594054873


In [78]:
joblib.dump(model, "model_asap6_extended_780_gibberish")

['model_asap6_extended_780_gibberish_1']

### create model 5-fold

In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
#kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#kf.get_n_splits(x, y)
print(kf)

KFold(n_splits=5, random_state=42, shuffle=True)


In [22]:
model2 = xgboost.XGBRegressor(objective ='reg:squarederror',
                colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)

### training using original + gibberish data (1983 essays)

In [23]:
qwk_scores = []
qwk_scores_ori = []

acc_scores = []
acc_scores_ori = []
acc_scores_gib = []


test_indices = []
test_indices_ori = []
test_indices_gib = []

pred_labels = []
pred_labels_ori = []
pred_labels_gib = []

counter = 1

for train_index, test_index in kf.split(x_gib, y_gib):
    
    print()
    print("Loop -", counter)
    print("========")
    counter = counter + 1
    
    X_train, X_test, Y_train, Y_test = x_gib[train_index], x_gib[test_index], y_gib[train_index], y_gib[test_index]
         
    model2.fit(X_train, Y_train)
    
    
    # PREDICT AND EVALUATE ALL ESSAYS
    predict = model2.predict(X_test)
    predict = np.round(predict)
    
    pred_labels.extend(predict)
    test_indices.extend(test_index)
    
    result_qwk = quadratic_weighted_kappa(Y_test, predict)
    print("Qwk : ", result_qwk)
    qwk_scores.append(result_qwk)
    
    result_acc = accuracy_score(Y_test, predict)
    print("Acc : ", result_acc)
    acc_scores.append(result_acc)
    
    print("len all : ", len(test_index))

    # PREDICT AND EVALUATE ONLY ORIGINAL ESSAY
    test_index_ori = [a for a in test_index if a < 1783]
    x_test_ori = x_gib[test_index_ori]
    y_test_ori = y_gib[test_index_ori]
    predict_ori = model2.predict(x_test_ori)
    predict_ori = np.round(predict_ori)
    pred_labels_ori.extend(predict_ori)
    
    result_qwk_ori = quadratic_weighted_kappa(y_test_ori, predict_ori)
    print("Qwk original : ", result_qwk_ori)
    qwk_scores_ori.append(result_qwk_ori)
    
    result_acc_ori = accuracy_score(y_test_ori, predict_ori)
    print("Acc original : ", result_acc_ori)
    acc_scores_ori.append(result_acc_ori)
    
    print("len ori : ", len(test_index_ori))
    
    # PREDICT AND EVALUATE ONLY GIBBERISH ESSAY
    test_index_gib = [a for a in test_index if a > 1782]
    x_test_gib = x_gib[test_index_gib]
    y_test_gib = y_gib[test_index_gib]
    predict_gib = model2.predict(x_test_gib)
    predict_gib = np.round(predict_gib)
    pred_labels_gib.extend(predict_gib)
    
    result_acc_gib = accuracy_score(y_test_gib, predict_gib)
    print("Acc gibberish : ", result_acc_gib)
    acc_scores_gib.append(result_acc_gib)
    
    print("len gib : ", len(test_index_gib))

print("\nMean QWK : ", np.mean(qwk_scores))
print("\nMean QWK Original : ", np.mean(qwk_scores_ori))

print("\nMean Accuracy : ", np.mean(acc_scores))
print("\nMean Accuracy Original : ", np.mean(acc_scores_ori))
print("\nMean Accuracy Gibberish : ", np.mean(acc_scores_gib))


Loop - 1
Qwk :  0.9332221766140263
Acc :  0.5145888594164456
len all :  377
Qwk original :  0.7685594603303295
Acc original :  0.4871060171919771
len ori :  349
Acc gibberish :  0.8571428571428571
len gib :  28

Loop - 2
Qwk :  0.9234258282062929
Acc :  0.5358090185676393
len all :  377
Qwk original :  0.7790504963314631
Acc original :  0.5167597765363129
len ori :  358
Acc gibberish :  0.8947368421052632
len gib :  19

Loop - 3
Qwk :  0.9110625482058251
Acc :  0.5039787798408488
len all :  377
Qwk original :  0.7826257833288863
Acc original :  0.481994459833795
len ori :  361
Acc gibberish :  1.0
len gib :  16

Loop - 4
Qwk :  0.9298745939412254
Acc :  0.5478723404255319
len all :  376
Qwk original :  0.7918463347706781
Acc original :  0.5225988700564972
len ori :  354
Acc gibberish :  0.9545454545454546
len gib :  22

Loop - 5
Qwk :  0.9043023578029181
Acc :  0.4946808510638298
len all :  376
Qwk original :  0.7620191508299019
Acc original :  0.47645429362880887
len ori :  361
Acc g

### training using original data (1783 essays)

In [12]:
qwk_scores = []

acc_scores = []
acc_scores_gib = []

test_indices = []
test_indices_gib = []

pred_labels = []
pred_labels_gib = []

counter = 1

for train_index, test_index in kf.split(x, y):
    
    print()
    print("Loop -", counter)
    print("========")
    counter = counter + 1
    
    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
         
    model2.fit(X_train, Y_train)    
    
    # PREDICT AND EVALUATE ORIGINAL ESSAYS
    predict = model2.predict(X_test)
    predict = np.round(predict)
    
    pred_labels.extend(predict)
    test_indices.extend(test_index)
    
    result_qwk = quadratic_weighted_kappa(Y_test, predict)
    print("Qwk : ", result_qwk)
    qwk_scores.append(result_qwk)
    
    result_acc = accuracy_score(Y_test, predict)
    print("Acc : ", result_acc)
    acc_scores.append(result_acc)
    
    print("len all : ", len(test_index))
    
    # PREDICT AND EVALUATE ONLY GIBBERISH ESSAY
    x_test_gib = gib
    y_test_gib = np.zeros(200)
    predict_gib = model2.predict(x_test_gib)
    predict_gib = np.round(predict_gib)
    pred_labels_gib.extend(predict_gib)
    
    result_acc_gib = accuracy_score(y_test_gib, predict_gib)
    print("Acc gibberish : ", result_acc_gib)
    acc_scores_gib.append(result_acc_gib)
    
    print("len gib : ", len(x_test_gib))

print("\nMean QWK : ", np.mean(qwk_scores))

print("\nMean Accuracy : ", np.mean(acc_scores))
print("\nMean Accuracy Gibberish : ", np.mean(acc_scores_gib))


Loop - 1
Qwk :  0.8082407525645937
Acc :  0.484593837535014
len all :  357
Acc gibberish :  0.0
len gib :  200

Loop - 2
Qwk :  0.7596110180240844
Acc :  0.4677871148459384
len all :  357
Acc gibberish :  0.0
len gib :  200

Loop - 3
Qwk :  0.7823694002942279
Acc :  0.48179271708683474
len all :  357
Acc gibberish :  0.0
len gib :  200

Loop - 4
Qwk :  0.7724125555173393
Acc :  0.5365168539325843
len all :  356
Acc gibberish :  0.0
len gib :  200

Loop - 5
Qwk :  0.7906373334375365
Acc :  0.4859550561797753
len all :  356
Acc gibberish :  0.0
len gib :  200

Mean QWK :  0.7826542119675564

Mean Accuracy :  0.4913291159160293

Mean Accuracy Gibberish :  0.0


In [13]:
print(len(pred_labels_gib))

1000


In [15]:
from collections import Counter
Counter(pred_labels_gib)

Counter({4.0: 411,
         5.0: 319,
         6.0: 91,
         3.0: 132,
         8.0: 12,
         7.0: 25,
         9.0: 8,
         2.0: 2})

In [16]:
print(len(pred_labels))
print(len(test_indices))

1783
1783


In [54]:
a = dict(zip(test_indices, pred_labels))
len(a)

2000

In [55]:
b = { key: a[key] for key in a.keys() if key > 1799 }
len(b)

200

In [47]:
# Training with 100 gibberish
Counter(b.values())

Counter({0.0: 96, 1.0: 4})

In [56]:
# Training with 200 gibberish
Counter(b.values())

Counter({-0.0: 196, 1.0: 4})

In [10]:
# with 100 other gibberish (by 100 training)
x_gib.shape

(1900, 780)

In [17]:
#x_gib_100 = x_gib[1900:2000]
x_gib_100 = x_gib[1800:1900]
x_gib_100.shape

(100, 780)

In [18]:
d_gib_100 = xgboost.DMatrix(x_gib_100, feature_names=feature_names)
pred_gib_100 = model.predict(d_gib_100)
pred_gib_100 = np.round(pred_gib_100)

In [19]:
from collections import Counter
Counter(pred_gib_100)

Counter({-0.0: 98, 1.0: 2})

In [14]:
pred_labels_int = list(map(int, pred_labels))
print(pred_labels)
print(pred_labels_int)

[4.0, 2.0, 3.0, 2.0, 3.0, 3.0, 2.0, 2.0, 2.0, 1.0, 4.0, 3.0, 1.0, 3.0, 2.0, 2.0, 3.0, 3.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 2.0, 4.0, 3.0, 3.0, 1.0, 3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 2.0, 3.0, 1.0, 3.0, 1.0, 3.0, 3.0, 2.0, 3.0, 3.0, 4.0, 2.0, 2.0, 4.0, 2.0, 4.0, 2.0, 3.0, 3.0, 4.0, 2.0, 3.0, 2.0, 3.0, 2.0, 4.0, 2.0, 4.0, 3.0, 4.0, 3.0, 3.0, 4.0, 3.0, 2.0, 2.0, 4.0, 4.0, 3.0, 3.0, 2.0, 3.0, 3.0, 0.0, 3.0, 2.0, 2.0, 3.0, 4.0, 2.0, 2.0, 3.0, 3.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0, 4.0, 3.0, 2.0, 3.0, 3.0, 1.0, 3.0, 3.0, 3.0, 3.0, 4.0, 2.0, 2.0, 4.0, 1.0, 3.0, 2.0, 2.0, 4.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 3.0, 2.0, 3.0, 4.0, 1.0, 2.0, 2.0, 4.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 0.0, 2.0,

In [15]:
pred_labels_par_int = list(map(int, pred_labels_par))
print(pred_labels_par)
print(pred_labels_par_int)

[3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0, 1.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 1.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 2.0, 4.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 4.0, 1.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 2.0, 3.0, 1.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 3.0, 4.0, 3.0, 2.0, 4.0, 2.0, 3.0, 2.0, 3.0, 3.0, 4.0, 2.0, 3.0, 2.0, 3.0, 3.0, 4.0, 3.0, 3.0, 3.0, 4.0, 3.0, 3.0, 4.0, 3.0, 2.0, 2.0, 3.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 0.0, 3.0, 2.0, 2.0, 3.0, 4.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 4.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 4.0, 1.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0, 2.0, 2.0, 4.0, 3.0, 2.0, 3.0, 3.0, 3.0, 4.0, 1.0, 2.0, 3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0, 2.0,

In [16]:
new_score = np.zeros(1800)
new_score_par = np.zeros(1800)
new_score

array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
new_score[test_indices] = pred_labels_int
new_score_par[test_indices] = pred_labels_par_int

In [18]:
new_score

array([3., 4., 4., ..., 3., 2., 2.])

In [19]:
new_score_par

array([3., 3., 3., ..., 2., 2., 2.])

In [22]:
new_score[60]

4.0

In [17]:
joblib.dump(new_score, 'model_score_normalized')

['model_score_normalized']

In [23]:
joblib.dump(new_score, 'model_score_paraphrase_normalized')

['model_score_paraphrase_normalized']

## Model performance to predict paraphrased essays

In [25]:
# Accuracy
accuracy_score(new_score, new_score_par)

0.7838888888888889

In [26]:
# QWK
quadratic_weighted_kappa(new_score, new_score_par)

0.8459892540002762

## Model performance comparison between original and gibberish-trained model

In [11]:
model_gib = model
model_ori = joblib.load('model_asap6_extended_780_normalized')
data_gib = joblib.load('gibberish_200text_780')

In [12]:
d_data_gib = xgboost.DMatrix(data_gib, feature_names=feature_names)

In [14]:
pred_gib = model_gib.predict(d_data_gib)
pred_ori = model_ori.predict(d_data_gib)

In [15]:
pred_gib = np.round(pred_gib)
pred_ori = np.round(pred_ori)

In [16]:
from collections import Counter
print("Model Gibberish : ", Counter(pred_gib))
print("Model Original : ", Counter(pred_ori))

Model Gibberish :  Counter({-0.0: 194, 1.0: 6})
Model Original :  Counter({1.0: 115, 0.0: 83, 2.0: 2})


## Model performance 

In [42]:
d_data = xgboost.DMatrix(x, feature_names=feature_names)

In [43]:
pred1 = model_ori.predict(d_data)
pred2 = model_gib.predict(d_data)

In [44]:
pred1 = np.round(pred1)
pred2 = np.round(pred2)

In [45]:
quadratic_weighted_kappa(pred1, y)

0.9205878354159714

In [47]:
len(pred1)

1800

In [63]:
indices = np.arange(1800)
X_train, X_test, Y_train, Y_test, idx1, idx2 = train_test_split(x, y, indices, test_size=0.2, random_state=42)
X_train.shape

(1440, 780)

In [54]:
d_test = xgboost.DMatrix(X_test, feature_names=feature_names)
pred1 = model_ori.predict(d_test)
pred1 = np.round(pred1)

In [55]:
quadratic_weighted_kappa(pred1, Y_test)

0.8244318181818182

In [56]:
pred2 = model_gib.predict(d_test)
pred2 = np.round(pred2)

In [57]:
quadratic_weighted_kappa(pred2, Y_test)

0.799757281553398

In [64]:
indices = np.arange(2000)
X_train, X_test, Y_train, Y_test, idx1, idx2_gib = train_test_split(x_gib, y_gib, indices, test_size=0.2, random_state=42)
X_train.shape

(1600, 780)

In [66]:
len(idx2_gib)

400

In [91]:
print(len(test_indices))
fold1 = test_indices[0:400]
fold2 = test_indices[400:800]
fold3 = test_indices[800:1200]
fold4 = test_indices[1200:1600]
fold5 = test_indices[1600:]

2000


In [99]:
filter1 = [a for a in fold1 if a < 1800]
print(len(filter1))
filter2 = [a for a in fold2 if a < 1800]
print(len(filter2))
filter3 = [a for a in fold3 if a < 1800]
print(len(filter3))
filter4 = [a for a in fold4 if a < 1800]
print(len(filter4))
filter5 = [a for a in fold5 if a < 1800]
print(len(filter5))

358
360
361
363
358


In [104]:
print(filter1)

[23, 29, 30, 32, 44, 45, 49, 56, 59, 63, 65, 67, 69, 70, 73, 76, 78, 99, 100, 109, 111, 115, 120, 123, 124, 128, 135, 162, 163, 168, 173, 175, 185, 188, 194, 196, 203, 210, 211, 212, 218, 220, 231, 233, 237, 239, 247, 251, 254, 256, 261, 266, 270, 275, 281, 289, 297, 298, 300, 303, 305, 306, 307, 316, 322, 324, 331, 342, 344, 350, 351, 352, 353, 354, 361, 366, 367, 368, 374, 382, 383, 393, 394, 411, 414, 416, 422, 427, 429, 432, 433, 438, 450, 453, 462, 464, 471, 478, 479, 480, 482, 485, 494, 495, 507, 514, 519, 526, 527, 529, 530, 534, 535, 538, 543, 544, 552, 554, 555, 570, 572, 579, 581, 582, 583, 584, 585, 591, 599, 602, 607, 610, 611, 613, 617, 618, 620, 628, 630, 637, 651, 654, 670, 674, 678, 679, 693, 701, 712, 720, 730, 743, 744, 746, 755, 757, 759, 764, 771, 780, 782, 785, 787, 788, 792, 802, 807, 808, 824, 829, 832, 834, 855, 857, 862, 874, 879, 886, 887, 888, 889, 905, 906, 907, 909, 916, 930, 937, 938, 944, 949, 964, 965, 973, 974, 978, 987, 990, 993, 1004, 1027, 1033, 1036