## Import libraries

In [13]:
%run bert
%run utils.ipynb
%run feat

import gc

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import sklearn as sk
import torch
import xgboost as xgb
from sklearn import linear_model
from sklearn.metrics import cohen_kappa_score,mean_absolute_error,mean_squared_error,accuracy_score,explained_variance_score

## Load Data

In [20]:
# DataLoader is not imported in this notebook; presumably it is defined by one of
# the files pulled in with %run in the first cell — TODO confirm which one.
loader = DataLoader()
# `d` is the frame every later cell reads: the essay text comes from d['full_text']
# and the six score columns are sliced from it by position (.iloc).
d = loader.GetData('L2Writing')
# Alternative loader call, left disabled.
#d = loader.GetShuffled()
d.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


## Model Inference

In [21]:
# Build the embedding extractor over the essay texts, pointing it at the local
# 'model/deberta-v3-large' path.
# NOTE(review): later cells slice targets and features at 500 rows (iloc[350:500],
# x[350:500], range(500)), while stop=1500 is requested here. Presumably `stop`
# caps the number of essays processed — confirm in bert, and consider lowering it
# so the inference run does not have to be interrupted by hand.
EmbeddingFetcher = GetBERTEmbeddings(d['full_text'],'model/deberta-v3-large')
EmbeddingFetcher.inf(stop=1500,SeqLen = 512 )

tokenized
0/1500, run:0
10/1500, run:0
20/1500, run:0
30/1500, run:0
40/1500, run:0
50/1500, run:0
60/1500, run:0
70/1500, run:0
80/1500, run:0
90/1500, run:0
100/1500, run:0
110/1500, run:0


KeyboardInterrupt: 

### Clear cuda cache

In [6]:
# Drop the fetcher's `model` attribute so the memory it holds can be reclaimed.
# The hasattr guard keeps this cell re-runnable: a bare `del` raises
# AttributeError the second time the cell is executed.
if hasattr(EmbeddingFetcher, 'model'):
    del EmbeddingFetcher.model
# Collect lingering reference cycles first, so tensors they keep alive are really
# released before the CUDA caching allocator is asked to hand back unused blocks.
gc.collect()
torch.cuda.empty_cache()

## Extract Embeddings

In [7]:
# Pull the embeddings collected by the inference cell out of the fetcher.
# 'CLS' presumably selects the [CLS]-token representation — TODO confirm in bert.
x = EmbeddingFetcher.GetEmbeddings('CLS')

## Split Train and Test data

In [8]:
# Flatten every embedding into a plain 1024-element vector.
x = [np.array(i).reshape(1024) for i in x]
# The six rubric scores that are predicted independently.
columns = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
# Rows 0-349 train, rows 350-499 test. The upper bound is explicit because the
# targets are sliced with iloc[350:500]; an open-ended x[350:] would give TestX
# more rows than TestY whenever more than 500 embeddings were extracted.
TrainX,TestX = x[:350],x[350:500]

In [9]:
# One target Series per rubric column: rows 0-349 for training, rows 350-499 for
# testing. Both lists follow the order of `columns`.
TrainY = [d[name].iloc[:350] for name in columns]
TestY = [d[name].iloc[350:500] for name in columns]

## Hyperparameter tuning

In [None]:
def objective(trial):
    """Optuna objective: mean column-wise RMSE (MCRMSE) of XGBoost over all targets.

    Samples learning_rate, max_depth and alpha from `trial`, fits one
    XGBRegressor per target on the global TrainX/TrainY and scores it on the
    global TestX/TestY.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        float: the RMSE averaged over all targets (lower is better).

    NOTE(review): the split scored here (TestX/TestY) is the same one the
    un-tuned baseline reports on, so tuned scores are optimistic; a separate
    validation split would avoid that.
    """
    params = {"random_state":trial.suggest_categorical("random_state", [42]),
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 1),
        "max_depth" : trial.suggest_int("max_depth", 5, 15),
        "alpha" : trial.suggest_float('alpha',0.9,1),
    }
    error = []
    # Iterate over the targets themselves instead of a hard-coded range(6).
    for train_target, test_target in zip(TrainY, TestY):
        model = xgb.XGBRegressor(**params)
        model.fit(TrainX, train_target)
        PredY = model.predict(TestX)
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): that
        # keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
        error.append(np.sqrt(mean_squared_error(test_target, PredY)))
    mcrmse = float(sum(error) / len(error))
    return mcrmse
# Search for the hyperparameters with the lowest objective value over 100 trials.
# "minimize" is Optuna's default direction; it is spelled out for the reader.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

In [None]:
# Plot the optimisation history of `study` using Optuna's matplotlib backend.
# NOTE(review): this import sits outside the notebook's import cell.
from optuna.visualization.matplotlib import plot_optimization_history
plot_optimization_history(study)

## No tuning

In [10]:
def objective():
    """Fit one fixed-parameter XGBRegressor per target and report the RMSEs.

    Reads the global TrainX/TestX and TrainY/TestY, so the result depends on
    whichever feature split was assigned most recently.

    NOTE: this shares its name with the Optuna objective(trial) defined in the
    tuning section; whichever definition was executed last is the one in effect.

    Returns:
        tuple: (mcrmse, error), where `error` is the list of per-target RMSEs
        in the order of TrainY/TestY and `mcrmse` is their mean.
    """
    params = {
        'learning_rate' : 0.042,
    }
    error = []
    # Iterate over the targets themselves instead of a hard-coded range(6).
    for train_target, test_target in zip(TrainY, TestY):
        model = xgb.XGBRegressor(**params)
        model.fit(TrainX, train_target)
        PredY = model.predict(TestX)
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): that
        # keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
        error.append(np.sqrt(mean_squared_error(test_target, PredY)))
    mcrmse = sum(error)/len(error)
    return mcrmse,error
objective()

(0.5244083139471178,
 [0.5525675913412039,
  0.4858335621054145,
  0.5061131831871145,
  0.5064351456795005,
  0.546654733279391,
  0.5488456680900828])

## Features only

In [11]:
# Ask spaCy to use the GPU when one is available.
# `spacy` is not imported in this notebook; presumably it enters the namespace
# through one of the %run'd files — TODO confirm.
spacy.prefer_gpu()

True

In [14]:
# NOTE: from here on `x` holds the FeatureExtraction vectors and no longer the
# embeddings assigned to it earlier in the notebook.
x = []
# Iterate over the essay values directly. d['full_text'][essay] is a label
# lookup, so it raises KeyError or pairs the wrong essay with a row whenever the
# frame's index is not 0..n-1 (for example after shuffling), while the targets
# are always sliced by position with .iloc.
for essay, text in enumerate(d['full_text']):
    ext = FeatureExtraction(text)
    # One vector per essay: the GetPos() output followed by the GetVar() output.
    features = ext.GetPos() + ext.GetVar()
    x.append(features)
    # Lightweight progress indicator, printed every 10 essays.
    if essay % 10 == 0:
        print(essay)
x[0]

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


[0.2019704433497537,
 0.3054187192118227,
 0.06403940886699508,
 0.03940886699507389,
 0.7380952380952381,
 0.10344827586206896,
 1.9877674693472376,
 2.0654609733079115,
 1.3728129459672884,
 1.5]

In [17]:
# Display the first entry of x again as a quick sanity check.
x[0]

[0.20689655172413793,
 0.2857142857142857,
 0.06896551724137931,
 0.06403940886699508,
 0.7696969696969697,
 2.0730699572419278,
 1.9498010508590446,
 1.5118578920369088,
 0.9805806756909202]

In [15]:
# Rows 0-349 of x for training, rows 350-499 for testing.
TrainX = x[:350]
TestX = x[350:500]

In [17]:
# Fetch the CLS embeddings again under their own name (x1), because `x` has been
# reused for the FeatureExtraction vectors.
x1 = EmbeddingFetcher.GetEmbeddings('CLS')
x1 = [np.array(i).reshape(1024) for i in x1]
# TrainX/TestX are deliberately NOT reassigned here. Doing so replaced the
# feature split with embeddings right before the "Features only" objective()
# call, so a top-to-bottom run evaluated embeddings under that header. The
# combined split is assigned in the "Features+embeddings" section.

In [16]:
# Score the baseline on whatever TrainX/TestX currently hold; objective() reads
# them as globals, so the result reflects the most recently assigned split.
objective()

(0.666086473833794,
 [0.6626710674956092,
  0.6019208355218901,
  0.5947586377806194,
  0.6665634204043137,
  0.7733610375611794,
  0.6972438442391526])

## Features+embeddings

In [18]:
# For each of the first 500 essays, join its entry from x and its entry from x1
# into a single vector, then split 350 / 150 as before.
x2 = [np.concatenate((x[idx], x1[idx])) for idx in range(500)]
TrainX = x2[:350]
TestX = x2[350:500]

In [19]:
# Score the baseline on the current TrainX/TestX, which objective() reads as
# globals.
objective()

(0.5211581437726029,
 [0.5480708390067163,
  0.48195563641690475,
  0.5075835304700707,
  0.4956495660123182,
  0.5401011990802276,
  0.55358809164938])