# This file computes syntactic complexity measures and bert embeddings for all essays in the ELL dataset, and builds a regression model to score the essays.
## Import library

In [None]:
%run functions

## Load Data

In [None]:
loader = DataLoader()
d = loader.GetData('L2Writing')
d.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


## Extract bert embeddings

25m19.4s

In [None]:
xc = []
xp = []
for i in range(0,4000,1000):
    if i == 3000:
        end = len(d)
    else:
        end = i+1000
    EmbeddingFetcher = GetBERTEmbeddings(d['full_text'][i:end],'model/deberta-v3-large')
    EmbeddingFetcher.inf(stop=1000,SeqLen = 512)
    xp.append(EmbeddingFetcher.GetEmbeddings('MeanP'))
    xc.append(EmbeddingFetcher.GetEmbeddings('CLS')) 
    del EmbeddingFetcher
    torch.cuda.empty_cache()

### Save to disk

In [None]:
torch.save(xp,'features/kaggle-meanp.pt')
torch.save(xc,'features/kaggle-cls.pt')

## Fine-grained feature

In [None]:
for i in range(len(d)):
    t = d['full_text'][i]
    extractor = FeatureExtraction(t)
    extractor.process()
    features = extractor.get_data()
    with open('features/kaggle-fine.csv','a',encoding='utf-8') as f:
        f.write('{},'.format(i))
        f.write(','.join([str(i) for i in features]))
        f.write('\n')
    print(i)

## Regression to predict scores
### Split Train and Test data

In [27]:
ind = [i for i in range(3911)]
trainIn,testIn = ind[:3100],ind[3100:]

In [36]:
xec= [np.array(i).reshape(1024) for i in xc] # cls embeddings
xep= [np.array(i).reshape(1024) for i in xp] # mean pooling embeddings
columns = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
TrainY,TestY = [],[]
for i in columns:
    TrainY.append(d[i].iloc[trainIn])
    TestY.append(d[i].iloc[testIn])

### Regression function

In [34]:
def regression(TrainX,TestX):
    error = []
    for i in range(6):
        model = xgb.XGBRegressor(tree_method="hist",learning_rate=0.042)
        model.fit(TrainX,TrainY[i])
        PredY = model.predict(TestX)
        e = mean_squared_error(TestY[i],PredY,squared=False)
        error.append(e)
    mcrmse = sum(error)/6
    return mcrmse,error


### Regression with CLS token embeddings

In [43]:
TrainX,TestX = [xec[i] for i in trainIn],[xec[i] for i in testIn]
regression(TrainX,TestX)

(0.4856760066137961,
 [0.5235910118697392,
  0.47968087090756906,
  0.43411403061999365,
  0.4794984069311703,
  0.5179769664669042,
  0.4791947528874001])

### Regression with Mean pooling embeddings

In [44]:
TrainX,TestX = [xep[i] for i in trainIn],[xep[i] for i in testIn]
regression(TrainX,TestX)

(0.4652828341404194,
 [0.503632963189457,
  0.4673676553525596,
  0.41686302356275945,
  0.45842822346324913,
  0.4884066773503887,
  0.45699846192410243])

### Regression with Complexity measures

In [46]:
xf = []
with open('features/kaggle-fine.csv') as f:
    for l in f.readlines():
        feat = [float(i) for i in l.split(',')]
        xf.append(feat)

In [48]:
TrainX,TestX = [xf[i] for i in trainIn],[xf[i] for i in testIn]
regression(TrainX,TestX)

(0.5262974224357096,
 [0.5463729037668124,
  0.5155164357476183,
  0.45916339031867515,
  0.5153879155747669,
  0.5954167347881139,
  0.5259271544182714])

### Regression with measures + embeddings

In [51]:
xef = []
for i in range(3911):
    a = np.concatenate((xep[i],xf[i]))
    xef.append(a)
TrainX,TestX = [xef[i] for i in trainIn],[xef[i] for i in testIn]
regression(TrainX,TestX)

(0.46442690022657396,
 [0.49914265135722524,
  0.46245414719977535,
  0.40926493640405653,
  0.4615697145507722,
  0.49489668805051074,
  0.4592332637971034])