This notebook uses the fine-tuned model to embed the texts, finds the training text most similar to each input (i.e., test) text, and assigns the input text the score of that most similar training text.

A large portion of this code is written by Mathis Lucka ([GitHub](https://github.com/mathislucka), [Kaggle](https://www.kaggle.com/mathislucka)).

# Imports

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Mount Google Drive so the data and model files referenced below can be read.
# The mount point 'gdrive' is a relative path; the 'gdrive/...' paths used in
# this notebook all start from it.

from google.colab import drive
drive.mount('gdrive')

# Globals

In [None]:
# Which cross-validation fold's fine-tuned model to load.
FOLD = 3

# Root folder of the project inside the mounted Google Drive.
BASE_PATH = 'gdrive/My Drive/colabNotebooks/commonLitReadabilityPrize/quantProject'


def _underBase(relativePath):
  """Return relativePath joined onto BASE_PATH."""
  return os.path.join(BASE_PATH, relativePath)


# Input files, all located below BASE_PATH.
ONLY_COMMONLIT_TRAIN_DATA_PATH = _underBase('data/training/original/train.csv')
ORIGINAL_DATASET_PATH = _underBase('data/training/allData/fullset.csv')
ENCODINGS_ORIGINAL_DATASET_PATH = _underBase('finalEncodings/sentenceBERT.csv')
# has test & train sets
FULL_CLEAR_CORPUS_PATH = _underBase('data/clearCorpus/clearCorpus.csv')

In [None]:
# Load the full CLEAR corpus; it holds both the test and the train rows.
FULL_CLEAR_CORPUS_DF = pd.read_csv(FULL_CLEAR_CORPUS_PATH)

# Boolean row masks built from the 'testOrTrain' column.
_splitLabels = FULL_CLEAR_CORPUS_DF['testOrTrain']
isTest_Bool = _splitLabels == 'Test'
isTrain_Bool = _splitLabels == 'Train'

# Test split: excerpt text and its 'BT_easiness' target.
TEST_SET_TEXT = FULL_CLEAR_CORPUS_DF.loc[isTest_Bool, 'Excerpt']
TEST_SET_TARGET = FULL_CLEAR_CORPUS_DF.loc[isTest_Bool, 'BT_easiness']

# Train split: same two columns, restricted to the train rows.
TRAIN_SET_TEXT = FULL_CLEAR_CORPUS_DF.loc[isTrain_Bool, 'Excerpt']
TRAIN_SET_TARGET = FULL_CLEAR_CORPUS_DF.loc[isTrain_Bool, 'BT_easiness']

# Embeddings

In [None]:
#### USING A FINE-TUNED MODEL ####

# Queries: the test excerpts and their 'BT_easiness' targets.
testExcerpts = TEST_SET_TEXT.tolist()
testTargets = TEST_SET_TARGET.tolist()
print('Num test: ', len(testExcerpts))

# Corpus: full data set w/ external data.
# Read the CSV once and take both columns from the same frame, so the file is
# not parsed twice and excerpts and targets are guaranteed to stay aligned.
fullDatasetDf = pd.read_csv(ORIGINAL_DATASET_PATH)
trainExcerpts = fullDatasetDf.excerpt.tolist()
trainTargets = fullDatasetDf.target.tolist()
print('Num train: ', len(trainExcerpts))

# Load the fine-tuned model for the fold selected by FOLD.
# NOTE(review): this path starts with 'gdrive/MyDrive' while BASE_PATH uses
# 'gdrive/My Drive' -- confirm both spellings resolve on the mounted drive.
sbertModel = SentenceTransformer('gdrive/MyDrive/colabNotebooks/commonLitReadabilityPrize/firstPlace_CodeFiles/models/finalModel_robertabase_simplerAlgo/model_fold_{}_simplerAlgo'.format(FOLD))

# Embed the test excerpts (queries) and the train excerpts (corpus) with the
# same model so that the two sets of vectors are comparable.
queriesTestSet = sbertModel.encode(testExcerpts)
#print(queriesTestSet)
# Get sentence embeddings for the model we trained
fullSetEmbeddings = sbertModel.encode(trainExcerpts)
#print(fullSetEmbeddings)

# For every test embedding, retrieve its 5 best-matching corpus entries.
# Each result is a list of hits; a hit's 'corpus_id' indexes into
# trainExcerpts / trainTargets.
similarMatches = semantic_search(queriesTestSet, fullSetEmbeddings, top_k=5)


In [None]:
# For every test excerpt, take the first (top) hit returned by the search,
# print the matched train excerpt together with its target, and keep that
# target as the predicted score for the test excerpt.
listOfTargets = []
for hits in similarMatches:
  topCorpusIdx = hits[0]['corpus_id']
  print(trainExcerpts[topCorpusIdx])
  topTarget = trainTargets[topCorpusIdx]
  print(topTarget)
  listOfTargets.append(topTarget)
  print('--')

# Number of predictions collected (one per test excerpt).
print(len(listOfTargets))

In [None]:
# RMSE of the nearest-match predictions against the test targets.
# The square root is taken explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises a TypeError on current releases. For these 1-D targets
# sqrt(MSE) is exactly what `squared=False` used to return.
rms2 = mean_squared_error(testTargets, listOfTargets) ** 0.5
print('RMSE: ', rms2)

# Coefficient of determination of the same predictions.
print('R-squared Score: ', r2_score(testTargets, listOfTargets))

# RMSE best distilroberta: model fold 4 = 0.7935653654411279
# RMSE best roberta-base: model fold 0 = 0.7560099182727237 (didn't check the other folds)
# RMSE best finalModel (trained on distilroberta): model fold 5 = 0.5902524799847981
# RMSE best finalModel (trained on roberta-base): model fold 3 = 0.559884461712637
# RMSE best finalModel (trained on deberta-base): model fold 1 = 0.5617638620548048

# With fine-tuned roberta-base:
#RMSE:  0.559884461712637
#R-squared Score:  0.7066733375116456