In [26]:
from math import sqrt

import numpy as np
import pandas as pd
import scipy.spatial.distance
import torch
from allennlp.commands.elmo import ElmoEmbedder
from bert_embedding import BertEmbedding
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import spearmanr
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

## Preprocessing

In [2]:
def preprocessing(q, ans):
    """Tokenize an answer, then drop question words and English stopwords.

    Both strings are split with ``word_tokenize``. Answer tokens that also
    occur among the question tokens are discarded (question demoting), and of
    the remainder only tokens absent from NLTK's English stopword list are
    kept. Membership tests are exact string comparisons, so they are
    case-sensitive.

    Returns the surviving answer tokens as a list, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    question_words = word_tokenize(q)

    kept = []
    for token in word_tokenize(ans):
        if token in question_words:
            continue
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

## Evaluation Metrics

In [3]:
# Metric functions: RMSE and Pearson correlation
def RMSE(actual, pred):
    """Return the root-mean-squared error between ``actual`` and ``pred``."""
    mse = mean_squared_error(actual, pred)
    return sqrt(mse)

def Pearson(actual,pred):
    """Return the Pearson correlation coefficient between two sequences.

    The coefficient is the population covariance divided by the product of
    the population standard deviations (``ddof=0`` throughout).

    Parameters
    ----------
    actual, pred : one-dimensional sequences of numbers with equal length.

    Returns
    -------
    float
        The correlation, or ``nan`` when either input has zero variance
        (the coefficient is undefined in that case).

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape. Previously a length
        mismatch was silently truncated by ``zip`` while the means and
        standard deviations still used the full sequences.
    """
    a = np.asarray(actual, dtype=float)
    p = np.asarray(pred, dtype=float)
    if a.shape != p.shape:
        raise ValueError(
            "actual and pred must have the same shape, got {} and {}".format(a.shape, p.shape)
        )
    if a.size == 0:
        raise ValueError("Pearson correlation is undefined for empty input")

    denom = a.std() * p.std()
    if denom == 0:
        # Constant input: avoid a 0/0 division (and its RuntimeWarning).
        return float('nan')

    cov = ((a - a.mean()) * (p - p.mean())).mean()
    return float(cov / denom)

## Embeddings

In [4]:
def check_tokens(sent):
    """Return ``sent`` as a list of tokens.

    A raw string is split with ``word_tokenize``; any other value (for
    example the token list produced by ``preprocessing``) is returned
    unchanged.
    """
    # The previous guard, ``if not list:``, tested the built-in ``list`` type
    # itself, which is always truthy, so strings were never tokenized.
    if isinstance(sent, str):
        return word_tokenize(sent)
    return sent

In [5]:
def bert(sent):
    """Return one BERT vector per token of ``sent``.

    ``sent`` is normalised with ``check_tokens`` and passed to
    ``BertEmbedding.embedding`` as its ``sentences`` argument. For every
    entry of the result, the first vector of its embedding list
    (``entry[1][0]``) is collected.

    The ``BertEmbedding`` instance is created on the first call and cached on
    the function object, so the model is not rebuilt for every answer.
    """
    embedder = getattr(bert, "_embedder", None)
    if embedder is None:
        embedder = bert._embedder = BertEmbedding()

    tokens = check_tokens(sent)
    embedding = embedder.embedding(sentences=tokens)
    return [entry[1][0] for entry in embedding]

In [7]:
def gpt(sent):
    """Return one output vector per token of ``sent`` from the ``openai-gpt`` model.

    Tokens (normalised with ``check_tokens``) are mapped to ids with the
    tokenizer's ``convert_tokens_to_ids`` and fed to the model as a single
    batch. The first element of the model output is returned as a list of
    per-token lists of floats.

    The tokenizer and model are loaded through ``torch.hub`` on the first
    call and cached on the function object; reloading them for every answer
    dominated the run time.
    """
    cached = getattr(gpt, "_cache", None)
    if cached is None:
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'openai-gpt')
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'openai-gpt')
        cached = gpt._cache = (tokenizer, model)
    tokenizer, model = cached

    tokens = check_tokens(sent)
    tokens_i = tokenizer.convert_tokens_to_ids(tokens)
    tokens_t = torch.tensor([tokens_i])

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        embedding = model(tokens_t)

    # NOTE(review): the model is loaded with an LM head, so the first output
    # is presumably vocabulary logits rather than hidden states - confirm
    # this is the intended representation.
    # embedding[0] has the batch as its first axis; [0] selects the only sentence.
    return embedding[0][0].tolist()

In [8]:
def elmo(sent):
    """Return one ELMo vector per token of ``sent``.

    ``sent`` is normalised with ``check_tokens`` and embedded with
    ``ElmoEmbedder.embed_sentence``. One vector per token is read from index
    0 of the result.

    The ``ElmoEmbedder`` instance is created on the first call and cached on
    the function object, so the model is not rebuilt for every answer.
    """
    embedder = getattr(elmo, "_embedder", None)
    if embedder is None:
        embedder = elmo._embedder = ElmoEmbedder()

    tokens = check_tokens(sent)
    embedding = embedder.embed_sentence(tokens)

    # NOTE(review): the token count is taken from embedding[2] while the
    # vectors are read from embedding[0] - confirm which layer is intended.
    return [embedding[0][i] for i in range(len(embedding[2]))]

In [9]:
def gpt2(sent):
    """Return one output vector per token of ``sent`` from the ``gpt2`` model.

    Tokens (normalised with ``check_tokens``) are mapped to ids with the
    tokenizer's ``convert_tokens_to_ids`` and fed to the model as a single
    batch. The first element of the model output is returned as a list of
    per-token lists of floats.

    The tokenizer and model are loaded through ``torch.hub`` on the first
    call and cached on the function object; reloading them for every answer
    dominated the run time.
    """
    cached = getattr(gpt2, "_cache", None)
    if cached is None:
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'gpt2')
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'gpt2')
        cached = gpt2._cache = (tokenizer, model)
    tokenizer, model = cached

    tokens = check_tokens(sent)
    tokens_i = tokenizer.convert_tokens_to_ids(tokens)
    tokens_t = torch.tensor([tokens_i])

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        embedding = model(tokens_t)

    # NOTE(review): the model is loaded with an LM head, so the first output
    # is presumably vocabulary logits rather than hidden states - confirm
    # this is the intended representation.
    # embedding[0] has the batch as its first axis; [0] selects the only sentence.
    return embedding[0][0].tolist()

## Regression Models

In [15]:
def linear_reg(xTrain,yTrain,xTest):
    """Fit a ``LinearRegression`` on the training data and predict for ``xTest``.

    Each argument is converted with ``to_numpy()``, reshaped to a single
    column and passed through ``check_nan`` before use. Returns the array
    produced by ``LinearRegression.predict``.
    """
    train_x, train_y, test_x = (
        check_nan(series.to_numpy().reshape(-1, 1))
        for series in (xTrain, yTrain, xTest)
    )
    regressor = LinearRegression().fit(train_x, train_y)
    return regressor.predict(test_x)

def isotonic_reg(xTrain,yTrain,xTest):
    """Fit an ``IsotonicRegression`` on the training data and predict for ``xTest``.

    Each argument is copied into a float array and its NaNs are replaced by
    0 with ``check_nan``, matching what ``linear_reg`` and ``ridge_reg`` do
    with their inputs. Working on copies means the caller's data is left
    untouched and the function no longer depends on another regressor having
    cleaned the same data beforehand.

    Returns the array produced by ``IsotonicRegression.predict``.
    """
    train_x, train_y, test_x = (
        check_nan(series.to_numpy(dtype=float, copy=True))
        for series in (xTrain, yTrain, xTest)
    )

    model = IsotonicRegression()
    model.fit(train_x, train_y)
    return model.predict(test_x)

def ridge_reg(xTrain, yTrain, xTest):
    """Fit a ``Ridge`` regressor on the training data and predict for ``xTest``.

    Each argument is converted with ``to_numpy()``, reshaped to a single
    column and passed through ``check_nan`` before use. Returns the array
    produced by ``Ridge.predict``.
    """
    columns = []
    for series in (xTrain, yTrain, xTest):
        columns.append(check_nan(series.to_numpy().reshape(-1, 1)))
    train_x, train_y, test_x = columns

    return Ridge().fit(train_x, train_y).predict(test_x)

## Feature Extraction
Given the type of embedding to use, embeddings are generated for the model answers and the student answers. The cosine similarity between the two is then calculated and stored.

In [None]:
df = pd.read_csv('dataset/mohler_dataset_edited.csv')

# Embedding functions that can be selected by name.
embedders = {'bert': bert, 'gpt2': gpt2, 'elmo': elmo, 'gpt': gpt}

model = input('Enter a model name(bert, gpt2, elmo, gpt) to get similarity scores for: ').strip().lower()
if model not in embedders:
    # Fail early instead of hitting a KeyError after the whole loop has run.
    raise ValueError("Unknown model name {!r}; expected one of {}".format(model, sorted(embedders)))
embed = embedders[model]


def _sentence_vector(tokens):
    """Sum the per-token vectors from the selected embedder into one vector."""
    return np.sum(np.asarray(embed(tokens)), axis=0)


# Calculate the cosine similarity score for each row.
# Rows are walked directly rather than looked up by answer text: identical
# student answers given to different questions would otherwise all be paired
# with the first matching question and overwrite each other's score.
similarity_scores = []
rows = zip(df['question'], df['desired_answer'], df['student_answer'])
for q, model_ans, ans in rows:
    # Preprocess the model answer and the student answer
    model_preproc = preprocessing(q, model_ans)
    stu_preproc = preprocessing(q, ans)

    if not model_preproc or not stu_preproc:
        # Nothing left after preprocessing, so there is no vector to compare.
        similarity_scores.append(np.nan)
        continue

    similarity = 1 - scipy.spatial.distance.cosine(
        _sentence_vector(model_preproc),
        _sentence_vector(stu_preproc),
    )
    similarity_scores.append(similarity)

col_name = model + "_similarity_score"
df[col_name] = similarity_scores
print('SCORE', col_name, 'computed for', len(similarity_scores), 'answers')

# Apply normalization techniques
columns = ['bert_similarity_score','elmo_similarity_score','gpt_similarity_score','gpt2_similarity_score']
for c in columns:
    if c not in df.columns:
        # Only models whose scores exist in the frame can be normalized.
        print('Skipping normalization of', c, '- column not present')
        continue
    df['normalized_'+c] = MinMaxScaler().fit_transform(np.array(df[c]).reshape(-1,1)).ravel()

df.to_csv('dataset/answers_with_similarity_score.csv')

## Evaluation
The similarity scores are used as data to train different regression models.

In [11]:
def avg(l):
    """Return the arithmetic mean of the values in ``l``."""
    total = sum(l)
    return total / len(l)

def check_nan(arr):
    """Replace every NaN in ``arr`` with 0, in place, and return ``arr``.

    The array passed in is modified; the return value is that same object.
    """
    arr[np.isnan(arr)] = 0
    return arr

In [27]:
def _round_to_half_points(raw):
    """Flatten predictions, round them to the nearest 0.5 and zero any NaN."""
    # ravel() first: the linear/ridge regressors return a column vector, and
    # converting its one-element rows with float() is deprecated in NumPy >= 1.25.
    flat = np.asarray(raw, dtype=float).ravel()
    return check_nan(np.round(flat * 2) / 2)


def evaluate_models(mdl, df, train_frac=0.7, rng=None):
    """Score isotonic, linear and ridge regression on one random train/test split.

    The feature is the column ``'normalized_<mdl>_similarity_score'`` and the
    target is ``'score_avg'``. Predictions are rounded to the nearest 0.5
    before scoring.

    Parameters
    ----------
    mdl : str
        Embedding name used to build the feature column name (lower-cased).
    df : pandas.DataFrame
        Must contain the feature column and ``'score_avg'``.
    train_frac : float, optional
        Probability with which each row is assigned to the training set.
        Defaults to 0.7.
    rng : numpy.random.Generator, optional
        Source of randomness for the split. When omitted, NumPy's global
        random state is used, exactly as before.

    Returns
    -------
    tuple
        ``(isotonic RMSE, isotonic Pearson, linear RMSE, linear Pearson,
        ridge RMSE, ridge Pearson)``.

    Raises
    ------
    ValueError
        If ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0 < train_frac < 1:
        raise ValueError("train_frac must be strictly between 0 and 1")

    draws = np.random.rand(len(df)) if rng is None else rng.random(len(df))
    mask = draws < train_frac
    dataTrain = df[mask]
    dataTest = df[~mask]

    mdl_score = 'normalized_' + mdl.lower() + '_similarity_score'
    xTrain = dataTrain[mdl_score]
    xTest = dataTest[mdl_score]
    yTrain = dataTrain['score_avg']
    yTest = dataTest['score_avg'].to_list()

    # Call order (linear, ridge, isotonic) is kept from the original code.
    linear_pred = _round_to_half_points(linear_reg(xTrain, yTrain, xTest))
    ridge_pred = _round_to_half_points(ridge_reg(xTrain, yTrain, xTest))
    # Isotonic predictions can contain NaN; they are scored as 0.
    isotonic_pred = _round_to_half_points(np.nan_to_num(isotonic_reg(xTrain, yTrain, xTest), nan=0))

    y = check_nan(np.asarray(yTest))

    return RMSE(y,isotonic_pred), Pearson(y,isotonic_pred), RMSE(y,linear_pred), Pearson(y,linear_pred), RMSE(y,ridge_pred), Pearson(y,ridge_pred)

In [28]:
df = pd.read_csv('dataset/answers_with_similarity_score.csv')
print("Evaluation Results:")
models=["bert","elmo","gpt","gpt2"]
for m in models:
    print(m.upper())

    # Every run returns (isotonic RMSE, isotonic Pearson, linear RMSE,
    # linear Pearson, ridge RMSE, ridge Pearson); transposing the runs gives
    # one list per metric, which is then averaged.
    runs = [evaluate_models(m, df) for _ in range(0, 1000)]
    rmse_iso, pc_iso, rmse_lin, pc_lin, rmse_rid, pc_rid = map(list, zip(*runs))

    print("Isotonic Regression \t ==> \t RMSE :",round(avg(rmse_iso), 3)," \t Pearson Correlation :", round(avg(pc_iso), 3))
    print("Linear Regression \t ==> \t RMSE :",round(avg(rmse_lin), 3)," \t Pearson Correlation :", round(avg(pc_lin), 3))
    print("Ridge Regression \t ==> \t RMSE :",round(avg(rmse_rid), 3)," \t Pearson Correlation :", round(avg(pc_rid), 3))

Evaluation Results:
BERT
Isotonic Regression 	 ==> 	 RMSE : 1.06  	 Pearson Correlation : 0.319
Linear Regression 	 ==> 	 RMSE : 1.081  	 Pearson Correlation : 0.266
Ridge Regression 	 ==> 	 RMSE : 1.078  	 Pearson Correlation : 0.269
ELMO
Isotonic Regression 	 ==> 	 RMSE : 0.98  	 Pearson Correlation : 0.482
Linear Regression 	 ==> 	 RMSE : 0.996  	 Pearson Correlation : 0.451
Ridge Regression 	 ==> 	 RMSE : 0.997  	 Pearson Correlation : 0.449
GPT
Isotonic Regression 	 ==> 	 RMSE : 1.083  	 Pearson Correlation : 0.25
Linear Regression 	 ==> 	 RMSE : 1.088  	 Pearson Correlation : 0.227
Ridge Regression 	 ==> 	 RMSE : 1.089  	 Pearson Correlation : 0.22
GPT2
Isotonic Regression 	 ==> 	 RMSE : 1.066  	 Pearson Correlation : 0.312
Linear Regression 	 ==> 	 RMSE : 1.079  	 Pearson Correlation : 0.273
Ridge Regression 	 ==> 	 RMSE : 1.08  	 Pearson Correlation : 0.268
