In [None]:
import numpy as np
import scipy.spatial
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def get_cosine_similarity(u,v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    SciPy's ``cosine`` is a distance, so the similarity is one minus
    that distance.
    """
    cosine_distance = scipy.spatial.distance.cosine(u, v)
    return 1 - cosine_distance

def _as_vector_rows(vectors):
    """Stack a sequence of word vectors into a 2-D float array, one row per word."""
    return np.stack(
        [np.atleast_1d(np.squeeze(np.asarray(vector, dtype=float))) for vector in vectors]
    )


def cosine_similarity_matrix(word_vectors_1=None, word_vectors_2=None):
    '''
    Creates a matrix of the cosine similarities between the words of two sentences
    param:
    word_vectors_1: sequence of equal-length vectors, optional
      One vector per word of the first sentence. When omitted, the
      module-level ``word_array_1`` is used (the original behaviour).
    word_vectors_2: sequence of equal-length vectors, optional
      One vector per word of the second sentence. When omitted, the
      module-level ``word_array_2`` is used (the original behaviour).
    returns: array
      Similarity matrix of shape (len(word_vectors_2), len(word_vectors_1));
      entry [j][i] is the similarity between word i of the first sentence
      and word j of the second sentence.
    '''
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if word_vectors_1 is None:
        word_vectors_1 = word_array_1
    if word_vectors_2 is None:
        word_vectors_2 = word_array_2

    # cdist needs 2-D inputs, so an empty sentence is answered directly with
    # the same (empty) shape the element-wise loop used to produce.
    if len(word_vectors_1) == 0 or len(word_vectors_2) == 0:
        return np.zeros((len(word_vectors_2), len(word_vectors_1)))

    # cdist returns cosine *distances*; similarity is one minus the distance.
    distances = scipy.spatial.distance.cdist(
        _as_vector_rows(word_vectors_1), _as_vector_rows(word_vectors_2), metric='cosine'
    )
    # Transposed so rows follow sentence 2 and columns follow sentence 1.
    return (1 - distances).T

def plot_similarity_matrix(sentence_1, sentence_2, title):
    """Draw an annotated heatmap of the word similarity matrix of two sentences
    param:
    sentence_1: str
      Tokenized with word_tokenize; its tokens label the x axis
    sentence_2: str
      Tokenized with word_tokenize; its tokens label the y axis
    title: str
      Labels the plot with the corresponding title
    returns: None
    """
    column_names = word_tokenize(sentence_1)
    row_names = word_tokenize(sentence_2)

    # The matrix itself comes from cosine_similarity_matrix(), called without arguments.
    heatmap_values = cosine_similarity_matrix()

    heatmap_options = dict(
        vmin=0,
        vmax=1,
        xticklabels=column_names,
        yticklabels=row_names,
        cmap="YlGnBu",
        annot=True,
    )
    sns.heatmap(heatmap_values, **heatmap_options)
    plt.title(title)
    plt.show()

def get_similar_words(sentence_1, sentence_2):
    '''Prints, for each word in the first sentence, the most similar word(s) of the second sentence
    param:
    sentence_1: str
      Tokenized with word_tokenize; its tokens become the dictionary keys
    sentence_2: str
      Tokenized with word_tokenize; its tokens are the candidate matches
    returns: dict
      Maps each token of the first sentence to the list of second-sentence
      tokens that share the highest cosine similarity with it
    '''

    token_1 = word_tokenize(sentence_1)
    token_2 = word_tokenize(sentence_2)

    # cosine_similarity_matrix() has one row per word of sentence 2 and one
    # column per word of sentence 1; transpose it so each row holds the scores
    # of a single sentence-1 word against every sentence-2 word.
    scores_per_word = cosine_similarity_matrix().T

    similar_word_dict = {}
    for row, scores in enumerate(scores_per_word):
        if scores.size == 0:
            # Nothing to compare against when the second sentence is empty.
            similar_word_dict[token_1[row]] = []
            continue

        # The entries are similarities (1 - distance), so the best match is the
        # largest value; ties are all reported.
        best_score = scores.max()
        similar_word_dict[token_1[row]] = [
            token_2[col] for col in np.flatnonzero(scores == best_score)
        ]

    print('Similar words in two sentences are :', similar_word_dict)
    return similar_word_dict

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import json
import torch
import tensorflow as tf
#Mean Pooling - Take attention mask into account for correct averaging



#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighted by the attention mask.

    ``model_output[0]`` is taken as the token embeddings. The mask gains a
    trailing dimension and is expanded to the embeddings' size, so masked
    positions contribute nothing to the sum. The divisor is clamped to a tiny
    positive value so an all-zero mask does not divide by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_counts


#Encode text
def encode(texts):
    """Turn ``texts`` into L2-normalized, mean-pooled sentence embeddings.

    Uses the module-level ``tokenizer`` and ``model``. Inputs are padded and
    truncated by the tokenizer, the forward pass runs without gradient
    tracking, and the pooled vectors are normalized along dimension 1.
    """
    # Tokenize into PyTorch tensors.
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Forward pass only; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**batch)

    # Mask-aware average over tokens, then unit-length normalization.
    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

# Load model from HuggingFace Hub
# A single checkpoint name keeps the tokenizer and the model weights in sync.
MODEL_NAME = 'sentence-transformers/stsb-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)



def pre_processing(ques, ans):
    """
        Run the preprocessing pipeline on a question/answer pair: question
        demoting first, then stop-word removal on its result.
    :param ques: string
    :param ans: string
    :return: list
        Whatever remove_stop_words returns for the question-demoted answer
    """
    return remove_stop_words(question_demoting(ques, ans))

# Load data
df = pd.read_csv('mohler_dataset_edited.csv')

# Get a list of all student answers
student_answers = df['student_answer'].tolist()

# Similarity scores keyed by student-answer text, used for the JSON export.
# Identical answer texts share one key, so the last computed score wins there;
# the per-row scores written to the DataFrame below do not have that problem.
similarity_scores = {}
bert_similarity_score = similarity_scores  # same dict, under the name used previously

# The same desired answer appears on many rows; cache its embedding so it is
# encoded only once.
desired_answer_embeddings = {}

# One score per DataFrame row, in iteration order.
row_scores = []

# For each row, compare the student answer against that row's desired answer
for index, row in df.iterrows():
    desired_answer = row['desired_answer']
    student_answer = row['student_answer']

    # Encode both answers and flatten each embedding to 1-D
    if desired_answer not in desired_answer_embeddings:
        desired_answer_embeddings[desired_answer] = np.array(encode(desired_answer)).ravel()
    text_1_embed = desired_answer_embeddings[desired_answer]
    text_2_embed = np.array(encode(student_answer)).ravel()

    # Cast to a built-in float so the value is JSON serializable.
    score = float(get_cosine_similarity(text_1_embed, text_2_embed))
    row_scores.append(score)
    similarity_scores[student_answer] = score

# Assign by row rather than by matching on answer text, so that identical
# answers given to different questions keep their own scores.
df['bert_sim_score'] = row_scores

# Save the similarity scores to a JSON file
with open('similarity_scores.json', 'w') as f:
    json.dump(similarity_scores, f)

df.to_csv('ASAG_stsb-roberta-large.csv')


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
from math import sqrt

import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
import numpy as np


class Metrics:
    """Error and correlation metrics between two equally long numeric sequences.

    ``x`` is stored as given (NaN replaced by 0). ``y`` is rounded to the
    nearest 0.5 and then has its NaN entries replaced by 0. Both are held as
    float arrays that are copies of the inputs.
    """

    def __init__(self, x, y):
        self.x = self.__check_nan(x)
        # Round on the array rather than with the built-in round(): the built-in
        # raises on NaN, which would prevent the NaN replacement from ever running.
        y_values = np.array(y, dtype=float)
        self.y = self.__check_nan(np.round(y_values * 2) / 2)

    def __check_nan(self, array):
        """Return a float copy of ``array`` with every NaN replaced by 0."""
        # np.array copies, so the caller's data is never modified.
        cleaned = np.array(array, dtype=float)
        cleaned[np.isnan(cleaned)] = 0

        return cleaned

    def rmse(self):
        """Return the root mean squared error between ``x`` and ``y``."""
        return sqrt(mean_squared_error(self.x, self.y))

    def pearson_correlation(self):
        """Return the Pearson correlation of ``x`` and ``y`` as a float.

        Uses the population covariance and population standard deviations.
        The result is NaN when either sequence has zero variance.
        """
        deviations_x = self.x - self.x.mean()
        deviations_y = self.y - self.y.mean()
        cov = (deviations_x * deviations_y).mean()

        p = cov / (self.x.std() * self.y.std())

        return float(p)

    def spearman_correlation(self):
        """Return the result of ``scipy.stats.spearmanr`` for ``x`` and ``y``."""
        return spearmanr(self.x, self.y)


In [None]:
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.isotonic import IsotonicRegression
import numpy as np


class RegressionAnalysis:
    """Fit linear, ridge and isotonic regressors on one feature and predict.

    ``train_x``, ``train_y`` and ``test_x`` must provide ``to_numpy()`` (for
    example pandas Series). NaN values are replaced by 0 on copies before
    fitting or predicting; the objects passed in are never modified.
    """

    def __init__(self, train_x, train_y, test_x):
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x

    def __check_nan(self, array):
        """Return a float copy of ``array`` with every NaN replaced by 0."""
        # np.array copies, so a view onto the caller's data is never written to.
        cleaned = np.array(array, dtype=float)
        cleaned[np.isnan(cleaned)] = 0

        return cleaned

    def __column(self, series):
        """Return ``series`` as a NaN-free column of shape (n, 1)."""
        return self.__check_nan(series.to_numpy()).reshape(-1, 1)

    def __fit_predict(self, estimator):
        """Fit ``estimator`` on the training columns and predict for ``test_x``."""
        estimator.fit(self.__column(self.train_x), self.__column(self.train_y))
        return estimator.predict(self.__column(self.test_x))

    def linear(self):
        """Return ``LinearRegression`` predictions for ``test_x``.

        The target is fitted as a column, so the prediction is expected to be
        a column too (as scikit-learn mirrors the target's shape).
        """
        return self.__fit_predict(LinearRegression())

    def ridge(self):
        """Return ``Ridge`` predictions for ``test_x``; same shape as ``linear``."""
        return self.__fit_predict(Ridge())

    def isotonic(self):
        """Return ``IsotonicRegression`` predictions for ``test_x``.

        Inputs stay 1-D here. NaN inputs are zeroed exactly as in ``linear``
        and ``ridge``. NOTE(review): with scikit-learn's default settings,
        test values outside the training range presumably predict NaN, so
        callers should expect NaN in the output.
        """
        clf = IsotonicRegression()
        clf.fit(
            self.__check_nan(self.train_x.to_numpy()),
            self.__check_nan(self.train_y.to_numpy()),
        )
        return clf.predict(self.__check_nan(self.test_x.to_numpy()))


In [None]:
import numpy as np
import pandas as pd
# Load the scored answers from 'ASAG_stsb-roberta-large.csv' into the
# module-level `df`, which calculate_results() reads.
df = pd.read_csv('ASAG_stsb-roberta-large.csv')
def train_test_split(data, percentage):
    """Randomly split ``data`` into a training part and a test part.

    Each row is assigned independently: it goes to the training part when a
    uniform random draw falls below ``percentage / 100``. The split sizes are
    therefore only approximately ``percentage`` percent.

    param:
    data: DataFrame
      Rows to split (must support boolean-mask indexing)
    percentage: number
      Approximate share of rows, in percent, for the training part
    returns: tuple
      (data_train, data_test)
    """
    msk = np.random.rand(len(data)) < (percentage / 100)
    # Index the argument itself, not the notebook-level `df`.
    data_train = data[msk]
    data_test = data[~msk]

    return data_train, data_test

def avg(given_list):
    """Return the arithmetic mean of ``given_list`` (its sum divided by its length)."""
    total = sum(given_list)
    count = len(given_list)
    return total / count

def calculate_results():
    """Run one random 70/30 split of ``df`` and score three regressors on it.

    The regressors map 'bert_sim_score' to 'score_avg'. Metrics compare the
    true test scores against each model's predictions.

    returns: tuple
      (iso_rmse, iso_pearson, lin_rmse, lin_pearson, rid_rmse, rid_pearson)
    """
    train_data, test_data = train_test_split(df, 70)

    train_data_x = train_data['bert_sim_score']
    train_data_y = train_data['score_avg']

    test_data_x = test_data['bert_sim_score']
    test_data_y = test_data['score_avg'].to_list()

    regression = RegressionAnalysis(train_data_x, train_data_y, test_data_x)

    # Flatten the predictions instead of calling float() on each element: the
    # linear/ridge output is presumably 2-D (the target is fitted as a column),
    # and converting one-element arrays with float() is deprecated in NumPy.
    test_y_pred_lin = np.asarray(regression.linear(), dtype=float).ravel().tolist()
    test_y_pred_rid = np.asarray(regression.ridge(), dtype=float).ravel().tolist()
    # Isotonic predictions may contain NaN; score those as 0.
    test_y_pred_iso = list(np.nan_to_num(regression.isotonic(), nan=0))

    metrics_iso = Metrics(test_data_y, test_y_pred_iso)
    metrics_lin = Metrics(test_data_y, test_y_pred_lin)
    metrics_rid = Metrics(test_data_y, test_y_pred_rid)

    return (
        metrics_iso.rmse(), metrics_iso.pearson_correlation(),
        metrics_lin.rmse(), metrics_lin.pearson_correlation(),
        metrics_rid.rmse(), metrics_rid.pearson_correlation(),
    )

if __name__ == '__main__':
    # One list per (model, metric) pair; each gains one entry per repetition.
    iso_rmse, iso_pearson = [], []
    lin_rmse, lin_pearson = [], []
    rid_rmse, rid_pearson = [], []

    # Repeat the random split / fit / evaluate cycle and collect every score.
    for i in range(1000):
        (iso_rmse_score, iso_pc_score,
         lin_rmse_score, lin_pc_score,
         rid_rmse_score, rid_pc_score) = calculate_results()

        for bucket, value in (
            (iso_rmse, iso_rmse_score), (iso_pearson, iso_pc_score),
            (lin_rmse, lin_rmse_score), (lin_pearson, lin_pc_score),
            (rid_rmse, rid_rmse_score), (rid_pearson, rid_pc_score),
        ):
            bucket.append(value)

    # Report the average of each metric over all repetitions, rounded to 3 places.
    print('Metric \t \t \t | Isotonic Regression \t | Linear Regression \t | Ridge Regression | ')
    print('------------------------------------------------------------------------------------------------')
    for label, iso_values, lin_values, rid_values in (
        ('RMSE \t \t | ', iso_rmse, lin_rmse, rid_rmse),
        ('Pearson Correlation \t | ', iso_pearson, lin_pearson, rid_pearson),
    ):
        print(label, round(avg(iso_values), 3), '\t |', round(avg(lin_values), 3), '\t |',
              round(avg(rid_values), 3), ' |')
# print('Spearman Correlation \t | ', metrics_iso.spearman_correlation(),'\t |', metrics_lin.spearman_correlation(), '\t |', metrics_rid.spearman_correlation(), ' |')


Metric 	 	 	 | Isotonic Regression 	 | Linear Regression 	 | Ridge Regression | 
------------------------------------------------------------------------------------------------
RMSE 	 	 |  0.956 	 | 0.976 	 | 0.977  |
Pearson Correlation 	 |  0.512 	 | 0.477 	 | 0.475  |
