In [1]:
import joblib
import numpy as np
import pandas as pd
import random

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score

In [2]:
essay = joblib.load('essay_tfidf_asap7')
essay_par = joblib.load('essay_tfidf_paraphrase_asap7')

asap7 = joblib.load('asap7_paraphrase')

## TF-IDF

In [4]:
num_sample = 100

In [5]:
## SAMPLE 100 ESSAYS

random.seed(42)
essay_samples = random.sample(list(essay), num_sample)
#essay_samples = random.sample(list(enumerate(essay)), 100)
len(essay_samples)

100

In [6]:
## SAMPLE 100 PARAPHRASED ESSAYS

random.seed(42)
essay_samples_par = random.sample(list(essay_par), num_sample)
len(essay_samples_par)

100

### similarity

In [7]:
similarity_list = []

for i in range(len(essay_samples)):
    sim_list = []
    for j in range(len(essay)):
        sim_list.append(cosine_similarity(essay_samples[i].reshape(1,-1), essay[j].reshape(1,-1))[0][0])
    
    similarity_list.append(sim_list)

In [8]:
similarity_list = np.array(similarity_list)
similarity_list.shape

(100, 1569)

In [9]:
similarity_list_par = []

for i in range(len(essay_samples_par)):
    sim_list_par = []
    for j in range(len(essay_par)):
        sim_list_par.append(cosine_similarity(essay_samples_par[i].reshape(1,-1), essay_par[j].reshape(1,-1))[0][0])
    
    similarity_list_par.append(sim_list_par)

In [10]:
similarity_list_par = np.array(similarity_list_par)
similarity_list_par.shape

(100, 1569)

## NDCG Score

In [11]:
index_start = np.arange(1, len(essay) + 1)
#index_start

In [12]:
def get_ndcg(similarity_list, similarity_list_par):
    df = pd.DataFrame()
    df['index'] = index_start
    df['sim'] = similarity_list
    df['index_par'] = index_start
    df['sim_par'] = similarity_list_par

    df_sorted = df.sort_values(by='sim', ascending=False)

    sim = df['sim'].to_numpy()
    sim_par = df['sim_par'].to_numpy()

    index = df['index'].to_numpy()
    index_par = df['index_par'].to_numpy()

    return ndcg_score(index.reshape(1,-1), index_par.reshape(1,-1))

In [13]:
ndcg_scores = []

for i in range(len(similarity_list)):
    ndcg_scores.append(get_ndcg(similarity_list[i], similarity_list_par[i]))


In [14]:
np.mean(ndcg_scores)

1.0000000000000002

### anchor essay position

In [260]:
idx = np.where((essay == anchor_essay).all(1))
idx

(array([780], dtype=int64),)

In [261]:
asap7.loc[780]['domain1_score']

18

In [262]:
asap7.loc[780]['essay']

'A time when had patience was when I was waiting in the car to get to @LOCATION2. From my house to @LOCATION1, @LOCATION2 itÆs about @NUM1 hrs away. So I had to sit in a car for @NUM1 hours. The only things to do were listen to your ipod and sleep. Which I didnÆt do either of those things. As you can tell it would take a lot of patience to wait in a car for @NUM1 hours. After an hour had passed it would have felt like @NUM4 if you werenÆt doing anything. By that time I was ready to mess with my sister who was sleeping. I knew I couldnÆt do that though. Now was the time to think of something to do. I pulled out my phone and started texting people but no one answered. I tried looking out the window but that made me sleepy. Getting there took too long. Finally @NUM1 hours passed and we made it to @LOCATION2. This was a time when I had patience'

In [263]:
asap7.loc[780]['essay_paraphrase']

'At the point when had persistence was the point at which I was holding up in the vehicle to get to @LOCATION2. From my home to @LOCATION1, @LOCATION2 itÃ¦s about @NUM1 hrs away. So I needed to sit in a vehicle for @NUM1 hours. The lone activities were tune in to your ipod and rest. Which I didnÃ¦t do both of those things. As you can advise it would take a great deal of persistence to hang tight in a vehicle for @NUM1 hours. Following an hour had passed it would have felt like @NUM4 in the event that you werenÃ¦t busy. At that point I was prepared to meddle with my sister who was dozing. I knew I couldnÃ¦t do that however. Presently was an ideal opportunity to consider something to do. I pulled out my telephone and began messaging individuals yet nobody replied. I gave looking a shot the window yet that made me languid. Arriving took excessively long. At long last @NUM1 hours passed and we made it to @LOCATION2. This was the point at which I had tolerance\n'

In [264]:
idx = np.where((essay_par == anchor_essay_par).all(1))
idx

(array([780], dtype=int64),)