# Purpose of Notebook

I need to ensure that the document embeddings created are meaningful in some way, so it is important to evaluate them. To do this, I am going to create triplets of papers where two of them have an overlapping research area and the third is unrelated. 

# Import Dependencies

In [68]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tqdm
from scipy.sparse import load_npz

# Load Papers and Research Area

In [8]:
# Folder holding the consolidated CSV exports; the relative reads that follow resolve against it
path = r'C:\Users\aidan\OneDrive - University of Bath\1_Semester_2\Cm50175_dissertation_preparation\Data\consolidate\final_data'
os.chdir(path)

In [9]:
# Read the three source tables from the current working directory in one pass
papers, areas, abstracts = (
    pd.read_csv(file_name)
    for file_name in ('papers.csv', 'field_sources_list.csv', 'abstracts.csv')
)

# Consolidate Data

Create a dataframe that consists of `EID | source id | list of asjc codes`.

In [10]:
# Keep only the columns needed for evaluation, one row per eid
drop_columns = ['Unnamed: 0', 'doi', 'title', 'afid', 'coverDate', 'publicationName', 'citedby_count']
papers = (
    papers
    .drop(columns=drop_columns)
    .drop_duplicates(subset=['eid'])
)

# Collapse areas so that each source_id maps to a list of asjc codes
areas = areas.groupby(by=['source_id']).agg(lambda codes: codes.to_list())

# Attach abstracts (papers without one are dropped), then attach the asjc lists per source
merged = (
    papers
    .merge(abstracts, how='left', on='eid')
    .dropna(subset=['description'])
    .merge(areas, how='left', on='source_id')
    .drop(columns=['type', 'Unnamed: 0', 'doi', 'description'])
)

__How to catch the fact that some papers will not have ASJC codes?__

* Catch them in the function that generates the triplets

# Create Triplets

As a reminder, I want to create triplets in which two papers are from the same research area and the third paper is from an unrelated research area. To start with, I will create 10,000 triplets.

In [11]:
# Get 10,000 anchor papers that have ASJC codes.
# A fixed random_state makes the draw repeatable, so re-running the notebook
# rebuilds the same set of anchors instead of a fresh random one each time.
triplet_1 = merged[merged['asjc'].notnull()].sample(n=10000, random_state=42).copy()

In [16]:
# Remove the anchor papers so they cannot be picked as their own match / non-match.
# The remaining rows keep their original index labels.
merged = merged.drop(index=triplet_1.index)

In [17]:
# Find related papers
def find_triplet(in_codes, df, sample_size=20000, random_state=None):
    '''Function finds one related and one unrelated paper for a set of ASJC codes.
    in_codes : set of ASJC codes belonging to the anchor paper
    df : dataframe of candidate papers with `eid` and `asjc` columns
    sample_size : maximum number of candidate rows to draw (capped at len(df))
    random_state : passed to DataFrame.sample; set it for a repeatable draw

    A candidate is "related" when its codes share at least one element with
    in_codes, otherwise "unrelated". Candidates are scanned in sample order and
    the scan stops as soon as one of each kind has been seen.

    Returns a dict with keys 'match', 'match_codes', 'no_match' and
    'no_match_codes', or None (after printing 'No match') when the sample does
    not contain both kinds of paper.
    '''

    # Never request more rows than exist - DataFrame.sample raises ValueError otherwise
    sample = df.sample(n=min(sample_size, len(df)), random_state=random_state)

    match = None
    match_codes = None
    no_match = None
    no_match_codes = None

    # Only two columns are needed, so iterate over them directly instead of iterrows()
    for eid, raw_codes in zip(sample['eid'], sample['asjc']):

        # Skip rows with no usable codes: empty / None values fail the truth
        # test, NaN (left behind by the left merge) fails the set() conversion
        try:
            if not raw_codes:
                continue
            codes = set(raw_codes)
        except (TypeError, ValueError):
            continue

        # Any overlap with the anchor's codes counts as a related paper
        if in_codes & codes:
            match, match_codes = eid, codes
        else:
            no_match, no_match_codes = eid, codes

        # Check stopping condition
        if match is not None and no_match is not None:
            return {'match': match,
                    'match_codes': match_codes,
                    'no_match': no_match,
                    'no_match_codes': no_match_codes}

    # Sample exhausted without finding both a related and an unrelated paper
    print('No match')
    return None


# Build one (match, no match) pair per anchor paper in triplet_1.
# The four lists are kept exactly as long as triplet_1 so they can be assigned
# back as columns afterwards; anchors that cannot be resolved get None entries.
count = 0
matches = []
matches_codes = []
no_matches = []
no_matches_codes = []

total = len(triplet_1)
unresolved = {'match': None, 'match_codes': None, 'no_match': None, 'no_match_codes': None}

for index, in_row in triplet_1.iterrows():
    try:
        in_codes = set(in_row['asjc'])
    except TypeError:
        # The anchor's asjc value is not iterable (e.g. NaN): report it and move on
        print(index)
        print(in_row)
        res = None
    else:
        res = find_triplet(in_codes, merged)

    # find_triplet returns None when its sample held no suitable pair.
    # Rows left as None must be dropped before the triplets are evaluated.
    if res is None:
        res = unresolved

    matches.append(res['match'])
    matches_codes.append(res['match_codes'])
    no_matches.append(res['no_match'])
    no_matches_codes.append(res['no_match_codes'])

    count += 1
    if count % 1000 == 0:
        print(f'{count} out of {total:,} completed.')

1000 out of 10,000 completed.
2000 out of 10,000 completed.
3000 out of 10,000 completed.
4000 out of 10,000 completed.
5000 out of 10,000 completed.
6000 out of 10,000 completed.
7000 out of 10,000 completed.
8000 out of 10,000 completed.
9000 out of 10,000 completed.
10000 out of 10,000 completed.


In [18]:
# Attach the sampled partner papers and their ASJC code sets to each anchor paper
partner_columns = {
    'matches': matches,
    'matches_codes': matches_codes,
    'no_matches': no_matches,
    'no_matches_codes': no_matches_codes,
}
for column_name, column_values in partner_columns.items():
    triplet_1[column_name] = column_values

In [13]:
# Persist the triplets next to the embeddings so the sampling step need not be repeated
embeddings_dir = r'C:\Users\aidan\OneDrive - University of Bath\1_Semester_2\Cm50175_dissertation_preparation\Data\consolidate\embeddings'
os.chdir(embeddings_dir)
triplet_1.to_csv('triplets.csv')

# Evaluate Embeddings

1. Load document embeddings
2. For each triplet:
    1. Find all three embeddings
    2. Compute the cosine similarity between (eid, matches) and (eid, no_matches)
    3. Add record (correct or incorrect) to `triplets` dataframe for further evaluation at the end

In [14]:
# Reload the saved triplets, keeping only the ids; the unnamed first CSV column
# holds the index that triplet_1 was saved with and is renamed to orig_idx
triplets = (
    pd.read_csv('triplets.csv')
    .drop(columns=['source_id', 'asjc', 'matches_codes', 'no_matches_codes'])
    .rename(columns={'Unnamed: 0': 'orig_idx'})
)

__Load Embeddings__

In [87]:
def load_embeddings(model, vector_size, pretrained=False):
    '''Function loads embeddings from respective directories and file formats.
    model : word2vec, doc2vec, tfidf, glove, BERT
    pretrained : True, False (only affects which word2vec file is loaded)
    vector_size : dimension of document embedding to load (ignored for
                  pretrained word2vec and for BERT, whose file names are fixed)

    Returns the embeddings as a numpy array; tfidf is stored sparse and is
    densified, glove is transposed after loading.
    Raises ValueError if `model` is not one of the names above.

    Side effect: changes the working directory to the model's sub-folder, so
    relative paths used after this call resolve there.
    '''

    # Validate before touching the filesystem so a typo gives a clear error
    known_models = ('word2vec', 'doc2vec', 'tfidf', 'glove', 'BERT')
    if model not in known_models:
        raise ValueError(f'Model not recognised! Please try again. Got {model!r}, expected one of {known_models}.')

    # Change directory - each model keeps its files in a sub-folder named after it
    fpath = r'C:\Users\aidan\OneDrive - University of Bath\1_Semester_2\Cm50175_dissertation_preparation\Data\consolidate\embeddings'
    os.chdir(os.path.join(fpath, model))

    if model == 'word2vec':
        if pretrained:
            return np.load('word2vec_pretrained.npy')
        return np.load(f'my_w2v_{vector_size}_docvecs.npy')
    if model == 'doc2vec':
        return np.load(f'my_d2v_{vector_size}.model.dv.vectors.npy')
    if model == 'tfidf':
        return load_npz(f'tfidf_{vector_size}.npz').toarray()
    if model == 'glove':
        return np.load(f'glove_pretrained_{vector_size}.npy').T
    # Only 'BERT' is left after the validation above
    return np.load('specter_embeddings.npy')

In [16]:
# Test
#d2v_100 = load_embeddings(model='doc2vec', vector_size=100, pretrained=False)

__Evaluate Triplets__

In [104]:
def evaluate_triplets(triplets_df, model, vector_size, pretrained=False, lookup_df=None):
    '''Function scores every triplet with one set of document embeddings.
    triplets_df : dataframe with columns `orig_idx` (embedding row of the anchor),
                  `matches` and `no_matches` (eids of the related / unrelated paper)
    model, vector_size, pretrained : forwarded to load_embeddings
    lookup_df : dataframe with an `eid` column whose index labels are used as
                embedding row numbers; defaults to the notebook-level `merged`

    A triplet is recorded as True when the cosine similarity between the anchor
    and the related paper is greater than between the anchor and the unrelated one.

    Adds the column `{model}_{vector_size}` (suffix `_pre` when pretrained) to
    triplets_df in place and returns the same dataframe.
    Raises IndexError if an eid from a triplet is not present in lookup_df.
    '''

    if lookup_df is None:
        lookup_df = merged

    # 1) Load document vectors
    doc_vecs = load_embeddings(model=model, vector_size=vector_size, pretrained=pretrained)
    print('Vectors loaded')

    # Create column in df to record correct / incorrect
    result_col = f'{model}_{vector_size}_pre' if pretrained else f'{model}_{vector_size}'
    triplets_df[result_col] = None

    # Build the eid -> index label lookup once instead of scanning the whole
    # dataframe twice per triplet. setdefault keeps the first label for a
    # repeated eid, matching the previous `.index[0]` behaviour.
    eid_to_idx = {}
    for idx, eid in zip(lookup_df.index, lookup_df['eid']):
        eid_to_idx.setdefault(eid, idx)

    # 2) Loop through each triplet
    for i, row in tqdm(triplets_df.iterrows(), total=triplets_df.shape[0]):

        # Find index of the three articles - first already in df
        first_idx = row['orig_idx']
        try:
            second_idx = eid_to_idx[row['matches']]
            third_idx = eid_to_idx[row['no_matches']]
        except KeyError as err:
            raise IndexError(f'eid {err.args[0]!r} (triplet row {i}) not found in lookup_df') from err

        # Retrieve document vectors for three articles
        first_vec = doc_vecs[first_idx]
        second_vec = doc_vecs[second_idx]
        third_vec = doc_vecs[third_idx]

        # Compare three document vectors - cosine similarity
        first_norm = np.linalg.norm(first_vec)
        comp_one = np.dot(first_vec, second_vec) / (first_norm * np.linalg.norm(second_vec))
        comp_two = np.dot(first_vec, third_vec) / (first_norm * np.linalg.norm(third_vec))

        # Update table - True when the related paper is the closer of the two
        triplets_df.at[i, result_col] = comp_one > comp_two

    return triplets_df

In [107]:
# word2vec, 300 dimensions, not pretrained - adds a `word2vec_300` column of True / False results.
# `test` is the same dataframe object as `triplets` (evaluate_triplets updates it in place).
test = evaluate_triplets(triplets, 'word2vec', 300, pretrained=False)

  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Vectors loaded


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [26:16<00:00,  6.34it/s]


In [108]:
# Count True / False per result column (the first four columns are not result columns)
test.iloc[:, 4:].apply(pd.Series.value_counts)

Unnamed: 0,doc2vec_100,doc2vec_300,doc2vec_500,doc2vec_1000,glove_50,glove_100,glove_200,glove_300,word2vec_100,word2vec_300,word2vec_500,word2vec_1000,specter_768,tfidf_100,tfidf_300,tfidf_500,word2vec_300_pre
True,6384,6122,6064,6036,7211,7157,7107,7134,7493,7556,7581,7581,7389,5166,5796,5973,7460
False,3616,3878,3936,3964,2789,2843,2893,2866,2507,2444,2419,2419,2611,4834,4204,4027,2540


# Save results

In [113]:
# Relative path: the file is written to the current working directory, which
# load_embeddings() changes via os.chdir each time it is called.
test.to_csv('test_results.csv')