In [81]:
import csv
import functools
import gc
import json
import os
import sys

import copy
import copydf
import datasets
import decouple
import einops
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
import transformers as tfs
import torch
import tqdm

import data_collator
import modeling_bert
import semscholar_data_process
import utils

## Get data

In [61]:
# Defined locally: importing this from semscholar_data_process.py was attempted but did not work.

def parse_metadata(metadata_path):
    """Load a JSON-lines metadata file into a dict keyed by paper id.

    Every non-blank line of the file is parsed as one JSON object that must
    contain a 'paper_id' key. Blank / whitespace-only lines (for example a
    trailing newline at the end of the file) are skipped.

    Args:
        metadata_path: path to the JSON-lines metadata file.

    Returns:
        dict mapping each record's 'paper_id' to a dict holding that record's
        remaining fields. If a paper_id occurs more than once, the last
        record read wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no 'paper_id' field.
    """
    paper_metadata = {}
    # Read as UTF-8 explicitly so non-ASCII titles / journal names are decoded
    # the same way regardless of the platform's default encoding.
    with open(metadata_path, 'r', encoding='utf-8') as rf:
        for line in rf:
            # json.loads raises on empty input, so tolerate blank lines.
            if not line.strip():
                continue
            raw_metadata = json.loads(line)
            paper_metadata[raw_metadata['paper_id']] = {
                k: v for k, v in raw_metadata.items() if k != 'paper_id'
            }

    return paper_metadata

In [73]:
def build_paperstring(metadata, paper_id):
    """Render one paper as '<title>, by <First Last>, ... (<year>).'.

    Args:
        metadata: dict mapping paper ids to records that have 'title',
            'authors' (a list of dicts with 'first' and 'last') and 'year'.
        paper_id: key of the paper to render.

    Returns:
        The formatted citation-like string.
    """
    record = metadata[paper_id]

    title = record['title']
    author_names = []
    for author in record['authors']:
        author_names.append('{} {}'.format(author['first'], author['last']))
    year = record['year']

    return '{}, by {} ({}).'.format(title, ', '.join(author_names), year)

In [70]:
# Input files (JSON-lines): full-text paper records and per-paper metadata.
data_file = './project_dir/cse599/data/semscholar_data.jsonl'
metadata_file = './project_dir/cse599/data/semscholar_metadata.jsonl'
# out_file = 'preprocessed_data.jsonl'
# abstract_out_file = 'preprocessed_abstracts.jsonl'
embedding_size = 768

# One parsed JSON record per line of the data file.
paper_data = []
with open(data_file, 'r') as rf:
    paper_data.extend(json.loads(line) for line in rf)

metadata = parse_metadata(metadata_file)

print(f'paper_data: {len(paper_data)}')

# Parallel lists: entry i of each list describes the same kept paper.
papers, abstracts, paper_ids = [], [], []
for paper in paper_data:
    # Only papers that have both body text and an abstract are kept.
    if not (paper['body_text'] and paper['abstract']):
        continue

    paper_sections = []
    for section_dict in paper['body_text']:
        # Alternative (disabled): one cleaned string per section,
        # clean_text(section_dict['section'] + ' ' + section_dict['text'])
        paper_sections.append({
            'party': str(section_dict['section']),
            'text': str(semscholar_data_process.clean_text(section_dict['text'])),
        })
    papers.append(paper_sections)

    # Only the first entry of the abstract list is used.
    first_abstract_text = paper['abstract'][0]['text']
    abstracts.append([{
        'party': 'abstract',
        'text': str(semscholar_data_process.clean_text(first_abstract_text)),
    }])

    paper_ids.append(paper['paper_id'])

paper_data: 203


In [4]:
# Number of papers kept by the filter (those with both body text and an abstract).
len(papers)

78

In [5]:
# Section count for each kept paper, followed by the total over all papers.
paper_num_sections = list(map(len, papers))
print(sum(paper_num_sections))

2942


## Get embeddings

In [10]:
# Directory and entry name of the first set of embeddings.
# NOTE(review): the variables say "specter" but the directory is 'bert' and the
# entry is 'bert_mean_pooler_output' — confirm these are the intended embeddings.
# NOTE(review): absolute, machine-specific path; this one ends with '/', the
# hibert directory below does not.
specter_emb_dir = '/g/tial/data/ccu/roylu/cse599/bert/embs/'
specter_emb_name = 'bert_mean_pooler_output'

# Directory and entry name of the HiBERT embeddings.
hibert_emb_dir = '/g/tial/data/ccu/roylu/cse599/hibert/embs'
hibert_emb_name = 'conversation_pooler_output'

In [26]:
# Load the 'train' embeddings from each directory and keep only the named entry.
# presumably utils.load_embeddings returns a mapping from output name to an
# array of embeddings — verify against utils.py.
specter_output = utils.load_embeddings(specter_emb_dir, 'train')[specter_emb_name]
hibert_output = utils.load_embeddings(hibert_emb_dir, 'train')[hibert_emb_name]

In [27]:
# Shape of the loaded specter_output embeddings.
specter_output.shape

(78, 768)

In [13]:
# Shape of the loaded hibert_output embeddings.
hibert_output.shape
# Disabled experiment, kept for reference: mean-pool consecutive rows of
# hibert_output into one embedding per paper, using paper_num_sections to
# decide how many rows belong to each paper.
# curr_start = 0
# hibert_output_meanpooled = []
# for num_sections in paper_num_sections:
#     paper_meanpooled_embed = np.mean(hibert_output[curr_start : curr_start + num_sections], axis=0)
#     hibert_output_meanpooled.append(paper_meanpooled_embed)
#     print(f'original shape ({hibert_output[curr_start : curr_start + num_sections].shape}) from {curr_start}')
#     curr_start += num_sections
#     print(f'new shape {paper_meanpooled_embed.shape}')
# hibert_output_meanpooled = np.stack(hibert_output_meanpooled, axis=0)
# print(f'final shape {hibert_output_meanpooled.shape}')

(78, 300)

In [63]:
def get_top5_neighbors(embeddings):
    """Find each embedding's five nearest neighbors by cosine distance.

    Args:
        embeddings: matrix with one embedding per row, in any form accepted
            by sklearn's pairwise_distances.

    Returns:
        (top5_lists, top5_distances): two parallel lists with one entry per
        row of `embeddings`. top5_lists[i] holds the row indices of the
        nearest neighbors of row i, closest first and never including i
        itself; top5_distances[i] holds the matching cosine distances.
        Entries contain fewer than five elements when `embeddings` has fewer
        than six rows.
    """
    distances = pairwise_distances(embeddings, metric='cosine')
    top5_lists = []
    top5_distances = []
    for self_idx, embed_distances in enumerate(distances):
        embed_ranks = embed_distances.argsort()
        # Drop the row's own index explicitly rather than assuming it sorts
        # first: when another row is at distance 0 (e.g. a duplicate
        # embedding) the tie order is arbitrary, and slicing [1:6] could keep
        # the row itself while discarding a genuine neighbor.
        embed_ranks = embed_ranks[embed_ranks != self_idx]
        embed_top5_neighbors = list(embed_ranks[:5])
        embed_top5_distances = [embed_distances[neighbor_idx] for neighbor_idx in embed_top5_neighbors]

        top5_lists.append(embed_top5_neighbors)
        top5_distances.append(embed_top5_distances)

    return top5_lists, top5_distances

def get_intersections(a_top5_neighbors, b_top5_neighbors):
    """Intersect two parallel collections of neighbor lists, pair by pair.

    Args:
        a_top5_neighbors: iterable of neighbor-index lists.
        b_top5_neighbors: iterable of neighbor-index lists, aligned with
            `a_top5_neighbors`.

    Returns:
        A list with one entry per aligned pair: the indices present in both
        lists of that pair (as a list, in set order). Pairing stops at the
        shorter of the two inputs.
    """
    paired = zip(a_top5_neighbors, b_top5_neighbors)
    return [list(set(left) & set(right)) for left, right in paired]

In [65]:
# Nearest neighbors (indices and cosine distances) under the specter_output embeddings.
# NOTE(review): these prints dump the complete neighbor/distance lists for every paper.
specter_top5_neighbors, specter_top5_distances = get_top5_neighbors(specter_output)
print(specter_top5_neighbors)
print(specter_top5_distances)

# Same computation under the hibert_output embeddings.
hibert_top5_neighbors, hibert_top5_distances = get_top5_neighbors(hibert_output)
print(hibert_top5_neighbors)
print(hibert_top5_distances)


[[7, 24, 51, 55, 54], [13, 64, 51, 49, 36], [38, 50, 58, 68, 5], [8, 53, 12, 67, 17], [70, 65, 63, 16, 52], [38, 50, 2, 58, 68], [57, 55, 76, 40, 14], [33, 24, 0, 22, 44], [62, 12, 3, 67, 17], [38, 2, 50, 58, 68], [23, 33, 41, 40, 76], [66, 42, 64, 49, 30], [17, 8, 66, 69, 3], [1, 36, 54, 49, 64], [40, 15, 62, 70, 33], [14, 40, 49, 70, 61], [52, 8, 47, 17, 27], [66, 12, 67, 69, 29], [46, 36, 48, 13, 54], [76, 55, 44, 48, 31], [73, 29, 65, 67, 63], [16, 76, 48, 31, 49], [48, 7, 24, 19, 76], [33, 44, 55, 10, 40], [7, 51, 54, 0, 22], [37, 5, 60, 50, 58], [56, 48, 76, 22, 32], [40, 3, 41, 30, 8], [53, 3, 27, 8, 49], [73, 20, 66, 17, 67], [70, 69, 66, 64, 42], [49, 48, 8, 30, 40], [39, 46, 31, 13, 63], [23, 40, 55, 49, 14], [70, 62, 40, 61, 65], [56, 70, 17, 4, 63], [13, 54, 23, 49, 18], [25, 60, 5, 50, 58], [50, 2, 58, 68, 5], [32, 48, 4, 35, 67], [33, 14, 27, 61, 76], [27, 8, 54, 40, 48], [49, 30, 74, 47, 66], [33, 44, 23, 70, 47], [23, 33, 43, 55, 19], [40, 3, 41, 23, 70], [32, 52, 18, 5

In [39]:
# Per paper: neighbor indices that appear in both the specter and the hibert top-5 lists.
# NOTE(review): this cell only assigns `overlaps` and prints nothing, so any
# per-paper lists recorded beneath it were not produced by the code as written.
overlaps = get_intersections(specter_top5_neighbors, hibert_top5_neighbors)

[24]
[52, 13]
[]
[8, 67, 53]
[16]
[68]
[57]
[24, 22]
[3, 12]
[]
[40, 33, 23]
[64, 30]
[8, 3]
[32, 1]
[]
[]
[52]
[67, 69]
[32, 13]
[44]
[17]
[16]
[24, 7]
[33, 10, 44]
[51, 7]
[]
[32]
[40, 49]
[3, 53]
[17, 66, 20]
[69]
[]
[13]
[40, 23]
[70]
[56, 65]
[18, 13]
[38]
[]
[35]
[33, 27, 61]
[48]
[47]
[]
[]
[]
[32]
[17, 42]
[31]
[30]
[58]
[24, 43]
[16]
[8, 3, 28]
[]
[]
[35]
[6]
[38]
[]
[68]
[72]
[8, 12]
[64, 65, 17]
[17]
[63]
[17, 29]
[64, 17]
[]
[17]
[72]
[65, 70]
[61, 70, 71]
[17, 67, 20]
[17, 66, 3, 42]
[32, 26]
[61, 55]
[61]


In [72]:
# Display the parsed metadata dict.
# NOTE(review): this shows every record; consider inspecting a small sample instead.
metadata

{'77490025': {'title': 'State of external ocular muscles in strabismus of different duration',
  'authors': [{'first': "Pen'kov", 'middle': [], 'last': 'Ma', 'suffix': ''},
   {'first': 'Konstantinovskaia', 'middle': [], 'last': 'Ke', 'suffix': ''},
   {'first': 'Zhukova', 'middle': [], 'last': 'Sv', 'suffix': ''}],
  'abstract': None,
  'year': 1975,
  'arxiv_id': None,
  'acl_id': None,
  'pmc_id': None,
  'pubmed_id': None,
  'doi': None,
  'venue': None,
  'journal': 'Oftalmologicheskiĭ zhurnal',
  'mag_id': '2426173123',
  'mag_field_of_study': ['Medicine'],
  'outbound_citations': [],
  'inbound_citations': [],
  'has_outbound_citations': False,
  'has_inbound_citations': False,
  'has_pdf_parse': False,
  's2_url': 'https://api.semanticscholar.org/CorpusID:77490025'},
 '77490084': {'title': 'Management of uterine fibromyoma',
  'authors': [{'first': 'Landau', 'middle': [], 'last': 'IaM', 'suffix': ''},
   {'first': 'Karpushin', 'middle': [], 'last': 'Vp', 'suffix': ''}],
  'abs

In [79]:
# Build one flat record per paper: its own description plus, for ranks 1-5,
# the description and cosine distance of its specter and hibert neighbors.
parsed_neighbors = []
neighbor_rows = zip(
    paper_ids,
    specter_top5_neighbors, specter_top5_distances,
    hibert_top5_neighbors, hibert_top5_distances,
    overlaps,
)
# `overlaps` takes part in the zip (so it can shorten the iteration), but its
# values are not written into the records.
for paper_id, specter_idxs, specter_dists, hibert_idxs, hibert_dists, _unused_overlap in neighbor_rows:
    record = {
        'paper_id': paper_id,
        'paper_info': build_paperstring(metadata, paper_id),
    }
    for rank in range(1, 6):
        pos = rank - 1  # list position of the rank-th neighbor
        record[f'specter_neighbor_{rank}'] = build_paperstring(metadata, paper_ids[specter_idxs[pos]])
        record[f'specter_distance_{rank}'] = specter_dists[pos]

        record[f'hibert_neighbor_{rank}'] = build_paperstring(metadata, paper_ids[hibert_idxs[pos]])
        record[f'hibert_distance_{rank}'] = hibert_dists[pos]

    parsed_neighbors.append(record)

In [80]:
# Spot-check: show the record built for the last paper.
parsed_neighbors[-1]

{'paper_id': '123621513',
 'paper_info': 'Cold start characteristics study based on real time no emissions in an LPG SI engine, by Yingli Zu, Gong Li (2010).',
 'specter_neighbor_1': "Do we need a new family of optical-NIR extinction laws?, by J. Apell'aniz (2012).",
 'specter_distance_1': 0.2679466,
 'hibert_distance_1': 0.00085014105,
 'specter_neighbor_2': 'Understanding our Galaxy - key contributions from the Parkes telescope, by J. Caswell (2012).',
 'specter_distance_2': 0.27512223,
 'hibert_neighbor_2': 'Understanding our Galaxy - key contributions from the Parkes telescope, by J. Caswell (2012).',
 'hibert_distance_2': 0.00094014406,
 'specter_neighbor_3': 'Investigation ofant colonyalgorithm in multiple traffic flow environments, by A. Tizghadam, M. Hashemi, A. Leon-Garcia (2005).',
 'specter_distance_3': 0.2866732,
 'hibert_neighbor_3': 'In vivo/in vitro Studies of the Effects of the Type II Arabinogalactan Isolated from Maytenus ilicifolia Mart. ex Reissek on the Gastrointes

In [82]:
# Export one CSV row per paper from the parsed_neighbors records.
output_csv = './project_dir/cse599/data/neighbors_output.csv'

# Column order follows the key order of the first record. This is resolved
# before the file is opened so that an empty parsed_neighbors fails without
# truncating an existing output file.
fieldnames = list(parsed_neighbors[0].keys())

# newline='' is required by the csv module (otherwise extra blank rows appear
# on platforms that translate '\n'); UTF-8 is set explicitly because paper
# titles and author names can contain non-ASCII characters.
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(parsed_neighbors)