In [1]:
#import dependencies
import numpy as np
import pandas as pd
import os #for file operations
import glob #for file operations
import json #for creating dataset
import random
import time
from datetime import datetime


import storage
import csv

import spacy

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

### Set global variables to minimize unnecessary processing

In [2]:
STAGE_0 = 1
STAGE_1 = 1
STAGE_2 = 0

### Retrieve & Format Departments (Actors), Articles (Events), Links (Edges)

In [3]:
if(STAGE_0==1):
    # only load departments directly from SQL database if GL has not been calculated.
    departments = storage.retrieve_all_departments()
    department_frame = pd.DataFrame.from_records(departments, columns =['index', 'name', 'lat', 'long'])

article_dept_links = storage.get_article_department_links()
articles = storage.retrieve_all_articles()
article_frame = pd.DataFrame.from_records(articles, columns =['index', 'title', 'journal', 'date', 'abstract', 'grants'])
link_frame = pd.DataFrame.from_records(article_dept_links, columns =['dept0_index', 'dept1_index', 'article_index', 'journal'])


In [4]:
if(STAGE_0):
    department_frame.head()

In [5]:
article_frame.head()

Unnamed: 0,index,title,journal,date,abstract,grants
0,39571576,Evolutionary genomics of the emergence of brow...,Cell,2024 Nov 18,Brown seaweeds are keystone species of coastal...,
1,39566495,Fibroblastic reticular cells generate protecti...,Cell,2024 Nov 19,Stringent control of T cell activity in the tu...,
2,39561773,Glialike taste cells mediate an intercellular ...,Cell,2024 Nov 9,The sense of taste generally shows diminishing...,
3,39549699,The Arabidopsis bluelight photoreceptor CRY2 i...,Cell,2024 Nov 8,Cryptochromes (CRYs) are bluelight receptors t...,
4,39549698,The singlemolecule accessibility landscape of ...,Cell,2024 Nov 12,We present replicationaware singlemolecule acc...,


In [6]:
link_frame.head()

Unnamed: 0,dept0_index,dept1_index,article_index,journal
0,1,2,39571576,Cell
1,1,3,39571576,Cell
2,1,4,39571576,Cell
3,1,5,39571576,Cell
4,1,6,39571576,Cell


#### Format Time

In [7]:
article_frame.fillna("--",inplace=True)
article_frame['date']= article_frame['date'].apply(lambda d: d.replace("--","1900 Jan 0"))
article_frame['date'] = article_frame['date'].apply(lambda d: " ".join(d.split(" ")[0:2]))
article_frame['date'] = article_frame['date'].apply(lambda d: datetime.strptime(d, '%Y %b'))
article_frame['date'] = article_frame['date'] = pd.to_datetime(article_frame['date'], format="%m/%d/%Y")


In [8]:
article_frame['date'].describe()

count                             1809
mean     2023-08-25 23:22:35.223880448
min                2020-07-01 00:00:00
25%                2023-03-01 00:00:00
50%                2023-12-01 00:00:00
75%                2024-06-01 00:00:00
max                2024-11-01 00:00:00
Name: date, dtype: object

#### Descriptions of Dataframes Prior to Filtering + Processing

In [9]:
if(STAGE_0):
    department_frame.describe()

In [10]:
article_frame.describe()

Unnamed: 0,date
count,1809
mean,2023-08-25 23:22:35.223880448
min,2020-07-01 00:00:00
25%,2023-03-01 00:00:00
50%,2023-12-01 00:00:00
75%,2024-06-01 00:00:00
max,2024-11-01 00:00:00


In [11]:
link_frame.describe()

Unnamed: 0,dept0_index,dept1_index
count,426966.0,426966.0
mean,7969.046711,7969.046711
std,4959.87019,4959.87019
min,1.0,1.0
25%,3721.0,3721.0
50%,7696.0,7696.0
75%,11463.0,11463.0
max,18402.0,18402.0


## Apply GLiNER

[GLiNER: Generalist Model for Named Entity Recognition using Bidirectional Transformer by Zaratiana et. al](https://arxiv.org/pdf/2311.08526) presents an open named entity resolution model which matches text to a set of natural language entity tags provided at inference time. 

In [12]:
if(STAGE_0):
    #Load Spacy+Gliner Pipeline Pipeline
    nlp = spacy.blank("en")
    #add configs (University, School, Organization)
    custom_spacy_config = { "gliner_model": "urchade/gliner_multi",
                            "chunk_size": 250,
                            "labels": ["university", "school", "organization"],
                            "style": "ent"}
    nlp.add_pipe("gliner_spacy", config=custom_spacy_config)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  state_dict = torch.load(model_file, map_location=torch.device(map_location))


In [13]:
if(STAGE_0):
    from tqdm import tqdm
    
    tagged_schools = []
    GL_t = []
    texts = department_frame['name'].to_list()
    
    # Use tqdm to track progress
    for doc in tqdm(nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]), total=len(texts)):
        ents_schools = [ent.text for ent in doc.ents if ent.label_ == 'university' or ent.label_ == 'school']
        # if university or school is identified, append to the list
        if len(ents_schools) >= 1:
            tagged_schools.append(ents_schools)
            GL_t.append("S")
        # if a university or school is not identified, look for organizations
        else:
            ents_orgs = [ent.text for ent in doc.ents if ent.label_ == 'organization']
            if len(ents_orgs) >= 1:
                tagged_schools.append(ents_orgs)
                GL_t.append("O")
        # this should just append the actual text itself if no entities are extracted
        # - if a department entity is included it will use that first
        if len(ents_schools) == 0 and len(ents_orgs) == 0:
            tagged_schools.append([doc.text])
            GL_t.append("N")
            
    
    department_frame['GL'] = tagged_schools
    department_frame['GL_t'] = GL_t
    department_frame.to_pickle("./department_frame_GL.pkl") 


  0%|                                                 | 0/18402 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|█████████████████████████████████████| 18402/18402 [14:07<00:00, 21.72it/s]


In [16]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

We now have a dataframe with GLiNER extracted entities in column *GL*
This has the effect of removing redundant/extraneous information, reducing the chance RELiK focuses on the incorrect entity.

In [17]:
if(STAGE_0):
    department_frame.head()

In [31]:
if(STAGE_1):
    department_frame = pd.read_pickle("./department_frame_GL.pkl")  

In [32]:
if(STAGE_1):
    from relik import Relik
    from relik.inference.data.objects import RelikOutput
    import torch
    torch.cuda.empty_cache()
    relik = Relik.from_pretrained("relik-ie/relik-cie-large", device="cuda")
    #, device="cuda")?
    #relik = Relik.from_pretrained("sapienzanlp/relik-entity-linking-small")


                ___              __         
               /\_ \      __    /\ \        
 _ __     __   \//\ \    /\_\   \ \ \/'\    
/\`'__\ /'__`\   \ \ \   \/\ \   \ \ , <    
\ \ \/ /\  __/    \_\ \_  \ \ \   \ \ \\`\  
 \ \_\ \ \____\   /\____\  \ \_\   \ \_\ \_\
  \/_/  \/____/   \/____/   \/_/    \/_/\/_/
                                            
                                            





config.yaml:   0%|          | 0.00/942 [00:00<?, ?B/s]

[2024-11-23 12:15:24,625] [INFO] [relik.inference.annotator.from_pretrained:700] [PID:1219296] [RANK:0] Loading Relik from relik-ie/relik-cie-large[39m
[2024-11-23 12:15:24,627] [INFO] [relik.inference.annotator.from_pretrained:701] [PID:1219296] [RANK:0] {
    '_target_': 'relik.inference.annotator.Relik',
    'index': {
        'span': {
            '_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex.from_pretrained',
            'name_or_path': 'relik-ie/index-e5-small-v2-wikipedia-matryoshka',
        },
        'triplet': {
            '_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex.from_pretrained',
            'name_or_path': 'relik-ie/encoder-e5-small-v2-wikipedia-relations-index',
        },
    },
    'metadata_fields': [],
    'reader': {
        '_target_': 'relik.reader.pytorch_modules.triplet.RelikReaderForTripletExtraction',
        'transformer_model': 'relik-ie/relik-reader-deberta-v3-large-cie-wikipedia',
        'use_nme': True,

config.json:   0%|          | 0.00/771 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/202 [00:00<?, ?B/s]

documents.jsonl:   0%|          | 0.00/2.96G [00:00<?, ?B/s]

embeddings.pt:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

[2024-11-23 12:20:55,098] [INFO] [relik.retriever.indexers.base.from_pretrained:484] [PID:1219296] [RANK:0] Loading Index from config:[39m
[2024-11-23 12:20:55,099] [INFO] [relik.retriever.indexers.base.from_pretrained:485] [PID:1219296] [RANK:0] {
    '_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex',
    'device': 'cuda',
    'metadata_fields': [],
    'name_or_path': '/media/ssd/perelluis/relik_experiments/indexes/index-e5-small-v2-wikipedia-matryoshka',
    'precision': None,
    'separator': None,
    'use_faiss': False,
}[39m
[2024-11-23 12:20:55,099] [INFO] [relik.retriever.indexers.base.from_pretrained:492] [PID:1219296] [RANK:0] Loading documents from /home/aidan/.cache/huggingface/hub/models--relik-ie--index-e5-small-v2-wikipedia-matryoshka/snapshots/8d119e710a7a8be5000b789dfcde3e661767982b/documents.jsonl[39m
[2024-11-23 12:21:14,694] [INFO] [relik.retriever.indexers.base.from_pretrained:535] [PID:1219296] [RANK:0] Loading embeddings from /home/aidan/.

  embeddings = torch.load(embedding_path, map_location="cpu")


[2024-11-23 12:21:16,112] [INFO] [relik.retriever.indexers.inmemory.__init__:65] [PID:1219296] [RANK:0] Both documents and embeddings are provided.[39m


config.yaml:   0%|          | 0.00/171 [00:00<?, ?B/s]

documents.jsonl:   0%|          | 0.00/107k [00:00<?, ?B/s]

embeddings.pt:   0%|          | 0.00/476k [00:00<?, ?B/s]

[2024-11-23 12:21:17,297] [INFO] [relik.retriever.indexers.base.from_pretrained:484] [PID:1219296] [RANK:0] Loading Index from config:[39m
[2024-11-23 12:21:17,297] [INFO] [relik.retriever.indexers.base.from_pretrained:485] [PID:1219296] [RANK:0] {
    '_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex',
    'device': 'cuda',
    'metadata_fields': [],
    'name_or_path': 'relik-ie/encoder-e5-small-v2-wikipedia-relations-index',
    'precision': None,
    'separator': None,
    'use_faiss': False,
}[39m
[2024-11-23 12:21:17,298] [INFO] [relik.retriever.indexers.base.from_pretrained:492] [PID:1219296] [RANK:0] Loading documents from /home/aidan/.cache/huggingface/hub/models--relik-ie--encoder-e5-small-v2-wikipedia-relations-index/snapshots/f311d53631c26f80a2b1ba16ac65337d06561946/documents.jsonl[39m
[2024-11-23 12:21:17,299] [INFO] [relik.retriever.indexers.base.from_pretrained:535] [PID:1219296] [RANK:0] Loading embeddings from /home/aidan/.cache/huggingface/hub/mo

config.json:   0%|          | 0.00/767 [00:00<?, ?B/s]

configuration_relik.py:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.78G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.2k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

[2024-11-23 12:22:36,763] [INFO] [relik.inference.utils.load_reader:383] [PID:1219296] [RANK:0] Moving reader to `cuda`.[39m


In [22]:
def apply_relik_to_df(df, stage=True):
    if not stage:
        return df
        
    # Create new RELIK column
    df['RELIK'] = df['GL'].apply(lambda x: relik(x[0]) if isinstance(x, list) and len(x) > 0 else None)
    
    return df

In [23]:
if(STAGE_1):
    department_frame = apply_relik_to_df(department_frame)
    print(department_frame.iloc[0]['RELIK'])  # Print first result for verification
    department_frame.to_pickle("./relikApplied.pkl") 


[36m[2024-11-23 11:44:22,336] [DEBUG] [relik.reader.data.relik_reader_data.__iter__:527] [PID:1219296] [RANK:0] Dataset finished: 1 number of elements processed[39m
[36m[2024-11-23 11:44:23,199] [DEBUG] [relik.reader.data.relik_reader_data.__iter__:527] [PID:1219296] [RANK:0] Dataset finished: 1 number of elements processed[39m
[36m[2024-11-23 11:44:24,037] [DEBUG] [relik.reader.data.relik_reader_data.__iter__:527] [PID:1219296] [RANK:0] Dataset finished: 1 number of elements processed[39m
[36m[2024-11-23 11:44:24,864] [DEBUG] [relik.reader.data.relik_reader_data.__iter__:527] [PID:1219296] [RANK:0] Dataset finished: 1 number of elements processed[39m
[36m[2024-11-23 11:44:25,712] [DEBUG] [relik.reader.data.relik_reader_data.__iter__:527] [PID:1219296] [RANK:0] Dataset finished: 1 number of elements processed[39m
[36m[2024-11-23 11:44:26,529] [DEBUG] [relik.reader.data.relik_reader_data.__iter__:527] [PID:1219296] [RANK:0] Dataset finished: 1 number of elements processed[39

KeyboardInterrupt: 

In [34]:
xr = department_frame['GL'].tolist()
xr = [item[0] for item in xr]
department_frame['RELIK'] = relik(xr)

department_frame.to_pickle("./relikApplied2.pkl") 


tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

[36m[2024-11-23 12:33:26,567] [DEBUG] [relik.reader.data.relik_reader_re_data.__iter__:399] [PID:1219296] [RANK:0] Dataset finished: 18402 number of elements processed[39m


In [35]:
department_frame.head()


Unnamed: 0,index,name,lat,long,GL,GL_t,RELIK
0,1,"Génomique Métabolique, Genoscope, Institut Fra...",0.0,0.0,"[Génomique Métabolique, Genoscope, Institut Fr...",O,"RelikOutput(text='Génomique Métabolique', toke..."
1,2,"Sorbonne Université, CNRS, Integrative Biology...",0.0,0.0,[Sorbonne Université],S,"RelikOutput(text='Sorbonne Université', tokens..."
2,3,"Genoscope, Institut François Jacob, CEA, CNRS,...",0.0,0.0,[Université Paris-Saclay],S,"RelikOutput(text='Université Paris-Saclay', to..."
3,4,"Sorbonne Université, CNRS, Algal Genetics Grou...",0.0,0.0,[Sorbonne Université],S,"RelikOutput(text='Sorbonne Université', tokens..."
4,5,"Sorbonne Université, CNRS, Algal Genetics Grou...",0.0,0.0,[Sorbonne Université],S,"RelikOutput(text='Sorbonne Université', tokens..."


In [36]:
department_frame.iloc[2]['RELIK']

RelikOutput(text='Université Paris-Saclay', tokens=Université Paris-Saclay, id=2, spans=[Span(start=0, end=23, label='Paris-Saclay', text='Université Paris-Saclay')], triplets=[], candidates=Candidates(span=[[[{"text": "Paris-Saclay University", "id": 2447329, "metadata": {"wikidata": "Q109409389", "definition": "Paris-Saclay University (Universit\u00e9 Paris-Saclay) is a combined technological research institute and public research university in Paris, France. Paris-Saclay was established in 2019 after the merger of four technical grandes \u00e9coles, as well as several technological institutes, engineering schools, and research facilities; giving it fifteen constituent colleges with over 48,000 students combined. With the merger, the French government has explicitly voiced their wish to rival"}}, {"text": "Paris-Saclay Faculty of Sciences", "id": 1316417, "metadata": {"wikidata": "Q98112081", "definition": "The Paris-Saclay Faculty of Sciences or Orsay Faculty of Sciences, in French 

In [28]:
department_frame.to_pickle("./relik2Applied.pkl") 


In [25]:
department_frame2 = pd.read_pickle("./relik2Applied.pkl")  

In [26]:
department_frame2.head()

Unnamed: 0,original,id,txt
0,Department of Microbial Pathogenesis and Micro...,5307788,"New Haven, Connecticut"
1,Department of Microbial Pathogenesis and Micro...,5307788,"New Haven, Connecticut"
2,"MOE Key Laboratory of Bioinformatics, Center f...",5902060,Tsinghua University
3,"Department of Genetics, Stanford University, S...",681339,Stanford University
4,"MOE Key Laboratory of Bioinformatics, Center f...",5902060,Tsinghua University


In [None]:
department_frame['id'].unique()


In [None]:
import networkx as nx
G = nx.Graph()  # or DiGraph, MultiGraph, MultiDiGraph, etc
for i in range(0, len(department_frame)):
    G.add_node(department_frame.iloc[i]['id'])
    

In [None]:
G.number_of_nodes()

In [None]:
for i in range(0, len(article_dept_links)):
    a = link_frame.iloc[i]['dept0_index']
    b = link_frame.iloc[i]['dept1_index']
    print(str(a), "-->", str(b))
    try:
        a_id = department_frame.iloc[a]['id']
        b_id = department_frame.iloc[b]['id']
        G.add_edge(a_id, b_id)
    except:
        pass

In [None]:
G.number_of_edges()

In [None]:
reachability_matrix = np.zeros((len(G.nodes()), len(G.nodes())))
print(reachability_matrix.shape)

In [None]:
count_i=0
for i in G.nodes:
    count_j=0
    for j in G.nodes:
        reachability_matrix[count_i][count_j] = nx.has_path(G, i, j)
        #reachability_matrix[count_i][count_j] = nx.has_path(G, i, j)
        count_j+=1
    count_i+=1

                

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.matshow(reachability_matrix)
plt.colorbar()
plt.savefig('reachability.png')
plt.show()

In [None]:
row_sums = np.sum(reachability_matrix, axis=1)
sorted_indices = np.argsort(row_sums)
bottom_10_indices = sorted_indices[:10]

In [None]:
for x in bottom_10_indices:
    ids = G.nodes()
    #print(x)
    id = list(ids)[x]
    print(department_frame[department_frame['id'] == id]['original'].values)
    low_reaching_i = department_frame[department_frame['id'] == id].index[0]
    low_i_journals = link_frame[link_frame['dept0_index'] == low_reaching_i].journal.tolist()+link_frame[link_frame['dept1_index'] == low_reaching_i].journal.tolist()
    print(len(low_i_journals))
    print(low_i_journals)