In [None]:
import scispacy
import spacy
import re
import pandas as pd 
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import os
from efficient_apriori import apriori

In [None]:
# Model names to choose from:
#   en_ner_jnlpba_md, en_ner_bc5cdr_md, en_ner_bionlp13cg_md, en_ner_craft_md, en_core_sci_lg
# The pipeline is loaded with the 'parser' and 'tagger' components disabled.
# Alternative setting noted by the author: disable=['tagger', ]
SPACY_MODEL = "en_core_sci_lg"
DISABLED_PIPES = ['parser', 'tagger']
nlp = spacy.load(SPACY_MODEL, disable=DISABLED_PIPES)

In [None]:
# Switch to the corpus directory so that the relative file paths used further
# down resolve. The location can be overridden per machine through the
# PUBSCIENCE_PUBMED_DIR environment variable; when it is not set, the original
# hard-coded location is used.
DATA_DIR = os.environ.get("PUBSCIENCE_PUBMED_DIR",
                          "/media/bramiozo/DATA-FAST/text_data/pubscience/pubmed")
os.chdir(DATA_DIR)

In [None]:
# (corpus name, relative path) pairs. All cardiovascular corpora share one
# directory and naming pattern; the rheumatoid arthritis corpus lives in its own
# directory and is appended last.
_cvd_topics = ["cardiovascular_disease",
               "acute_coronary_syndrome",
               "atrial_fibrillation",
               "cardiomyopathy",
               "myocardial_ischemia",
               "tachycardia"]
filenames = [(topic, f"cvd_abstract/{topic}.xml.abstract.raw") for topic in _cvd_topics]
filenames.append(("rheumatoid_arthritis", "ra_abstract/rheumatoid_arthritis.xml.abstract.raw"))

In [None]:
# Load every corpus file into memory; each line of a file becomes one list entry
# (trailing newlines are kept, as returned by readlines()).
# NOTE(review): the files are opened with the platform default encoding — confirm
# that this matches how the .raw files were written.
corpora = defaultdict(list)
for corpus_name, corpus_path in filenames:
    with open(corpus_path, "r") as handle:
        lines = handle.readlines()
    corpora[corpus_name] = lines
    print(f"{corpus_name} contains {len(lines)} documents")

# NER


## MedCat

## SciSpacy

In [None]:
# Run the spaCy pipeline over every corpus and store, per corpus key, a 3-tuple:
#   [0] token_dict      : doc index -> {entity lemma -> occurrence count}
#   [1] wv_dict_avg     : doc index -> _doc.vector
#   [2] wv_dict_ent_avg : doc index -> mean of the entity vectors of that doc,
#                         or None when the doc has no entities
token_db = defaultdict(tuple)
token_types = set()

for k, abstracts in corpora.items():
    print(f"Processing {k}...")
    token_dict = defaultdict(lambda: defaultdict(int))
    wv_dict_avg = dict()
    wv_dict_ent_avg = dict()
    # Wrap the document stream itself (not the enumerate object) and pass the
    # total, so the progress bar can show a percentage and an ETA instead of a
    # bare iteration counter.
    doc_stream = tqdm(nlp.pipe(abstracts), total=len(abstracts), desc=k)
    for idx, _doc in enumerate(doc_stream):
        wv_dict_avg[idx] = _doc.vector
        vlist = []
        for _token in _doc.ents:
            # Every entity contributes to the entity average, regardless of the
            # length filter applied below.
            vlist.append(_token.vector)
            # NOTE(review): _token comes from _doc.ents; if these are spaCy Span
            # objects, len() counts tokens rather than characters, so this keeps
            # only entities of three or more tokens — confirm that is intended.
            if len(_token)>2:
                txt = _token.lemma_
                token_dict[idx][txt] += 1
        wv_dict_ent_avg[idx] = np.vstack(vlist).mean(axis=0) if len(vlist)>0 else None
    token_db[k] = token_dict, wv_dict_avg, wv_dict_ent_avg

In [None]:
# Mine association rules per corpus. One transaction is built per document that
# has an entry in the first element of its token_db tuple; the transaction is
# the tuple of distinct entity lemmas recorded for that document.
apriori_results_db = dict()
for corpus_key, corpus in token_db.items():
    print(f"Collecting entity tuples and mining associations for :{corpus_key}")
    per_doc_counts = corpus[0]
    # Iterating a dict yields its keys, i.e. the lemmas; the counts are not used.
    ent_tuples = [tuple(lemma_counts) for lemma_counts in per_doc_counts.values()]

    apriori_results_db[corpus_key] = apriori(ent_tuples, min_support=0.0025, min_confidence=0.15)

In [None]:
# Display the second element of the stored apriori result for one corpus.
db = 'rheumatoid_arthritis'
_itemsets, _rules = apriori_results_db[db]
_rules

In [None]:
# Print every stored association of one corpus whose string form contains a
# given term.
term = 'cholesterol'
db = 'cardiomyopathy'

for rule in apriori_results_db[db][1]:
    # Plain substring match against the rule's printed representation.
    rule_text = str(rule)
    if term in rule_text:
        print(rule_text)

## BERN (BioBERT-based NER)

In [None]:
# NER : https://github.com/dmis-lab/bern


# Embeddings

In [None]:
import seaborn as sns
from rpca import R_pca
from umap import UMAP
from sklearn.decomposition import FactorAnalysis as FA
from sklearn.decomposition import PCA
from sklearn.manifold import isomap
from random import choices,sample
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc

In [None]:
def collect_wv(key, sample_size=100, atype=1):
    """Stack a random sample of the vectors stored for one corpus.

    The vectors are read from ``token_db[key][atype]``, which is indexed by
    integer document index. Row 0 of the result is always the vector stored
    under index 0. It is followed by ``sample_size`` vectors whose indices are
    drawn *with replacement* (``random.choices``) from ``1 .. len(store) - 1``,
    so the same document can appear more than once.

    Parameters
    ----------
    key : hashable
        Corpus key in the module-level ``token_db``.
    sample_size : int
        Number of random draws appended after row 0.
    atype : int
        Position inside the ``token_db[key]`` tuple to read the vectors from.

    Returns
    -------
    numpy.ndarray
        Result of ``np.vstack`` over ``sample_size + 1`` vectors.

    Notes
    -----
    With ``sample_size > 0`` the store needs at least two entries, because
    ``random.choices`` cannot draw from an empty range.
    The result is always produced by ``np.vstack``, so it is 2-D even for
    ``sample_size=0``.
    """
    store = token_db[key][atype]
    max_count = len(store)
    rnd_choices = choices(range(1, max_count), k=sample_size)

    # Gather the rows first and stack once: calling np.vstack inside the loop
    # re-copies the growing array on every iteration (quadratic in sample_size).
    rows = [store[0]]
    rows.extend(store[sel] for sel in rnd_choices)
    return np.vstack(rows)

In [None]:
def reduce(data, n_dims=2, how=UMAP):
    """Fit a dimensionality reducer on ``data`` and project ``data`` with it.

    ``how`` is called as ``how(n_components=n_dims)``; ``fit`` is called on the
    resulting object and ``transform`` on whatever ``fit`` returns.

    Returns a tuple ``(projected_data, fitted_reducer)``.
    """
    model = how(n_components=n_dims)
    fitted = model.fit(data)
    projected = fitted.transform(data)
    return projected, fitted

In [None]:
# Draw one sample of vectors per corpus; atype=1 selects the second element of
# each token_db entry.
embeddings = dict()
sample_size = 10000
for corpus_name in ('cardiovascular_disease',
                    'rheumatoid_arthritis',
                    'atrial_fibrillation',
                    'myocardial_ischemia'):
    embeddings[corpus_name] = collect_wv(corpus_name, sample_size=sample_size, atype=1)

In [None]:
to_embed = ['cardiovascular_disease', 'rheumatoid_arthritis', 'atrial_fibrillation', 'myocardial_ischemia']

# Pool the corpora into one matrix. collect_wv is called again for every corpus
# (in list order), so these rows are a fresh draw, not the per-corpus samples
# stored under the individual keys of `embeddings`.
embeddings['collected'] = np.vstack([
    collect_wv(corpus_name, sample_size=sample_size, atype=1)
    for corpus_name in to_embed
])

In [None]:
# `reduced` and `dimensions` are first needed in this cell, so they are
# initialised here; without this a Restart & Run All fails with a NameError.
dimensions = 2
reduced = dict()

# Project the pooled sample down to `dimensions` components and keep the result
# as a frame with one 'dim_<i>' column per component.
reduced['collected'] = pd.DataFrame(
    data=reduce(embeddings['collected'], n_dims=dimensions)[0],
    columns=['dim_'+str(i) for i in range(0,dimensions)],
)
# Placeholder label; meant to be overwritten with the source corpus of each row.
reduced['collected']['keyword'] = 'unknown'

In [None]:
# Label every row of the pooled frame with the corpus it was sampled from.
# The pooled matrix is a stack of one equally sized block per corpus, in
# `to_embed` order, and every block holds sample_size + 1 rows (collect_wv adds
# the vector at index 0 in front of the random draws). Slicing in steps of
# `sample_size` therefore drifts by one row per corpus and leaves the last rows
# unlabelled, so the block length is derived from the frame itself.
assert len(reduced['collected']) % len(to_embed) == 0, "corpus blocks are not equally sized"
rows_per_corpus = len(reduced['collected']) // len(to_embed)
keyword_col = reduced['collected'].columns.get_loc('keyword')

for k,v in enumerate(to_embed):
    # Positional .iloc write on the frame itself; the chained form
    # frame['keyword'][a:b] = v may write to a temporary copy.
    reduced['collected'].iloc[k*rows_per_corpus:(k+1)*rows_per_corpus, keyword_col] = v

In [None]:
# Project each per-corpus sample separately (one reducer is fitted per corpus)
# and store the result as a frame with one 'dim_<i>' column per component.
dimensions = 2
for corpus_name in ('cardiovascular_disease',
                    'rheumatoid_arthritis',
                    'atrial_fibrillation',
                    'myocardial_ischemia'):
    reduced[corpus_name] = pd.DataFrame(
        data=reduce(embeddings[corpus_name], n_dims=dimensions)[0],
        columns=['dim_'+str(i) for i in range(0,dimensions)],
    )

In [None]:
# Overlay a random subset of 1500 projected documents per corpus in one figure.
# Each corpus was reduced on its own, so every layer comes from a separately
# fitted reducer.
fig, ax = plt.subplots(figsize=(15,10))
for corpus_name, colour, short_label in (
        ('cardiovascular_disease', 'red', 'CVD'),
        ('rheumatoid_arthritis', 'green', 'RA'),
        ('atrial_fibrillation', 'blue', 'AF'),
        ('myocardial_ischemia', 'orange', 'MI')):
    subset = reduced[corpus_name].sample(1500)
    sns.scatterplot(data=subset, x='dim_0', y='dim_1', color=colour,
                    alpha=0.45, label=short_label)
plt.legend()

In [None]:
# Plot a random subset of 6000 rows of the pooled projection, coloured by the
# 'keyword' column.
fig, ax = plt.subplots(figsize=(15,10))
collected_subset = reduced['collected'].sample(6000)
sns.scatterplot(data=collected_subset, x='dim_0', y='dim_1', hue='keyword', alpha=0.5)
plt.legend()

In [None]:
def find_similar_docs(data, doc_id=None, how='euclidean', n_select=10):
    """Return the row indices of the documents closest to row ``doc_id``.

    Parameters
    ----------
    data : array-like, 2-D
        One row per document.
    doc_id : int
        Row index of the query document; must be a non-negative integer
        (Python ``int`` or NumPy integer).
    how : str or callable
        Distance metric, forwarded to ``scipy.spatial.distance.cdist`` as
        ``metric``.
    n_select : int
        Maximum number of neighbours to return.

    Returns
    -------
    numpy.ndarray
        Up to ``n_select`` row indices into ``data``, nearest first. The query
        row itself is never included.

    Raises
    ------
    AssertionError
        If ``doc_id`` is not a non-negative integer.
    """
    # Imported here so the function does not depend on `scipy.spatial` having
    # been loaded as a side effect of a bare `import scipy`.
    from scipy.spatial.distance import cdist

    assert isinstance(doc_id, (int, np.integer)), "doc_id should be a non-negative integer"
    assert doc_id>=0, "doc_id should be a non-negative integer"

    data = np.asarray(data)
    query = data[doc_id,:].reshape(1,-1)
    others = np.delete(data, doc_id, axis=0)

    # Honour the requested metric; it used to be silently ignored.
    dists = cdist(query, others, metric=how)
    nearest = np.argsort(dists)[0,:n_select]

    # `nearest` indexes `others`, where every row at or after doc_id has moved
    # up by one. Shift those back so the result indexes the original `data`.
    return nearest + (nearest >= doc_id)
    

In [None]:
# Query the pooled sample for the 5 rows closest to row 2.
query_row = 2
sim_docs = find_similar_docs(
    embeddings['collected'],
    doc_id=query_row,
    how='euclidean',
    n_select=5,
)
print(sim_docs)

In [None]:
# TODO: incorporate actual index coming from the sampling
# Flatten all corpora, in dictionary order, into a single list of raw documents.
corpora_all = [doc for docs in corpora.values() for doc in docs]

In [None]:
# Print the raw text stored at each returned index, followed by a divider line.
# NOTE(review): sim_docs holds row numbers of embeddings['collected'], which is
# built from randomly sampled documents, whereas corpora_all is the unsampled
# concatenation of all corpora — the two index spaces differ (see the TODO on
# corpora_all).
divider = "+-"*50
for sim_doc in sim_docs:
    print(corpora_all[sim_doc])
    print(divider)