In [1]:
import scispacy
import spacy
import re
import pandas as pd 
import numpy as np
from collections import defaultdict
from tqdm import tqdm

In [2]:
# Candidate scispaCy models:
#   en_ner_jnlpba_md, en_ner_bc5cdr_md, en_ner_bionlp13cg_md, en_ner_craft_md, en_core_sci_lg
# The tagger and parser pipeline components are switched off at load time.
# NOTE(review): token lemmas are read in a later cell -- confirm they are still
# populated as expected with the tagger disabled.
disabled_components = ['tagger', 'parser']
nlp = spacy.load("en_ner_bionlp13cg_md", disable=disabled_components)

In [3]:
# Load the raw abstracts file into a DataFrame; later cells read its `text` column.
# `squeeze=True` was dropped: the keyword was deprecated in pandas 1.4 and removed
# in pandas 2.0 (TypeError), and it only ever applied to single-column results,
# whereas the result here is used as a DataFrame (`abstracts.text`).
# NOTE(review): read_fwf infers fixed-width column boundaries from the first rows
# of the file -- confirm that long abstracts are not being split or truncated.
abstracts = pd.read_fwf("abstract_cardiovascular_disease.xml.abstract.raw", engine="python")

## Per abstract

In [9]:
# Entity labels whose tokens are counted; a set gives O(1) membership checks and
# is built once instead of once per token.
KEPT_ENTITY_TYPES = {'AMINO_ACID', 'GENE_OR_GENE_PRODUCT', 'SIMPLE_CHEMICAL', 'CELL', 'CELLULAR_COMPONENT'}

# token_dict[abstract position][lemma] -> number of tokens with that lemma that
# carry one of the kept entity labels. Abstracts without any such token get no entry.
token_dict = defaultdict(lambda: defaultdict(int))
# Every entity label seen on any token (including '' for tokens outside entities).
token_types = set()

# tqdm wraps the document stream itself (not enumerate) and is given an explicit
# total, because nlp.pipe() yields a generator with no length; this lets the
# progress bar show percentage and ETA instead of a bare iteration counter.
for idx, _doc in enumerate(tqdm(nlp.pipe(abstracts.text), total=len(abstracts))):
    for _token in _doc:
        etype = _token.ent_type_
        token_types.add(etype)
        if etype in KEPT_ENTITY_TYPES:
            # NOTE(review): counting is per token, so punctuation and fragments of
            # multi-word entities inside an entity span are counted as items too.
            token_dict[idx][_token.lemma_] += 1

144701it [13:30, 178.54it/s]


In [22]:
# One transaction per abstract: the tuple of distinct lemmas counted for it
# (the counts themselves are discarded; order follows insertion order).
ent_tuples = [tuple(lemma_counts) for lemma_counts in token_dict.values()]

In [24]:
from efficient_apriori import apriori

In [33]:
# Mine frequent itemsets and association rules from the per-abstract lemma tuples.
#   min_support=0.005   -> keep itemsets present in at least 0.5% of the transactions
#   min_confidence=0.5  -> keep rules with confidence of at least 50%
itemsets, rules = apriori(
    ent_tuples,
    min_support=0.005,
    min_confidence=0.5,
)

In [34]:
# Inspect the mined itemsets: as the output shows, this is a dict keyed by itemset
# size whose values map item tuples to their counts.
itemsets

{1: {('calcium',): 1560,
  ('T',): 758,
  ('triglyceride',): 4632,
  ('lipid',): 6042,
  ('glucose',): 5206,
  ('cholesterol',): 9929,
  ('TG',): 949,
  ('TC',): 709,
  ('C',): 1096,
  ('-',): 3196,
  ('B',): 1422,
  ('oxygen',): 1305,
  ('alcohol',): 2530,
  ('factor',): 3158,
  ('hemoglobin',): 980,
  ('acid',): 4625,
  ('estrogen',): 1374,
  ('e.g.',): 565,
  ('beta',): 543,
  ('beta-blocker',): 510,
  ('creatinine',): 820,
  ('LDL',): 3143,
  ('HDL',): 3133,
  ('cell',): 7229,
  ('lipoprotein',): 5039,
  ('apolipoprotein',): 1440,
  ('HDL-cholesterol',): 743,
  ('A',): 862,
  ('fibrinogen',): 1385,
  ('high-density',): 1871,
  ('uric',): 569,
  ('/-',): 2111,
  ('DM',): 779,
  ('Type',): 835,
  ('platelet',): 1904,
  ('plasma',): 664,
  (')',): 3463,
  ('(',): 2379,
  ('smooth',): 996,
  ('receptor',): 3112,
  ('D',): 1160,
  ('vitamin',): 2712,
  ('insulin',): 6687,
  ('synthase',): 754,
  ('BP',): 2680,
  ('mitochondrial',): 503,
  ('fatty',): 2083,
  ('aspirin',): 1571,
  ('card