# Using flair

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

import torch

from flair.data import Sentence
from flair.embeddings import (
    DocumentPoolEmbeddings,
    FlairEmbeddings, WordEmbeddings,
    ELMoEmbeddings, BertEmbeddings
)

# Test on pubmed data

In [4]:
embeddings = DocumentPoolEmbeddings([
    FlairEmbeddings("pubmed-forward"),
    FlairEmbeddings("pubmed-backward"),
    ELMoEmbeddings("pubmed"),
    BertEmbeddings("bert-large-uncased"),
])

In [5]:
pubmed_data = pd.concat([
    pd.read_json(f"data/medline/{medline_file}") for medline_file in
    ["medline_2016.json", "medline_2017.json", "medline_2018.json"]])

pubmed_data.head()

Unnamed: 0,abstract,doi,pmid,timestamp,title,type,year
0,OBJECTIVE: To investigate whether there is a a...,,19254455,2019-03-06 11:25:42.344605,[Association between RsaI and AluI polymorphis...,['Journal Article'],2016
1,OBJECTIVE: To determine the cardiac and renal ...,,19102882,2019-03-06 11:25:28.159770,[Cardiac and renal arteriolar pathological cha...,['Journal Article'],2016
2,OBJECTIVE: To observe field potentials (FPs) a...,,19102898,2019-03-06 11:25:28.161049,[Cardiac field potentials and activation seque...,['Journal Article'],2016
3,OBJECTIVE: To study the antitumour effects of ...,,21223742,2019-03-06 11:27:31.019968,[Investigation on the mechanisms of p15INK4B g...,['Journal Article'],2016
4,<AbstractText>The preparation of affinity memb...,10.1163/092050610X538731,21092422,2019-03-06 11:27:26.740139,Performance of protein-A-based affinity membra...,['Journal Article'],2016


In [6]:
pubmed_data.title.shape

(253308,)

In [7]:
print(pubmed_data.title.values[0])

[Association between RsaI and AluI polymorphism in the estrogen receptor beta gene and primary hepatocellular carcinoma.].


In [8]:
def preproc(text, limit=512):
    # NOTE: character `limit` is required by BERT
    return (
        text.replace("[", "")
            .replace("]", "")
            .lower()[:limit]
    )

print(preproc(pubmed_data.title.values[0]))

association between rsai and alui polymorphism in the estrogen receptor beta gene and primary hepatocellular carcinoma..


In [9]:
pubmed_corpus = [
    Sentence(text) for text in
    pubmed_data.title.apply(preproc).head(10_000)
]

In [10]:
query = [
    Sentence(text) for text in
    [
        "Searching for the causal effects of body mass index in over 300 000 participants in UK Biobank, using Mendelian randomization.",
        "Prioritizing putative influential genes in cardiovascular disease susceptibility by applying tissue-specific Mendelian randomization.",
        "Longitudinal analysis strategies for modelling epigenetic trajectories",
        "FATHMM-XF: accurate prediction of pathogenic point mutations via extended features",
        "PhenoSpD: an integrated toolkit for phenotypic correlation estimation and multiple testing correction using GWAS summary statistics.",
        "LD Hub: a centralized database and web interface to perform LD score regression that maximizes the potential of summary level GWAS data for SNP heritability and genetic correlation analysis.",
        "MELODI: Mining Enriched Literature Objects to Derive Intermediates",
        "The MR-Base platform supports systematic causal inference across the human phenome",
    ]
]

In [11]:
for text in query + pubmed_corpus:
    embeddings.embed(text)

In [12]:
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

In [13]:
cos_scores = []
for query_id, query_text in enumerate(query):
    cos_res = [
        {
            "query_id": query_id,
            "target_id": target_id,
            "score": cos(query_text.embedding, 
                         target_text.embedding).item()
        }
        for target_id, target_text in enumerate(pubmed_corpus)
    ]
    cos_scores.append(cos_res)
    
cos_scores = pd.concat(pd.DataFrame(x) for x in cos_scores)

In [14]:
cos_scores.head()

Unnamed: 0,query_id,score,target_id
0,0,0.646998,0
1,0,0.640018,1
2,0,0.652958,2
3,0,0.67897,3
4,0,0.632604,4


In [16]:
n = 5

for query_id, query_text in enumerate(query):
    print(f"# Query {query_id}")
    print(query_text)
    
    top_n = (
        cos_scores
          .query(f"query_id == {query_id}")
          .sort_values("score", ascending=False)
          .head(n)
    )
    for target_id, target_score in zip(top_n.target_id, top_n.score):
        print(f"  ## Candidate {target_id}, score {target_score}")
        print(f"  {pubmed_corpus[target_id]}\n")
    print("\n\n")

# Query 0
Sentence: "Searching for the causal effects of body mass index in over 300 000 participants in UK Biobank, using Mendelian randomization." - 20 Tokens
  ## Candidate 1835, score 0.8232989311218262
  Sentence: "review of statistical methodologies for the detection of parent-of-origin effects in family trio genome-wide association data with binary disease traits." - 20 Tokens

  ## Candidate 6101, score 0.8221418857574463
  Sentence: "the association between lower educational attainment and depression owing to shared genetic effects? results in ~25,000 subjects." - 17 Tokens

  ## Candidate 3903, score 0.8055606484413147
  Sentence: "network mendelian randomization: using genetic variants as instrumental variables to investigate mediation in causal pathways." - 15 Tokens

  ## Candidate 7759, score 0.802967369556427
  Sentence: "susceptibility to male infertility: replication study in japanese men looking for an association with four gwas-derived loci identified in european men