In [1]:
import os
import csv
import re
from collections import Counter

import pandas as pd
import numpy as np
from IPython.display import display

from opentargets_pharmgkb.evidence_generation import ID_COL_NAME
from opentargets_pharmgkb.pandas_utils import read_tsv_to_df, split_and_explode_column

In [2]:
# Display settings: show every column and the full contents of each cell, and at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None

In [3]:
# Directory containing the TSV files read in the following cells. Set the PHARMGKB_DOE_DATA_DIR
# environment variable to run the notebook on another machine; the original location is the default.
data_dir = os.environ.get('PHARMGKB_DOE_DATA_DIR', '/home/april/projects/opentargets/pharmgkb/doe')

In [4]:
# Load the three variant annotation tables (drug, functional, phenotype) from data_dir.
var_drug_ann, var_fa_ann, var_pheno_ann = (
    read_tsv_to_df(os.path.join(data_dir, tsv_name))
    for tsv_name in ('var_drug_ann.tsv', 'var_fa_ann.tsv', 'var_pheno_ann.tsv')
)

In [5]:
# Load the three clinical annotation tables (annotations, evidence, alleles) from data_dir.
clinical_annotations, clinical_ann_evidence, clinical_ann_alleles = (
    read_tsv_to_df(os.path.join(data_dir, tsv_name))
    for tsv_name in ('clinical_annotations.tsv', 'clinical_ann_evidence.tsv', 'clinical_ann_alleles.tsv')
)

In [8]:
# Methods from last notebook for evaluation (not sure we'll use these)

def compute_variant_annotation_coverage(results):
    """Print the fraction of variant annotations that are covered by at least one clinical annotation genotype.

    Within each clinical annotation's result table, a variant annotation counts as a failure only when it
    appears on rows whose clinical annotation ID is missing and on no row where that ID is present.

    :param results: dict mapping clinical annotation ID to a DataFrame of associations; each frame needs
                    the 'Variant Annotation ID' and ID_COL_NAME columns
    :return: None, the summary line is printed
    """
    total_var_anns = 0
    total_failure = 0
    for associations in results.values():
        var_ann_ids = associations['Variant Annotation ID']
        has_clinical_ann = associations[ID_COL_NAME].notna()
        total_var_anns += var_ann_ids.dropna().nunique()
        matched = set(var_ann_ids[has_clinical_ann].dropna())
        unmatched = set(var_ann_ids[~has_clinical_ann].dropna())
        total_failure += len(unmatched - matched)

    total_success = total_var_anns - total_failure
    # With nothing to count (e.g. empty results) report nan rather than raising ZeroDivisionError
    percentage = total_success / total_var_anns * 100 if total_var_anns else float('nan')
    print(f'Coverage of variant annotations: {total_success}/{total_var_anns} ({percentage:0.2f}%)')


def compute_clinical_annotation_genotype_coverage(results):
    """Print the fraction of clinical annotation genotype/alleles that are covered by at least one variant annotation.

    Within each clinical annotation's result table, a genotype/allele counts as a failure only when it
    appears on rows without a variant annotation ID and on no row that has one.

    :param results: dict mapping clinical annotation ID to a DataFrame of associations; each frame needs
                    the 'Genotype/Allele' and 'Variant Annotation ID' columns
    :return: None, the summary line is printed
    """
    total_genotypes = 0
    total_failure = 0

    for associations in results.values():
        genotypes = associations['Genotype/Allele']
        has_var_ann = associations['Variant Annotation ID'].notna()
        total_genotypes += genotypes.dropna().nunique()
        covered = set(genotypes[has_var_ann].dropna())
        uncovered = set(genotypes[~has_var_ann].dropna())
        total_failure += len(uncovered - covered)

    total_success = total_genotypes - total_failure
    # With nothing to count (e.g. empty results) report nan rather than raising ZeroDivisionError
    percentage = total_success / total_genotypes * 100 if total_genotypes else float('nan')
    print(f'Coverage of clinical annotation genotype/alleles: {total_success}/{total_genotypes} ({percentage:0.2f}%)')


def compute_clinical_annotation_coverage(results):
    """Print the fraction of clinical annotations that are covered by at least one variant annotation.

    Every key of ``results`` counts as one clinical annotation. A clinical annotation ID counts as a
    failure when it appears on rows without a variant annotation ID and on no row that has one.

    :param results: dict mapping clinical annotation ID to a DataFrame of associations; each frame needs
                    the 'Variant Annotation ID' and ID_COL_NAME columns
    :return: None, the summary line is printed
    """
    total_anns = len(results)
    total_failure = 0

    for associations in results.values():
        clinical_ann_ids = associations[ID_COL_NAME]
        has_var_ann = associations['Variant Annotation ID'].notna()
        covered = set(clinical_ann_ids[has_var_ann].dropna())
        uncovered = set(clinical_ann_ids[~has_var_ann].dropna())
        total_failure += len(uncovered - covered)

    total_success = total_anns - total_failure
    # With nothing to count (e.g. empty results) report nan rather than raising ZeroDivisionError
    percentage = total_success / total_anns * 100 if total_anns else float('nan')
    print(f'Coverage of clinical annotations: {total_success}/{total_anns} ({percentage:0.2f}%)')

In [10]:
# Clinical annotation IDs of the examples taken from the spreadsheet
example_ca_ids = [
    '981755803',
    '1139506787',
    '1183888969',
    '1184514050',
    '981419266',
]

In [114]:
# Columns that carry the same name in all three variant annotation tables
VAR_ID_COL_NAME = 'Variant Annotation ID'
ASSOC_COL_NAME = 'Is/Is Not associated'
DOE_COL_NAME = 'Direction of effect'
COMPARISON_COL_NAME = 'Comparison Allele(s) or Genotype(s)'

# Harmonised names given to the table-specific effect/object columns when the tables are merged
EFFECT_COL_NAME = 'effect_term'
OBJECT_COL_NAME = 'object_term'

### Column selection

Example of how to select the relevant columns from each of the variant annotation tables. Note that this selection mostly drops contextual information, while retaining the direction and the definition of the effect.

In [15]:
# Drug annotations: the effect is described by 'PD/PK terms', its object by 'Drug(s)'
var_drug_ann[[
    'Variant Annotation ID', 'PMID', 'Sentence', 'Alleles', 'Is/Is Not associated',
    'Direction of effect', 'PD/PK terms', 'Drug(s)',
    'Comparison Allele(s) or Genotype(s)'
]]

Unnamed: 0,Variant Annotation ID,PMID,Sentence,Alleles,Is/Is Not associated,Direction of effect,PD/PK terms,Drug(s),Comparison Allele(s) or Genotype(s)
0,1451834452,15634941,CYP3A4 *17 is associated with decreased metabolism of nifedipine as compared to CYP3A4 *1.,*17,Associated with,decreased,metabolism of,nifedipine,*1
1,1451159680,22248286,Allele G is not associated with increased dose of warfarin in people with an international normalized ratio (INR) of 2.0-3.0 as compared to allele A.,G,Not associated with,increased,dose of,warfarin,A
2,1451306860,33350885,CYP2C9 *11/*11 is associated with decreased dose of warfarin.,*11/*11,Associated with,decreased,dose of,warfarin,
3,1448997750,16495778,CYP2B6 *1/*18 is associated with increased concentrations of efavirenz in people with HIV Infections as compared to CYP2B6 *1/*1.,*1/*18,Associated with,increased,concentrations of,efavirenz,*1/*1
4,1448631821,28470111,CYP2C19 *1/*2 is associated with increased trough concentration of clomipramine and desmethyl clomipramine.,*1/*2,Associated with,increased,trough concentration of,"clomipramine, desmethyl clomipramine",
...,...,...,...,...,...,...,...,...,...
11896,1452459580,38647526,NAT2 intermediate acetylator and rapid acetylator is associated with increased clearance of isoniazid in people with Tuberculosis as compared to NAT2 slow acetylator.,,Associated with,increased,clearance of,isoniazid,
11897,1452460020,38674283,Genotypes CC + CT is associated with increased resistance to antiepileptics in children with Epilepsy as compared to genotype TT.,CC + CT,Associated with,increased,resistance to,antiepileptics,TT
11898,1452460686,38661209,Genotypes AA + AG is associated with increased clearance of rifapentine as compared to genotype GG.,AA + AG,Associated with,increased,clearance of,rifapentine,GG
11899,1452460760,38669362,"Genotype GG is associated with increased dose of sufentanil in children with Pain, Postoperative as compared to genotypes AA + AG.",GG,Associated with,increased,dose of,sufentanil,AA + AG


In [17]:
# Phenotype annotations: the effect is described by 'Side effect/efficacy/other', its object by 'Phenotype'
var_pheno_ann[[
    'Variant Annotation ID', 'PMID', 'Sentence', 'Alleles', 'Is/Is Not associated',
    'Direction of effect', 'Side effect/efficacy/other', 'Phenotype',
    'Comparison Allele(s) or Genotype(s)'
]]

Unnamed: 0,Variant Annotation ID,PMID,Sentence,Alleles,Is/Is Not associated,Direction of effect,Side effect/efficacy/other,Phenotype,Comparison Allele(s) or Genotype(s)
0,1449169911,29238301,"HLA-B *35:08 is not associated with likelihood of Maculopapular Exanthema, severe cutaneous adverse reactions or Stevens-Johnson Syndrome when treated with lamotrigine in people with Epilepsy.",*35:08,Not associated with,,likelihood of,"Side Effect:Maculopapular Exanthema, Side Effect:severe cutaneous adverse reactions, Side Effect:Stevens-Johnson Syndrome",
1,982022165,22850190,Allele T is not associated with increased risk of Hypersensitivity when treated with sulfamethoxazole / trimethoprim in people with Infection.,T,Not associated with,increased,risk of,Disease:Hypersensitivity,
2,982022148,22850190,Allele A is not associated with increased risk of Hypersensitivity when treated with sulfamethoxazole / trimethoprim in people with Infection.,A,Not associated with,increased,risk of,Disease:Hypersensitivity,
3,1451283480,22071378,Allele A is associated with increased severity of Tobacco Use Disorder in people with Tobacco Use Disorder.,A,Associated with,increased,severity of,Other:Tobacco Use Disorder,
4,1444696916,7689389,Allele G is associated with Ototoxicity when treated with streptomycin as compared to allele A.,G,Associated with,,,Side Effect:Ototoxicity,A
...,...,...,...,...,...,...,...,...,...
13512,1452460100,38673067,"Allele C is associated with increased likelihood of primary graft failure or Kidney Tubular Necrosis, Acute when treated with mycophenolic acid and tacrolimus in people with Kidney Transplantation as compared to allele T.",C,Associated with,increased,likelihood of,"""Efficacy:primary graft failure"", ""Efficacy:Kidney Tubular Necrosis, Acute""",T
13513,1452460774,38669362,"Genotype AA is associated with increased severity of Pain, Postoperative when treated with sufentanil in children with Pain, Postoperative as compared to genotypes AG + GG.",AA,Associated with,increased,severity of,"Efficacy:Pain, Postoperative",AG + GG
13514,1452460767,38669362,"Genotype GG is associated with increased severity of Pain, Postoperative when treated with sufentanil in children with Pain, Postoperative as compared to genotypes AA + AG.",GG,Associated with,increased,severity of,"Efficacy:Pain, Postoperative",AA + AG
13515,1452460920,38674407,"Genotypes AA + AG is associated with increased risk of treatment related mortality when treated with carboplatin, etoposide and ifosfamide in children with Central Nervous System Neoplasms as compared to genotype GG.",AA + AG,Associated with,increased,risk of,Efficacy:treatment related mortality,GG


In [19]:
# Functional annotations: the effect is described by 'Functional terms', its object by 'Gene/gene product'
var_fa_ann[[
    'Variant Annotation ID', 'PMID', 'Sentence', 'Alleles', 'Is/Is Not associated',
    'Direction of effect', 'Functional terms', 'Gene/gene product',
    'Comparison Allele(s) or Genotype(s)'
]]

Unnamed: 0,Variant Annotation ID,PMID,Sentence,Alleles,Is/Is Not associated,Direction of effect,Functional terms,Gene/gene product,Comparison Allele(s) or Genotype(s)
0,1451148445,30902024,CYP2C19 *17/*17 is associated with increased formation of normeperidine as compared to CYP2C19 *1/*1 + *1/*17.,*17/*17,Associated with,increased,formation of,,*1/*1 + *1/*17
1,1447814273,26847243,Allele T is not associated with transcription of VKORC1 in HepG2 cells as compared to allele C.,T,Not associated with,,transcription of,VKORC1,C
2,1447814277,26847243,Allele C is associated with increased transcription of VKORC1 in HepG2 cells as compared to allele T.,C,Associated with,increased,transcription of,VKORC1,T
3,1447990384,2211621,Allele A is associated with decreased activity of CYP2D6 when assayed with bufuralol in COS-1 cells as compared to allele G.,A,Associated with,decreased,activity of,CYP2D6,G
4,1448281185,27439448,CYP2B6 *1/*1 is associated with increased activity of CYP2B6 when assayed with bupropion as compared to CYP2B6 *1/*6.,*1/*1,Associated with,increased,activity of,CYP2B6,*1/*6
...,...,...,...,...,...,...,...,...,...
2004,1452444774,38596098,Allele A is associated with decreased catalytic activity of CYP2B6 when assayed with efavirenz by recombinant protein expressed in Sf21 cells as compared to allele G.,A,Associated with,decreased,catalytic activity of,CYP2B6,G
2005,1452444804,38596098,Allele A is associated with decreased catalytic activity of CYP2B6 when assayed with efavirenz by recombinant protein expressed in Sf21 cells as compared to allele G.,A,Associated with,decreased,catalytic activity of,CYP2B6,G
2006,1452444900,38596098,CYP2B6 *47 is associated with decreased catalytic activity of CYP2B6 when assayed with efavirenz by recombinant protein expressed in Sf21 cells.,*47,Associated with,decreased,catalytic activity of,CYP2B6,
2007,1452444811,38596098,Allele A is associated with decreased catalytic activity of CYP2B6 when assayed with efavirenz by recombinant protein expressed in Sf21 cells as compared to allele G.,A,Associated with,decreased,catalytic activity of,CYP2B6,G


In [53]:
def merge_variant_annotation_tables():
    """Stack the drug, phenotype and functional variant annotation tables into one DataFrame.

    Every table contributes the same shared columns plus its own effect/object column pair, which is
    renamed to EFFECT_COL_NAME / OBJECT_COL_NAME so that the three selections line up.

    :return: concatenation of the three selections in drug, phenotype, functional order; the row labels
             of the source tables are kept as they are
    """
    leading_columns = ['Variant Annotation ID', 'PMID', 'Sentence', 'Alleles', 'Is/Is Not associated',
                       'Direction of effect']
    trailing_columns = ['Comparison Allele(s) or Genotype(s)']
    # (source table, table-specific effect column, table-specific object column)
    table_specs = (
        (var_drug_ann, 'PD/PK terms', 'Drug(s)'),
        (var_pheno_ann, 'Side effect/efficacy/other', 'Phenotype'),
        (var_fa_ann, 'Functional terms', 'Gene/gene product'),
    )
    # Select the relevant columns, then rename the differing ones so the frames can be concatenated
    harmonised_tables = [
        table[leading_columns + [effect_col, object_col] + trailing_columns].rename(
            columns={effect_col: EFFECT_COL_NAME, object_col: OBJECT_COL_NAME}
        )
        for table, effect_col, object_col in table_specs
    ]
    return pd.concat(harmonised_tables)

In [56]:
def get_evidence_ids(caids=None):
    """Map clinical annotation IDs to the 'Evidence ID' values listed for them in clinical_ann_evidence.

    :param caids: iterable of clinical annotation IDs (list, pandas Series, numpy array, ...); when None
                  or empty, every ID found in clinical_ann_evidence is used
    :return: dict of clinical annotation ID -> list of evidence IDs in table order; requested IDs that
             do not occur in clinical_ann_evidence map to an empty list
    """
    # Materialise first: truth-testing a Series or ndarray directly raises ValueError
    requested = list(caids) if caids is not None else []
    if not requested:
        # IDs can repeat in the evidence table, so only look each distinct one up once
        requested = clinical_ann_evidence[ID_COL_NAME].unique()
    # Group the evidence table once instead of re-filtering the whole table for every requested ID
    evidence_by_caid = (
        clinical_ann_evidence.groupby(ID_COL_NAME, sort=False)['Evidence ID'].agg(list).to_dict()
    )
    caid_to_vaid = {caid: evidence_by_caid.get(caid, []) for caid in requested}
    return caid_to_vaid
    

def extended_parse_genotype(genotype_string):
    """
    Parse PGKB string representations of genotypes into alleles. Extended to include star alleles.
    """
    alleles = [genotype_string]
    
    # SNPs
    if len(genotype_string) == 2 and '*' not in genotype_string:
        alleles = [genotype_string[0], genotype_string[1]]

    # others
    m = re.match('([^/]+)/([^/]+)', genotype_string, re.IGNORECASE)
    if m:
        alleles = [m.group(1), m.group(2)]

    return alleles

In [100]:
def get_associations(annotation_df, clinical_alleles_df):
    """Main association method: match clinical annotation genotypes/alleles to variant annotation alleles.

    :param annotation_df: variant annotations; needs the 'Alleles' and VAR_ID_COL_NAME columns
    :param clinical_alleles_df: clinical annotation alleles; must have exactly three columns with
                                'Genotype/Allele' as the second one (the rows are unpacked positionally below)
    :return: DataFrame of merged clinical/variant annotation rows. Genotypes without any matching variant
             annotation are kept with NaN in the annotation columns, and rows for variant annotations
             without any matching genotype are appended from the outer merge.
    """
    # NOTE(review): split_and_explode_column is a project helper; presumably it splits the given column
    # on `sep` into the new column and emits one row per piece - confirm against pandas_utils.
    # Split on + (raw string: '\+' in a plain literal is an invalid escape sequence)
    split_ann_df = split_and_explode_column(annotation_df, 'Alleles', 'split_alleles_1', sep=r'\+')
    # Split on /
    split_ann_df = split_and_explode_column(split_ann_df, 'split_alleles_1', 'split_alleles_2', sep='/')
    # Get alleles from clinical annotations - same logic as for getting ids
    split_clin_df = clinical_alleles_df.assign(parsed_genotype=clinical_alleles_df['Genotype/Allele'].apply(extended_parse_genotype))
    split_clin_df = split_clin_df.explode('parsed_genotype').reset_index(drop=True)

    # Match by +-split and /-split
    merged_df = pd.merge(split_clin_df, split_ann_df, how='outer', left_on='Genotype/Allele', right_on='split_alleles_1')
    merged_df_2 = pd.merge(split_clin_df, split_ann_df, how='outer', left_on='parsed_genotype', right_on='split_alleles_2')

    # If a genotype in a clinical annotation doesn't have evidence, want this listed with nan's
    all_results = []
    # Positional unpacking: (first column, 'Genotype/Allele', third column, 'parsed_genotype')
    for _, genotype, _, parsed_genotype in split_clin_df.itertuples(index=False):
        # Rows that matched on genotype
        rows_first_match = merged_df[(merged_df['Genotype/Allele'] == genotype) & (merged_df['parsed_genotype'] == parsed_genotype) & (~merged_df['Variant Annotation ID'].isna())]
        # Rows that matched on parsed genotype
        rows_second_match = merged_df_2[(merged_df_2['Genotype/Allele'] == genotype) & (merged_df_2['parsed_genotype'] == parsed_genotype) & (~merged_df_2['Variant Annotation ID'].isna())]
        
        # If neither matches, add with nan's
        if rows_first_match.empty and rows_second_match.empty:
            all_results.append(merged_df[(merged_df['Genotype/Allele'] == genotype) & (merged_df['parsed_genotype'] == parsed_genotype)])
        else:
            all_results.extend([rows_first_match, rows_second_match])

    final_result = pd.concat(all_results).drop_duplicates()

    # If _no_ part of a variant annotation is associated with any clinical annotation, want this listed with nan's
    # NOTE(review): the equality filters below never match missing values, so an unmatched annotation
    # whose 'Alleles' (or split columns) are NaN contributes no rows here - confirm this is intended.
    for _, row in split_ann_df.iterrows():
        vaid = row[VAR_ID_COL_NAME]
        alleles = row['Alleles']
        split_1 = row['split_alleles_1']
        split_2 = row['split_alleles_2']
        # final_result grows inside this loop, so this check also sees rows appended by earlier iterations
        results_with_vaid = final_result[final_result[VAR_ID_COL_NAME] == vaid]
        if results_with_vaid.empty:
            final_result = pd.concat((final_result,
                                      merged_df[(merged_df['Variant Annotation ID'] == vaid) & (merged_df['Alleles'] == alleles) &
                                                (merged_df['split_alleles_1'] == split_1) & (merged_df['split_alleles_2'] == split_2)]
                                     ))
    return final_result

In [101]:
def get_results(caid_to_vaid):
    """Main method for getting resulting associations.

    :param caid_to_vaid: dict mapping each clinical annotation ID to the IDs of its evidence
    :return: dict mapping each clinical annotation ID to the DataFrame produced by get_associations
    """
    all_variant_annotations = merge_variant_annotation_tables()
    clinical_columns = [ID_COL_NAME, 'Genotype/Allele', 'Annotation Text']

    associations_by_caid = {}
    for caid, vaids in caid_to_vaid.items():
        # Genotype/allele rows belonging to this clinical annotation
        belongs_to_caid = clinical_ann_alleles[ID_COL_NAME] == caid
        alleles_for_caid = clinical_ann_alleles[belongs_to_caid][clinical_columns]
        # Variant annotations listed as evidence for this clinical annotation
        is_listed_evidence = all_variant_annotations[VAR_ID_COL_NAME].isin(vaids)
        associations_by_caid[caid] = get_associations(all_variant_annotations[is_listed_evidence], alleles_for_caid)

    return associations_by_caid

In [102]:
# Associations for the spreadsheet examples only, keyed by clinical annotation ID
example_results = get_results(get_evidence_ids(example_ca_ids))

In [111]:
# Sanity check for the altered implementation: print the three coverage summaries for the examples
compute_variant_annotation_coverage(example_results)
compute_clinical_annotation_genotype_coverage(example_results)
compute_clinical_annotation_coverage(example_results)

Coverage of variant annotations: 113/116 (97.41%)
Coverage of clinical annotation genotype/alleles: 35/37 (94.59%)
Coverage of clinical annotations: 5/5 (100.00%)


In [119]:
# Called without IDs, get_evidence_ids falls back to every clinical annotation ID in clinical_ann_evidence
full_results = get_results(get_evidence_ids())

In [120]:
# The same three coverage summaries, now over the full result set
compute_variant_annotation_coverage(full_results)
compute_clinical_annotation_genotype_coverage(full_results)
compute_clinical_annotation_coverage(full_results)

Coverage of variant annotations: 13380/14280 (93.70%)
Coverage of clinical annotation genotype/alleles: 10258/15755 (65.11%)
Coverage of clinical annotations: 5078/5111 (99.35%)


### Next steps

* Check coverage of comparison allele among all variant annotations (combined table)
* Group associations by clinical annotation genotype, check for ones with positive association and check direction

In [112]:
# All variant annotations (drug, phenotype and functional tables) in a single frame
combined_table = merge_variant_annotation_tables()

In [116]:
# Fraction of variant annotations with a comparison allele/genotype (count() skips missing values)
num_compare_alleles = combined_table[COMPARISON_COL_NAME].count()
num_variant_annot = len(combined_table)
# There's actually a pretty high coverage of comparison alleles, so maybe we can use these
print(num_compare_alleles/num_variant_annot)

0.8455901119334962


In [117]:
# Fraction of variant annotations with a non-missing 'Alleles' value
num_alleles = combined_table['Alleles'].count()
num_alleles / num_variant_annot

0.9708681226528603

In [118]:
# Note: this is larger than the variant annotation total reported in the coverage results above,
# so some variant annotations are not (currently) associated with any clinical annotation
num_variant_annot

27427

In [135]:
# Turn on pandas copy-on-write mode for the rest of the session
pd.options.mode.copy_on_write = True

In [136]:
# For every example clinical annotation, show the distinct "direction + effect + object" summaries
# reported per genotype/allele, restricted to rows with a positive association.
for caid in example_ca_ids:
    print(caid)
    ex = example_results[caid]
    filtered_summary = ex[ex[ASSOC_COL_NAME] == 'Associated with']
    # assign() returns a new frame instead of writing a column onto the filtered slice, so this cell
    # behaves the same whether or not pandas copy-on-write mode is enabled.
    # astype(str) renders missing values as the literal text 'nan'.
    filtered_summary = filtered_summary.assign(
        summary=filtered_summary[DOE_COL_NAME].astype(str) + ' '
                + filtered_summary[EFFECT_COL_NAME].astype(str) + ' '
                + filtered_summary[OBJECT_COL_NAME].astype(str)
    )
    # Only the summary column is displayed, so only that column is aggregated
    display(filtered_summary.groupby(by='Genotype/Allele')[['summary']].aggregate(set))

981755803


Unnamed: 0_level_0,summary
Genotype/Allele,Unnamed: 1_level_1
AA,"{nan response to ivacaftor, decreased severity of Side Effect:bone density, nan activity of CFTR, increased response to ivacaftor, increased activity of CFTR, decreased likelihood of Disease:cystic fibrosis pulmonary exacerbation}"
AG,"{nan response to ivacaftor, decreased severity of Side Effect:bone density, nan activity of CFTR, increased response to ivacaftor, increased activity of CFTR, decreased likelihood of Disease:cystic fibrosis pulmonary exacerbation}"


1139506787


Unnamed: 0_level_0,summary
Genotype/Allele,Unnamed: 1_level_1
*1,"{decreased metabolism of nicotine, decreased metabolism of coumarin, nicotine, increased clearance of nicotine, increased metabolism of nicotine, decreased concentrations of 3-hydroxycotinine, decreased clearance of nicotine}"
*10,"{decreased metabolism of nicotine, decreased metabolism of coumarin, nicotine, decreased nan PK:plasma and urinary ratio of metabolites trans-3'-hydroxycotinine to cotinine (3HC/COT)}"
*11,{decreased metabolism of nicotine}
*12,"{decreased metabolism of nicotine, decreased clearance of nicotine, decreased clearance of cotinine, nicotine, decreased concentrations of 3-hydroxycotinine}"
*13,{decreased metabolism of nicotine}
*14,{increased metabolism of nicotine}
*15,{decreased metabolism of nicotine}
*17,"{decreased metabolism of nicotine, decreased catalytic activity of CYP2A6, decreased nan PK:metabolism}"
*19,"{decreased metabolism of nicotine, decreased catalytic activity of CYP2A6}"
*1x2,{increased metabolism of nicotine}


1183888969


Unnamed: 0_level_0,summary
Genotype/Allele,Unnamed: 1_level_1
CC,"{increased nan Efficacy:sustained virological response (svr), increased response to peginterferon alfa-2b, ribavirin, telaprevir, increased response to peginterferon alfa-2a, peginterferon alfa-2b, ribavirin, telaprevir, increased response to boceprevir, peginterferon alfa-2a, peginterferon alfa-2b, ribavirin, telaprevir}"
CT,"{increased response to boceprevir, peginterferon alfa-2a, peginterferon alfa-2b, ribavirin, telaprevir}"


1184514050


Unnamed: 0_level_0,summary
Genotype/Allele,Unnamed: 1_level_1
CC,{increased risk of Side Effect:Leukopenia}
CT,"{increased risk of Disease:Leukopenia, increased risk of Side Effect:Leukopenia, increased likelihood of Side Effect:Alopecia, Side Effect:Digestive system anomalies NOS, Side Effect:Leukopenia, increased risk of Side Effect:Leukopenia, Side Effect:Neutropenia, increased likelihood of Side Effect:Neutropenia, nan nan Side Effect:Alopecia, Side Effect:Leukopenia, increased likelihood of Side Effect:Leukopenia, increased risk of Side Effect:Pancytopenia, Side Effect:Thrombocytopenia}"
TT,"{increased risk of Disease:Leukopenia, nan nan Side Effect:Neutropenia, increased risk of Side Effect:Leukopenia, increased severity of Disease:Leukopenia, nan nan Side Effect:Agranulocytosis, Side Effect:Alopecia, Side Effect:febrile neutropenia, Side Effect:Leukopenia, Side Effect:Neutropenia, Side Effect:Thrombocytopenia, Side Effect:Toxic liver disease, increased likelihood of Side Effect:Alopecia, Side Effect:Digestive system anomalies NOS, Side Effect:Leukopenia, increased risk of Side Effect:Leukopenia, Side Effect:Neutropenia, increased likelihood of Side Effect:Neutropenia, nan nan Side Effect:Infection, Side Effect:Pancytopenia, nan nan Side Effect:Leukopenia, nan nan Side Effect:Alopecia, Side Effect:Leukopenia, increased likelihood of Side Effect:Leukopenia, increased risk of Side Effect:Pancytopenia, Side Effect:Thrombocytopenia}"


981419266


Unnamed: 0_level_0,summary
Genotype/Allele,Unnamed: 1_level_1
*15:02,"{nan nan Disease:Drug Hypersensitivity, nan risk of ""Side Effect:Epidermal Necrolysis, Toxic"", ""Side Effect:Stevens-Johnson Syndrome"", increased risk of Side Effect:severe cutaneous adverse reactions, increased risk of Side Effect:Stevens-Johnson Syndrome, increased risk of Disease:Stevens-Johnson Syndrome, increased risk of Side Effect:Epidermal Necrolysis, Toxic or Stevens-Johnson Syndrome, increased risk of ""Side Effect:Epidermal Necrolysis, Toxic"", ""Side Effect:Stevens-Johnson Syndrome"", increased risk of ""Disease:Epidermal Necrolysis, Toxic"", ""Disease:Stevens-Johnson Syndrome""}"


* For each genotype there are several different types of effects being reported
* In many but not all cases, there are no contradictions for a single effect type and single genotype
    * Note for 1139506787, the apparent contradiction is for `*1` which is the reference
        * I'd guess this is from "unwinding" variant annotations on ref/alt genotypes that are really "about" the alt rather than the ref
        * Should be able to check this by checking comparison alleles, if present - if it's compared to ref/ref then it's not saying anything about ref alone
* Ways to improve this
    * Use the "or/and" column to combine drugs etc. appropriately
    * Obviously deal with the `nan`s - depends how we should report to OT (i.e. the column breakdown vs. the text)

Next question: How do we "evaluate" this on the entire dataset?