In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import glob
import seaborn as sns

In [12]:
# Loci covered by the model. The order matters: other cells use a locus's index
# in this list as its column index in the coordinate matrix.
locus_list = [
    "acpM-kasA", "gid", "rpsA", "clpC", "embCAB", "aftB-ubiA",
    "rrs-rrl", "ethAR", "oxyR-ahpC", "tlyA", "KatG", "rpsL",
    "rpoBC", "FabG1-inhA", "eis", "gyrBA", "panD", "pncA",
]

# Loci associated with each drug.
# CIPROFLOXACIN (locus gyrBA) is currently excluded from the analysis.
drug_to_loci = {
    "ETHIONAMIDE": ["FabG1-inhA", "ethAR"],
    "STREPTOMYCIN": ["gid", "rrs-rrl", "rpsL"],
    "CAPREOMYCIN": ["tlyA", "rrs-rrl"],
    "RIFAMPICIN": ["rpoBC"],
    "PYRAZINAMIDE": ["pncA", "panD", "clpC", "rpsA"],
    "AMIKACIN": ["eis", "rrs-rrl"],
    "KANAMYCIN": ["eis", "rrs-rrl"],
    "ETHAMBUTOL": ["embCAB", "aftB-ubiA"],
    "OFLOXACIN": ["gyrBA"],
    "MOXIFLOXACIN": ["gyrBA"],
    "LEVOFLOXACIN": ["gyrBA"],
    "ISONIAZID": ["acpM-kasA", "oxyR-ahpC", "KatG", "FabG1-inhA"],
}

# Drugs analysed, in the insertion order of the mapping above.
drug_list = list(drug_to_loci)

In [13]:
data_path = "../output_data/"

# Load the H37Rv coordinate of every position seen by the model
coords = np.load(f"{data_path}/X_matrix_H37RV_coords.npy")

# A zero entry is padding rather than a real coordinate, so blank it out with NaN
coords[coords == 0] = np.nan

# Count the non-padded entries of each column (one column per locus) and keep
# the counts by locus name so the locus lengths can be checked
lens = (~np.isnan(coords)).sum(axis=0)
name_to_len = dict(zip(locus_list, lens))

# The stored coordinates are 0-indexed; shift them to 1-indexed positions
coords = coords + 1

## List of top positions per drug

In [8]:
# lineage positions for comparison
lineage_positions = pd.read_csv("../input_data/correlation_to_lineage_variants.csv")
lineage_positions["position"] = lineage_positions.associated_position
lineage_positions = lineage_positions.sort_values("position")
coll_positions = pd.read_csv("../input_data/Coll_2014_lineage_barcode.csv")

# Create a list of top 1 % of salient positions for each drug
!mkdir output

# Iterate through drugs and create a dataframe of the scores for each position
for drug in drug_list:
    combined_mean = np.load(f"{data_path}/sd_cnn_saliency/{drug}_mean.npy")
    combined_max = np.load(f"{data_path}/sd_cnn_saliency/{drug}_max.npy")
    
    ### Find the appropriate coordinate subset
    locus_indices = [locus_list.index(x) for x in drug_to_loci[drug]]
    
    gene_names = np.zeros_like(combined_mean, dtype=object)
    for i,ax in enumerate(drug_to_loci[drug]):
        gene_names[:,i] = [ax] *  combined_mean[:,i].shape[0]
    
    df = pd.DataFrame({
        "score_mean": combined_mean.flatten(),
        "score_max": combined_max.flatten(),
        "position": coords[0:combined_mean.shape[0]][:,locus_indices].flatten(),
        "locus": gene_names.flatten()
    })
    
    df["abs_score"] = np.abs(df.score_max)
    df = df.sort_values("abs_score", ascending=False)
    df["in_coll_barcode"] = [True if x in coll_positions.position else False for x in df.position]
    df["in_freschi_barcode"] = [True if x in set(lineage_positions.position) else False for x in df.position]
    
    top_n = len(df) * 0.01
    
    top_hits = df.iloc[0:int(top_n)]
    
    top_hits.to_csv(f"output/{drug}_top_0p01_hits.csv")
    
    df.to_csv(f"output/{drug}_all_hits.csv")

mkdir: output: File exists


## Analyze enrichment of lineage positions

In [15]:
# First for top fraction of salient positions
info = []
for drug in drug_list:
    saliency_file = f"output/{drug}_top_0p01_hits.csv"
    d = pd.read_csv(saliency_file, index_col=0)

    # The file holds the top 1 % of positions, so its first tenth is the top 0.1 %
    top_n = int(len(d) * 0.1)
    top_d = d.iloc[0:top_n]

    info.append([drug, len(d), np.sum(d.in_coll_barcode), np.sum(d.in_freschi_barcode),
                np.sum(top_d.in_coll_barcode), np.sum(top_d.in_freschi_barcode)])

df = pd.DataFrame(info, columns=["drug", "N_hits", "N_hits_coll", "N_hits_freschi", "N_hits_coll_0p001", "N_hits_freschi_0p001"])

df_top_0p01 = df


# Now for all positions
info = []
for drug in drug_list:
    saliency_file = f"output/{drug}_all_hits.csv"
    d = pd.read_csv(saliency_file, index_col=0)

    # Test membership against the column *values*: `x in Series` checks the
    # Series index (row labels) instead. dropna() keeps NaN positions unmatched.
    d["in_coll_barcode"] = d.position.isin(coll_positions.position.dropna())
    d["in_freschi_barcode"] = d.position.isin(lineage_positions.position.dropna())
    d.to_csv(f"output/{drug}_all_saliency_barcode.csv")

    info.append([drug, len(d), np.sum(d.in_coll_barcode), np.sum(d.in_freschi_barcode)])

df = pd.DataFrame(info, columns=["drug", "N_sites", "N_sites_hits_coll", "N_sites_hits_freschi"])

# One row per drug: counts over all sites next to counts over the top hits
lineage_comparison_df = df.merge(df_top_0p01, on="drug")


## Compare to known resistance-conferring SNPs

In [16]:
# Copy the WHO resistance-variant table into ../output_data, where the next cell reads it
# (the file name suggests it holds Category 1 or 2 variants - TODO confirm)
! cp ../../../databases/WHO_resistance_variants_Cat1orCat2.csv ../output_data

In [17]:
# WHO resistance-variant table; later cells group it by its `drug` column and
# read the genome coordinates from its `genome_index` column
final_confidence_positions = pd.read_csv("../output_data/WHO_resistance_variants_Cat1orCat2.csv")

In [22]:
# Columns, in output order, of the per-drug saliency/resistance tables
column_order = [
    # position and annotation
    "position", "locus_x", "gene",
    # saliency scores
    "score_max", "score_mean", "abs_score",
    # lineage-barcode flags
    "in_coll_barcode", "in_freschi_barcode",
    # WHO flags: any drug first, then one per drug
    "in_WHO",
    "in_INH_WHO", "in_RIF_WHO", "in_EMB_WHO", "in_STM_WHO", "in_AMI_WHO",
    "in_ETH_WHO", "in_PZA_WHO", "in_LEV_WHO", "in_MXF_WHO", "in_KAN_WHO",
]

In [23]:
# For the top 1% of saliencies of each drug, save a file recording whether each position
# is implicated in resistance according to the WHO variant table (final_confidence_positions)
genome_position_to_gene = pd.read_csv("../../../databases/_OLD_genome_position_to_gene_2.csv", index_col=0)

# The WHO sites per drug do not depend on the saliency file, so collect them once
who_sites_by_drug = {
    key: set(val.genome_index)
    for key, val in final_confidence_positions.groupby("drug")
}
column_names = [f"in_{key}_WHO" for key in who_sites_by_drug]

for drug in drug_list:
    # read in the saliency file, dropping rows that have no genome position
    saliency_file = f"output/{drug}_top_0p01_hits.csv"
    d = pd.read_csv(saliency_file, index_col=0)
    d = d.dropna(subset=["position"])
    d["coordinate"] = d.position.astype(int)

    # for each position, label whether it is a WHO site for each drug in the WHO set
    for key, sites in who_sites_by_drug.items():
        d[f"in_{key}_WHO"] = d.coordinate.isin(sites)

    # implicated in resistance to at least one drug
    d["in_WHO"] = np.any(d[column_names].values, axis=1)

    # annotate each position using the genome-position-to-gene table (joined on its `i` column)
    interest_df = d.merge(genome_position_to_gene, left_on="position", right_on="i", how="left")

    # reorder columns and save
    d = interest_df[column_order]
    d = d.sort_values("score_max", ascending=False)
    d.to_csv(f"output/{drug}_0p01_saliency_resistance.csv")
    

  mask |= (ar1 == a)


In [24]:
## Count the fraction of top hits per drug that are already known (flagged in_WHO)
def _fraction_known(hits):
    """Return the fraction of rows of `hits` flagged in_WHO, or NaN when `hits` is empty."""
    if len(hits) == 0:
        # An empty slice (fewer than 10 rows in the file) has no defined fraction
        return np.nan
    return len(hits.query("in_WHO")) / len(hits)


d = []
for drug in drug_list:

    df = pd.read_csv(f"output/{drug}_0p01_saliency_resistance.csv", index_col=0)
    # NOTE(review): ascending sort, so the slice below holds the rows with the
    # lowest score_mean - confirm this is the intended ranking
    df = df.sort_values("score_mean")

    # The file holds the top-1 % hits, so its first tenth is the top 0.1 %
    top_hits = df.iloc[0:int(len(df) * 0.1)]
    percent_known_top_0p001 = _fraction_known(top_hits)

    # All rows of the file: the top 1 %
    top_n = len(df)
    percent_known_top_0p01 = _fraction_known(df)

    print(drug, int(top_n), percent_known_top_0p01, percent_known_top_0p001)
    d.append([drug, percent_known_top_0p01, percent_known_top_0p001])

pd.DataFrame(d, columns=["drug", "percent_known_top_0p01", "percent_known_top_0p001"]).to_csv("percent_known.csv")

ETHIONAMIDE 43 0.23255813953488372 0.75
STREPTOMYCIN 166 0.26506024096385544 0.375
CAPREOMYCIN 89 0.0449438202247191 0.5
RIFAMPICIN 76 0.5526315789473685 0.8571428571428571
PYRAZINAMIDE 159 0.6226415094339622 0.8
AMIKACIN 117 0.06837606837606838 0.6363636363636364
KANAMYCIN 114 0.07894736842105263 0.7272727272727273
ETHAMBUTOL 205 0.05365853658536585 0.5
OFLOXACIN 47 0.0851063829787234 1.0
MOXIFLOXACIN 48 0.125 1.0
LEVOFLOXACIN 48 0.0625 0.75
ISONIAZID 139 0.09352517985611511 0.46153846153846156
