In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Locus names, in matrix-column order: other cells pair entry i of this list
# with column i of the coordinate matrix and of the saliency matrices.
name_list = [
    "acpM-kasA",
    "gid",
    "rpsA",
    "clpC",
    "embCAB",
    "aftB-ubiA",
    "rrs-rrl",
    "ethAR",
    "oxyR-ahpC",
    "tlyA",
    "KatG",
    "rpsL",
    "rpoBC",
    "FabG1-inhA",
    "eis",
    "gyrBA",
    "panD",
    "pncA",
]

# Drug names; each one is used to build the names of the per-drug saliency
# files that are read and of the CSV files that are written.
drug_list = [
    "ETHIONAMIDE",
    "STREPTOMYCIN",
    "CAPREOMYCIN",
    "RIFAMPICIN",
    "PYRAZINAMIDE",
    "AMIKACIN",
    "KANAMYCIN",
    "ETHAMBUTOL",
    "OFLOXACIN",
    "CIPROFLOXACIN",
    "MOXIFLOXACIN",
    "LEVOFLOXACIN",
    "ISONIAZID",
]

# Maps each drug name (same spelling as in drug_list) to the list of locus
# names (same spelling as in name_list) associated with that drug.
drug_to_loci = {
    "ETHIONAMIDE": ["ethAR", "FabG1-inhA"],
    "STREPTOMYCIN": ["rpsL", "rrs-rrl", "gid"],
    "CAPREOMYCIN": ["tlyA", "rrs-rrl"],
    "RIFAMPICIN": ["rpoBC"],
    "PYRAZINAMIDE": ["rpsA", "clpC", "pncA", "panD"],
    "AMIKACIN": ["rrs-rrl", "eis"],
    "KANAMYCIN": ["rrs-rrl", "eis"],
    "ETHAMBUTOL": ["embCAB", "aftB-ubiA"],
    "OFLOXACIN": ["gyrBA"],
    "CIPROFLOXACIN": ["gyrBA"],
    "MOXIFLOXACIN": ["gyrBA"],
    "LEVOFLOXACIN": ["gyrBA"],
    "ISONIAZID": ["acpM-kasA", "KatG", "FabG1-inhA", "oxyR-ahpC"],
}

In [4]:
## Load in the H37Rv coordinates, sanity check by displaying length and starting coordinate
# Cast to float so missing entries can be stored as NaN even if the file holds integers.
coords = np.load("../output_data/X_matrix_H37RV_coords.npy").astype(float)

# zip() below would silently drop loci on a column-count mismatch, so fail loudly instead.
assert coords.shape[1] == len(name_list), "expected one coordinate column per locus in name_list"

# A stored 0 is treated as "no coordinate" (presumably padding below shorter loci -- TODO confirm).
coords[coords == 0] = np.nan

# Length of each locus = number of rows in its column that hold a coordinate.
lens = coords.shape[0] - np.isnan(coords).sum(axis=0)
name_to_len = {}
for idx, (name, locus_len) in enumerate(zip(name_list, lens)):
    print(idx, name, "length of locus", locus_len)
    name_to_len[name] = locus_len

# Shift every coordinate by one (presumably 0-based -> 1-based -- TODO confirm); NaN stays NaN.
coords = coords + 1

# Display the first row: the first stored coordinate of every locus.
coords[0, :]

0 acpM-kasA length of locus 1670
1 gid length of locus 806
2 rpsA length of locus 1609
3 clpC length of locus 4206
4 embCAB length of locus 10147
5 aftB-ubiA length of locus 2880
6 rrs-rrl length of locus 5437
7 ethAR length of locus 2195
8 oxyR-ahpC length of locus 1303
9 tlyA length of locus 991
10 KatG length of locus 3471
11 rpsL length of locus 623
12 rpoBC length of locus 7711
13 FabG1-inhA length of locus 2554
14 eis length of locus 2531
15 gyrBA length of locus 4821
16 panD length of locus 2169
17 pncA length of locus 1716


array([2517696., 4407529., 1833379., 4036732., 4239664., 4266954.,
       1471577., 4326005., 2725478., 1917756., 2153236.,  781312.,
        759610., 1672458., 2713784.,    4998., 4043042., 2287884.])

## List of top positions per drug

In [6]:
# For each drug, write a CSV with the top 1 % of matrix entries ranked by the
# absolute value of the per-position maximum saliency score.
TOP_FRACTION = 0.01  # fraction of rows kept; the "0p01" in the output file names refers to this value
SALIENCY_DIR = Path("../output_data/md_cnn_saliency")
OUTPUT_DIR = Path("output")

# Pure-Python replacement for `!mkdir output`: portable, and a no-op (instead of
# an error message) when the directory already exists on a re-run.
OUTPUT_DIR.mkdir(exist_ok=True)

# One saliency matrix is loaded up front only to learn the matrix shape.
combined = np.load(SALIENCY_DIR / "RIFAMPICIN_mean.npy")
assert combined.shape[1] == len(name_list), "saliency matrix must have one column per locus in name_list"

# Matrix of locus names with the same shape as the score matrices, so that
# flattening it lines up element-for-element with the flattened scores.
gene_names = np.empty(combined.shape, dtype=object)
gene_names[...] = np.array(name_list, dtype=object)  # broadcast the names across every row

# Iterate through drugs and create a dataframe of the scores for each position
for drug in drug_list:
    combined_mean = np.load(SALIENCY_DIR / f"{drug}_mean.npy")
    combined_max = np.load(SALIENCY_DIR / f"{drug}_max.npy")
    df = pd.DataFrame({
        "score_mean": combined_mean.flatten(),
        "score_max": combined_max.flatten(),
        "position": coords.flatten(),
        "locus": gene_names.flatten(),
    })

    # Rank by magnitude so large negative scores rank alongside large positive ones.
    df["abs_score"] = df["score_max"].abs()
    df = df.sort_values("abs_score", ascending=False)

    # NOTE(review): the cutoff is a fraction of ALL matrix entries, including any
    # whose `position` is NaN in `coords`. If only real genomic positions should
    # count towards the 1 %, drop NaN positions before ranking -- confirm intent.
    top_n = int(len(df) * TOP_FRACTION)
    top_hits = df.iloc[:top_n]

    top_hits.to_csv(OUTPUT_DIR / f"MD-CNN_{drug}_top_0p01_hits.csv")

mkdir: output: File exists
