# Mono resistance prediction

In this POC, we make two simplifications:

- we ignore which specific drugs a sample is resistant to
- we treat MDR and XDR samples as simply being resistant


In [2]:
# Import the usual suspects.
import allel
import pandas as pd


In [None]:

# Load the cohort call set with scikit-allel. The helpers below read the
# "variants/CHROM", "variants/POS", "samples" and "calldata/GT" entries of the result.
# NOTE(review): going by the file name this is the BQSR-recalibrated, filtered,
# SNP-only HaplotypeCaller output — confirm.
# see http://alimanfoo.github.io/2017/06/14/read-vcf.html for allel tutorial
vcf = allel.read_vcf("../data/interim/haplotypeCaller/cohort.bqsr.filter.snps.vcf")

def vcf_get_index(vcf):
    """Build one "CHR.POS" label per variant in ``vcf``.

    Pairs each entry of ``vcf["variants/CHROM"]`` with the matching entry of
    ``vcf["variants/POS"]`` and returns the labels as a list, in input order.
    """
    chroms = vcf["variants/CHROM"]
    positions = vcf["variants/POS"]

    labels = []
    for chrom, pos in zip(chroms, positions):
        # The position is stringified so it can be joined with the chromosome name.
        labels.append(".".join((chrom, str(pos))))
    return labels

def vcf_get_columns(vcf):
    """Look up the "SAMPLE_ID" entries of ``vcf`` for use as column labels.

    Returns the ``vcf["samples"]`` object itself, not a copy.
    """
    sample_ids = vcf["samples"]
    return sample_ids

def vcf_to_snp_count(vcf):
    """Generate, variant by variant, the number of alt alleles each sample carries.

    Walks ``vcf["calldata/GT"]`` one variant at a time and yields a list with
    one entry per sample call: how many of that call's allele indices are
    greater than zero.
    """
    genotypes = vcf["calldata/GT"]
    for variant_calls in genotypes:
        # A GT allele index is 0 for the reference and >0 for an alternate allele,
        # see https://samtools.github.io/hts-specs/VCFv4.2.pdf section 1.4.2.
        per_sample = []
        for sample_call in variant_calls:
            per_sample.append(sum(sample_call > 0))
        yield per_sample

# TODO: implement a function that reports only the zygosity of each GT -> homozygous (0) or heterozygous (1)

# Build the SNP count table from the loaded VCF: one row per variant
# (labelled "CHR.POS"), one column per sample, each cell holding the count
# produced by vcf_to_snp_count for that sample at that variant.
snps_df = pd.DataFrame(
    vcf_to_snp_count(vcf),
    index=vcf_get_index(vcf),
    columns=vcf_get_columns(vcf)
)

In [None]:
# Load the SNP table from the processed JSON file. This rebinds snps_df, so later
# cells use the file's contents rather than any table built in memory from the VCF.
# NOTE(review): presumably a saved export of that table — confirm its orientation
# (rows vs. columns) matches what the later join expects.
snps_df = pd.read_json("../data/processed/snps_df.json")
snps_df.head()



In [None]:
# Load the final tb-profiler results file and flip rows and columns in one step.
# NOTE(review): presumably the transpose puts one sample on each row — confirm
# against the JSON layout.
tbprofiler_df = pd.read_json(
    "../data/raw/tbProfiler/tbprofiler.json",
    encoding="UTF-8",
).T
tbprofiler_df.head()


In [8]:
# Collapse tb-profiler's drug-resistance type into a binary label:
# 0 when drtype is 'Sensitive', 1 for every other value (e.g. 'MDR').
# The single-column frame is built directly from a vectorised comparison. This
# avoids assigning a new column onto a selection of tbprofiler_df (which can
# raise SettingWithCopyWarning) and avoids a row-by-row apply.
resistance_status_df = (
    tbprofiler_df['drtype']
    .ne('Sensitive')          # True for anything that is not 'Sensitive'
    .astype(int)              # True/False -> 1/0
    .to_frame('isResistant')  # keep the original index, name the column
)
resistance_status_df.head()


Unnamed: 0,rifampicin,isoniazid,pyrazinamide,ethambutol,streptomycin,fluoroquinolones,moxifloxacin,ofloxacin,levofloxacin,ciprofloxacin,...,linezolid,bedaquiline,clofazimine,delamanid,main_lin,sublin,drtype,MDR,XDR,isResistant
ERR760783,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,1
ERR776661,rpoB_p.Ser450Leu,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.3.2,MDR,R,,1
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,lineage2,lineage2.2.1,Sensitive,,,0
ERR760911,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,1
SRR9224969,rpoB_p.Ser450Leu,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,1


In [12]:
# Attach the SNP columns to the resistance labels. DataFrame.join aligns on the
# index; assumes both frames are indexed by sample ID — TODO confirm.
final_df = resistance_status_df.join(snps_df)

# Rotate the column order by one so the first column ends up last.
cols = list(final_df.columns)
cols.append(cols.pop(0))
final_df = final_df[cols]
final_df.head()

Unnamed: 0,isoniazid,pyrazinamide,ethambutol,streptomycin,fluoroquinolones,moxifloxacin,ofloxacin,levofloxacin,ciprofloxacin,aminoglycosides,...,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518,rifampicin
ERR760783,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,rrs_r.1401a>g,...,,,,,,,,,,rpoB_p.Ser450Leu
ERR776661,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,-,...,,,,,,,,,,rpoB_p.Ser450Leu
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,,,,,,,,,,-
ERR760911,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,rrs_r.1401a>g,...,,,,,,,,,,rpoB_p.Ser450Leu
SRR9224969,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,-,...,,,,,,,,,,rpoB_p.Ser450Leu


Unnamed: 0.1,Unnamed: 0,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,...,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518,isResistant,NC000962_3.11,NC000962_3.78,NC000962_3.80
0,SRR10851707,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
1,SRR3732596,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ERR751356,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,ERR751456,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,ERR751482,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,ERR751359,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,SRR11922528,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,ERR775339,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,ERR775387,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
