# Mono resistance prediction

In this POC, we gloss over

- the resistance towards distinct drugs
- treat the mdr and xdr as simply being resistant


In [1]:
# Import the usual suspects.
import allel
import pandas as pd


In [3]:
# Code to create a custom snp_df
"""

# see http://alimanfoo.github.io/2017/06/14/read-vcf.html for allel tutorial
vcf = allel.read_vcf("../data/interim/haplotypeCaller/cohort.bqsr.filter.snps.vcf")

def vcf_get_index(vcf):
    """Return index as "CHR.POS"."""
    return [
        ".".join([chrom, str(pos)]) for chrom, pos in zip(
            vcf["variants/CHROM"],
            vcf["variants/POS"]
        )
    ]

def vcf_get_columns(vcf):
    """Return columns as "SAMPLE_ID"."""
    return vcf["samples"]

def vcf_to_snp_count(vcf):
    """Yield snp counts per sample, iterating over the snps in vcf."""
    for gt in vcf["calldata/GT"]:
        # GT value per allele is 0 if ref, >0 if alt.
        # see https://samtools.github.io/hts-specs/VCFv4.2.pdf section 1.4.2.
        n_snps = [sum(allele > 0) for allele in gt]
        yield n_snps

# TODO implement a function to only tell the nature of GT -> homo (0) or hetero (1)

snps_df = pd.DataFrame(
    vcf_to_snp_count(vcf),
    index=vcf_get_index(vcf),
    columns=vcf_get_columns(vcf)
)

"""

In [11]:
snps_df = pd.read_json("../data/processed/snps_df.json")
snps_df.head()



Unnamed: 0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
"""
# Read the final results file from tb-profiler results

tbprofiler_df = pd.read_json("../data/raw/tbProfiler/tbprofiler.json", encoding="UTF-8")
tbprofiler_df = tbprofiler_df.transpose()
tbprofiler_df.head()
"""

Unnamed: 0,rifampicin,isoniazid,pyrazinamide,ethambutol,streptomycin,fluoroquinolones,moxifloxacin,ofloxacin,levofloxacin,ciprofloxacin,...,cycloserine,linezolid,bedaquiline,clofazimine,delamanid,main_lin,sublin,drtype,MDR,XDR
ERR760783,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,
ERR776661,rpoB_p.Ser450Leu,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.3.2,MDR,R,
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,lineage2,lineage2.2.1,Sensitive,,
ERR760911,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,
SRR9224969,rpoB_p.Ser450Leu,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,


In [4]:
"""
resistance_status_df = tbprofiler_df[['drtype']]
resistance_status_df['isResistant'] = resistance_status_df.apply(lambda row: 0 if (row.drtype == 'Sensitive') else 1, axis = 1)
resistance_status_df = resistance_status_df.drop(['drtype'], axis=1)
resistance_status_df.head()
"""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,isResistant
ERR760783,1
ERR776661,1
SRR11098556,0
ERR760911,1
SRR9224969,1


In [8]:
resistance_status_df = pd.read_json("../data/processed/resistance_status_df.json")
resistance_status_df.head()

Unnamed: 0,isResistant
ERR3087504,0
ERR3129939,0
ERR3129940,0
ERR3148147,1
ERR3148148,0


In [17]:
# mono_resistance_df = snps_df.join(resistance_status_df)
# mono_resistance_df.to_json("../data/processed/mono_resistance_df.json")

mono_resistance_df = pd.read_json("../data/processed/mono_resistance_df.json")

In [18]:
mono_resistance_df.head()

Unnamed: 0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518,isResistant
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [20]:
# mono_resistance_df.shape

(301, 134257)

In [26]:
mono_resistance_df['isResistant'].value_count()

AttributeError: 'Series' object has no attribute 'value_count'