In [1]:
# Import the usual suspects.
import pandas as pd
import numpy as np
import dask
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns

# Switch matplotlib over to seaborn's default theme.
sns.set()



# Feature engineering for Resistance Profile

In [2]:
# Read the TB-Profiler JSON export and transpose it in one expression
# (the preview below shows one row per sample accession after the transpose).
tbprofiler_df = pd.read_json("../data/raw/tbprofiler.json", encoding="UTF-8").transpose()
tbprofiler_df.head(n=5)


Unnamed: 0,rifampicin,isoniazid,pyrazinamide,ethambutol,streptomycin,fluoroquinolones,moxifloxacin,ofloxacin,levofloxacin,ciprofloxacin,...,cycloserine,linezolid,bedaquiline,clofazimine,delamanid,main_lin,sublin,drtype,MDR,XDR
ERR760783,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,
ERR776661,rpoB_p.Ser450Leu,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.3.2,MDR,R,
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,lineage2,lineage2.2.1,Sensitive,,
ERR760911,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,
SRR9224969,rpoB_p.Ser450Leu,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,...,-,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,


In [3]:
# Work on a copy: a plain assignment would only alias tbprofiler_df, so adding the
# label column (and every later in-place edit) would silently modify the raw table too.
resistance_status_df = tbprofiler_df.copy()

# Binary label derived from `drtype`: 'Sensitive' stays 'Sensitive', every other value
# (including a missing drtype) becomes 'Resistant'. The comparison is vectorised
# instead of calling a Python lambda once per row.
resistance_status_df['Resistance_Status'] = np.where(
    resistance_status_df['drtype'] == 'Sensitive', 'Sensitive', 'Resistant'
)
resistance_status_df.head()

Unnamed: 0,rifampicin,isoniazid,pyrazinamide,ethambutol,streptomycin,fluoroquinolones,moxifloxacin,ofloxacin,levofloxacin,ciprofloxacin,...,linezolid,bedaquiline,clofazimine,delamanid,main_lin,sublin,drtype,MDR,XDR,Resistance_Status
ERR760783,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
ERR776661,rpoB_p.Ser450Leu,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.3.2,MDR,R,,Resistant
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,lineage2,lineage2.2.1,Sensitive,,,Sensitive
ERR760911,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
SRR9224969,rpoB_p.Ser450Leu,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant


In [4]:
# Write the labelled table to the processed-data folder.
resistance_status_df.to_csv(path_or_buf="../data/processed/resistance_status_df.csv")
# resistance_status_df = pd.read_csv("../data/processed/resistance_status_df.csv")
resistance_status_df.head(n=5)


Unnamed: 0,rifampicin,isoniazid,pyrazinamide,ethambutol,streptomycin,fluoroquinolones,moxifloxacin,ofloxacin,levofloxacin,ciprofloxacin,...,linezolid,bedaquiline,clofazimine,delamanid,main_lin,sublin,drtype,MDR,XDR,Resistance_Status
ERR760783,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
ERR776661,rpoB_p.Ser450Leu,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.3.2,MDR,R,,Resistant
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,lineage2,lineage2.2.1,Sensitive,,,Sensitive
ERR760911,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
SRR9224969,rpoB_p.Ser450Leu,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant


In [5]:
# Names of the per-drug columns of the TB-Profiler table.
drugs_column_names = [
    'rifampicin', 'isoniazid', 'pyrazinamide', 'ethambutol', 'streptomycin',
    'fluoroquinolones', 'moxifloxacin', 'ofloxacin', 'levofloxacin', 'ciprofloxacin',
    'aminoglycosides', 'amikacin', 'kanamycin', 'capreomycin', 'ethionamide',
    'para-aminosalicylic_acid', 'cycloserine', 'linezolid', 'bedaquiline',
    'clofazimine', 'delamanid',
]

# Lineage columns.
lineage_column_names = ['main_lin', 'sublin']

# Resistance-summary columns, including the derived 'Resistance_Status' label.
resistance_status_column_names = ['drtype', 'MDR', 'XDR', 'Resistance_Status']


In [6]:
# Mapping from each drug column name to its '<drug>_resistance' replacement.
# The suffix is appended programmatically so it is written in exactly one place.
renamed_drug_columns_names_dict = {
    drug: drug + '_resistance'
    for drug in (
        'rifampicin', 'isoniazid', 'pyrazinamide', 'ethambutol', 'streptomycin',
        'fluoroquinolones', 'moxifloxacin', 'ofloxacin', 'levofloxacin', 'ciprofloxacin',
        'aminoglycosides', 'amikacin', 'kanamycin', 'capreomycin', 'ethionamide',
        'para-aminosalicylic_acid', 'cycloserine', 'linezolid', 'bedaquiline',
        'clofazimine', 'delamanid',
    )
}

# The renamed column names on their own, in the same order as the mapping above.
renamed_drug_columns_names = list(renamed_drug_columns_names_dict.values())

In [7]:
# Rename every drug column to '<drug>_resistance' on the existing frame
# (still in place, so any other name bound to the same object sees the new labels).
resistance_status_df.rename(
    mapper=renamed_drug_columns_names_dict,
    axis='columns',
    inplace=True,
)

resistance_status_df.head(n=5)


Unnamed: 0,rifampicin_resistance,isoniazid_resistance,pyrazinamide_resistance,ethambutol_resistance,streptomycin_resistance,fluoroquinolones_resistance,moxifloxacin_resistance,ofloxacin_resistance,levofloxacin_resistance,ciprofloxacin_resistance,...,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance,main_lin,sublin,drtype,MDR,XDR,Resistance_Status
ERR760783,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
ERR776661,rpoB_p.Ser450Leu,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.3.2,MDR,R,,Resistant
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,lineage2,lineage2.2.1,Sensitive,,,Sensitive
ERR760911,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
SRR9224969,rpoB_p.Ser450Leu,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant


In [257]:
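# NOTE: this binarisation step is currently disabled. The table displayed under this cell
# (0/1 values and an `isResistant` column) appears to be left over from an earlier run and
# does not reflect the frame produced by the active cells.
# If the step is re-enabled, compare with `==`; `resistance is '-'` tests object identity,
# not string equality.
# for col_name in renamed_drug_columns_names:
#     resistance_status_df[col_name] = resistance_status_df[col_name].apply(lambda resistance: 0 if resistance == '-' else 1)
#
# resistance_status_df.head()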



Unnamed: 0,rifampicin_resistance,isoniazid_resistance,pyrazinamide_resistance,ethambutol_resistance,streptomycin_resistance,fluoroquinolones_resistance,moxifloxacin_resistance,ofloxacin_resistance,levofloxacin_resistance,ciprofloxacin_resistance,...,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance,main_lin,sublin,drtype,MDR,XDR,isResistant
ERR760783,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,lineage4,lineage4.1.2.1,MDR,R,,1
ERR776661,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,lineage4,lineage4.3.2,MDR,R,,1
SRR11098556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,lineage2,lineage2.2.1,Sensitive,,,0
ERR760911,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,lineage4,lineage4.1.2.1,MDR,R,,1
SRR9224969,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,lineage4,lineage4.1.2.1,MDR,R,,1


In [8]:
# Save the renamed table in both JSON and CSV form.
resistance_status_df.to_json(path_or_buf="../data/processed/resistance_status_df.json")
resistance_status_df.to_csv(path_or_buf="../data/processed/resistance_status_df.csv")

In [9]:
# Reload the saved CSV. The index was written without a label, so it comes back as the
# column 'Unnamed: 0'; rename it to 'SampleID' and use it as the index again.
resistance_status_df = (
    pd.read_csv("../data/processed/resistance_status_df.csv")
    .rename(columns={'Unnamed: 0': 'SampleID'})
    .set_index('SampleID')
)

resistance_status_df.head(n=5)

Unnamed: 0_level_0,rifampicin_resistance,isoniazid_resistance,pyrazinamide_resistance,ethambutol_resistance,streptomycin_resistance,fluoroquinolones_resistance,moxifloxacin_resistance,ofloxacin_resistance,levofloxacin_resistance,ciprofloxacin_resistance,...,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance,main_lin,sublin,drtype,MDR,XDR,Resistance_Status
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR760783,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
ERR776661,rpoB_p.Ser450Leu,ahpC_c.-81C>T,pncA_p.Met175Val,"embB_p.Gly406Ser, embB_p.Met306Val, embA_c.-16C>G","rpsL_p.Lys43Arg, rpsL_p.Lys88Gln",-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.3.2,MDR,R,,Resistant
SRR11098556,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,lineage2,lineage2.2.1,Sensitive,,,Sensitive
ERR760911,rpoB_p.Ser450Leu,katG_p.Ser315Thr,pncA_p.Gln10Pro,embB_p.Gly406Ala,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant
SRR9224969,rpoB_p.Ser450Leu,"fabG1_c.-15C>T, katG_p.Ser315Thr",pncA_p.Gly132Ser,embB_p.Met306Val,-,-,-,-,-,-,...,-,-,-,-,lineage4,lineage4.1.2.1,MDR,R,,Resistant


# Feature engineering for SNP

## Include INDELS if there's a need

In [114]:
# Load the tab-separated SNP genotype table (CHROM, POS and one '<sample>.GT' column per sample).
vcf_df = pd.read_csv("../data/interim/cohort.bqsr.filter.snps.tsv", delimiter='\t')
vcf_df.head(n=5)

Unnamed: 0,CHROM,POS,ERR3129939.GT,ERR3148148.GT,ERR3148149.GT,ERR3148151.GT,ERR3148153.GT,ERR3148155.GT,ERR3148159.GT,ERR3148163.GT,...,SRR9224941.GT,SRR9224942.GT,SRR9224952.GT,SRR9224958.GT,SRR9224968.GT,SRR9224981.GT,SRR9224985.GT,SRR9224986.GT,SRR9224992.GT,SRR9224997.GT
0,NC000962_3,11,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,...,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A
1,NC000962_3,78,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,...,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T
2,NC000962_3,80,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,...,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A
3,NC000962_3,82,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,...,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G
4,NC000962_3,88,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,...,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C


In [115]:
# Build a single "<CHROM>.<POS>" key per variant site.
# This uses vectorised string concatenation on the two columns; a row-wise
# `apply(axis=1)` would construct a full Series for every row of this wide table
# just to read two of its fields.
vcf_df['CHROM.POS'] = vcf_df['CHROM'] + "." + vcf_df['POS'].astype(str)
vcf_df.head()

Unnamed: 0,CHROM,POS,ERR3129939.GT,ERR3148148.GT,ERR3148149.GT,ERR3148151.GT,ERR3148153.GT,ERR3148155.GT,ERR3148159.GT,ERR3148163.GT,...,SRR9224942.GT,SRR9224952.GT,SRR9224958.GT,SRR9224968.GT,SRR9224981.GT,SRR9224985.GT,SRR9224986.GT,SRR9224992.GT,SRR9224997.GT,CHROM.POS
0,NC000962_3,11,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,...,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,NC000962_3.11
1,NC000962_3,78,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,...,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,NC000962_3.78
2,NC000962_3,80,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,...,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,NC000962_3.80
3,NC000962_3,82,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,...,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,NC000962_3.82
4,NC000962_3,88,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,...,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,NC000962_3.88


In [116]:
# CHROM and POS are now encoded in 'CHROM.POS', so the separate columns are removed.
vcf_df = vcf_df.drop(columns=['CHROM', 'POS'])
vcf_df.head(n=5)


Unnamed: 0,ERR3129939.GT,ERR3148148.GT,ERR3148149.GT,ERR3148151.GT,ERR3148153.GT,ERR3148155.GT,ERR3148159.GT,ERR3148163.GT,ERR3148164.GT,ERR3148166.GT,...,SRR9224942.GT,SRR9224952.GT,SRR9224958.GT,SRR9224968.GT,SRR9224981.GT,SRR9224985.GT,SRR9224986.GT,SRR9224992.GT,SRR9224997.GT,CHROM.POS
0,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,...,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,NC000962_3.11
1,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,...,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/T,NC000962_3.78
2,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,...,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,A/A,NC000962_3.80
3,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,...,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,G/G,NC000962_3.82
4,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,...,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/C,NC000962_3.88


In [117]:
# Index the table by site key, keep only the part of each column label before the
# first '.' (e.g. 'ERR3129939.GT' -> 'ERR3129939'), then transpose and save the result.
vcf_df = vcf_df.set_index('CHROM.POS')
vcf_df.columns = [sample_column.split(".")[0] for sample_column in vcf_df.columns]
vcf_df = vcf_df.T
vcf_df.to_csv(path_or_buf="../data/processed/vcf_df.csv")
vcf_df.head(n=5)


CHROM.POS,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
ERR3129939,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148148,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148149,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148151,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148153,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C


In [12]:

# Reload the transposed genotype table from disk.
vcf_snps_df = pd.read_csv(filepath_or_buffer="../data/processed/vcf_df.csv")

vcf_snps_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
0,ERR3129939,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
1,ERR3148148,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
2,ERR3148149,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
3,ERR3148151,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
4,ERR3148153,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C


In [13]:
# The sample identifiers were read back as the column 'Unnamed: 0';
# label that column 'SampleID' and make it the index.
vcf_snps_df = (
    vcf_snps_df
    .rename(columns={'Unnamed: 0': 'SampleID'})
    .set_index('SampleID')
)
vcf_snps_df.head(n=5)

Unnamed: 0_level_0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148148,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148149,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148151,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148153,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C


In [14]:
# For every column that holds more than one distinct value, record how often
# each value occurs. Columns with a single distinct value are skipped.
col_mutation_dict = {}

for col, genotype_calls in vcf_snps_df.items():
    if len(genotype_calls.unique()) <= 1:
        continue
    col_mutation_dict[col] = dict(genotype_calls.value_counts())


In [15]:
# Display the whole per-column count mapping (the saved output below is cut off mid-entry).
col_mutation_dict

{'NC000962_3.11': {'A/A': 299, 'C/C': 2},
 'NC000962_3.78': {'T/T': 300, 'T|C': 1},
 'NC000962_3.80': {'A/A': 300, 'A|C': 1},
 'NC000962_3.82': {'G/G': 297, 'C/C': 4},
 'NC000962_3.88': {'C/C': 300, 'T/T': 1},
 'NC000962_3.102': {'T/T': 300, 'T|G': 1},
 'NC000962_3.104': {'A/A': 300, 'A|C': 1},
 'NC000962_3.108': {'C/C': 296, 'T/T': 5},
 'NC000962_3.117': {'G/G': 300, 'G|C': 1},
 'NC000962_3.120': {'G/G': 300, 'G|C': 1},
 'NC000962_3.135': {'G/G': 300, 'G|A': 1},
 'NC000962_3.138': {'T/T': 300, 'T|C': 1},
 'NC000962_3.150': {'C/C': 300, 'C|G': 1},
 'NC000962_3.155': {'A/A': 300, 'A|G': 1},
 'NC000962_3.156': {'G/G': 300, 'G|C': 1},
 'NC000962_3.159': {'A/A': 300, 'A|G': 1},
 'NC000962_3.160': {'T/T': 300, 'T|C': 1},
 'NC000962_3.187': {'T/T': 299, 'T|C': 2},
 'NC000962_3.189': {'A/A': 299, 'A|G': 2},
 'NC000962_3.192': {'C/C': 299, 'C|G': 2},
 'NC000962_3.204': {'C/C': 299, 'C|T': 1, 'C|C': 1},
 'NC000962_3.207': {'T/T': 299, 'T|C': 1, 'T|T': 1},
 'NC000962_3.210': {'C/C': 298, 'C|G': 

In [16]:
# Column labels that were kept in col_mutation_dict.
list(col_mutation_dict)

['NC000962_3.11',
 'NC000962_3.78',
 'NC000962_3.80',
 'NC000962_3.82',
 'NC000962_3.88',
 'NC000962_3.102',
 'NC000962_3.104',
 'NC000962_3.108',
 'NC000962_3.117',
 'NC000962_3.120',
 'NC000962_3.135',
 'NC000962_3.138',
 'NC000962_3.150',
 'NC000962_3.155',
 'NC000962_3.156',
 'NC000962_3.159',
 'NC000962_3.160',
 'NC000962_3.187',
 'NC000962_3.189',
 'NC000962_3.192',
 'NC000962_3.204',
 'NC000962_3.207',
 'NC000962_3.210',
 'NC000962_3.213',
 'NC000962_3.219',
 'NC000962_3.225',
 'NC000962_3.228',
 'NC000962_3.231',
 'NC000962_3.237',
 'NC000962_3.246',
 'NC000962_3.255',
 'NC000962_3.267',
 'NC000962_3.270',
 'NC000962_3.273',
 'NC000962_3.333',
 'NC000962_3.371',
 'NC000962_3.467',
 'NC000962_3.571',
 'NC000962_3.576',
 'NC000962_3.582',
 'NC000962_3.591',
 'NC000962_3.600',
 'NC000962_3.609',
 'NC000962_3.610',
 'NC000962_3.636',
 'NC000962_3.645',
 'NC000962_3.654',
 'NC000962_3.660',
 'NC000962_3.663',
 'NC000962_3.669',
 'NC000962_3.672',
 'NC000962_3.675',
 'NC000962_3.678'

In [38]:
import orjson

# Serialise the per-column counts to JSON bytes. OPT_SERIALIZE_NUMPY is passed so that
# orjson accepts numpy values stored in the dict.
col_mutation_dict_json = orjson.dumps(col_mutation_dict, option=orjson.OPT_SERIALIZE_NUMPY)

# orjson returns bytes, hence the binary write mode.
with open("../data/processed/cols_with_mutations.json", mode="wb") as json_file:
    json_file.write(col_mutation_dict_json)


In [None]:
# Also keep a plain-text dump of the dict (its Python repr) next to the JSON file.
with open("../data/processed/cols_with_mutations.txt", mode="w") as text_file:
    text_file.write(repr(col_mutation_dict))



In [23]:
# Spot check: value counts for a single column.
dict(vcf_snps_df.loc[:, 'NC000962_3.11'].value_counts())

{'A/A': 299, 'C/C': 2}

In [40]:
# Restrict the genotype table to the columns recorded in col_mutation_dict and save it.
vcf_unique_snps_df = vcf_snps_df.loc[:, list(col_mutation_dict)]
vcf_unique_snps_df.to_csv(path_or_buf="../data/processed/vcf_unique_snps_df.csv")
vcf_unique_snps_df.head(n=5)

Unnamed: 0_level_0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148148,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148149,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148151,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C
ERR3148153,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G,...,T/T,G/G,G/G,G/G,C/C,G/G,T/T,G/G,G/G,C/C


In [41]:
# FIXME: zygosity is ignored here. Each genotype call is reduced to one flag that only
# says whether its two alleles differ from each other ("allele-pair heterogeneous").
# A call whose two alleles are equal therefore always maps to 0, whichever base it carries.
# Background on the '*' (spanning/overlapping deletion) allele:
# https://gatk.broadinstitute.org/hc/en-us/articles/360035531912-Spanning-or-overlapping-deletions-allele-

def split_allele_pair(allele_pair):
    """Return the first and last element of a genotype call as ``[first, second]``.

    For a string such as ``'A/C'`` or ``'A|C'`` this yields ``['A', 'C']``; the
    separator in the middle is ignored. Only the first and last characters are
    looked at, so multi-character alleles are not supported.

    Parameters
    ----------
    allele_pair : str or sequence
        Genotype call, or an already split ``[first, second]`` pair.

    Returns
    -------
    list
        Two-element list ``[first_allele, second_allele]``.
    """
    alleles = list(allele_pair)
    return [alleles[0], alleles[-1]]


def compare_alleles(allele_pair):
    """Flag whether the two alleles of a genotype call differ.

    Parameters
    ----------
    allele_pair : str or sequence
        Genotype call such as ``'A/A'``, ``'T|C'`` or ``'./.'``.

    Returns
    -------
    int or float
        ``1`` if the two alleles differ, ``0`` if they are equal, and ``np.nan``
        when the call is missing (``None``/NaN) or either allele is ``'.'``.
    """
    # A missing cell (None, or NaN after a CSV round trip) has no alleles to compare.
    if allele_pair is None or (isinstance(allele_pair, float) and np.isnan(allele_pair)):
        return np.nan

    first_allele, second_allele = split_allele_pair(allele_pair)
    # '.' marks a no-call allele, so the comparison is undefined.
    if first_allele == '.' or second_allele == '.':
        return np.nan
    return 1 if first_allele != second_allele else 0


def is_heterozygous_allele(allele_pair):
    """Scalar convenience wrapper around :func:`compare_alleles`.

    Returns the same ``1`` / ``0`` / ``np.nan`` flag for a single genotype call.
    """
    return compare_alleles(allele_pair)


# NOTE Finalize the rules for reducing the Allele patterns to Homozygous and Heterozygous
def is_heterozygous_vector(allele_vector):
    """Apply :func:`compare_alleles` to every genotype call in ``allele_vector``.

    Parameters
    ----------
    allele_vector : iterable
        Genotype calls, e.g. one DataFrame column as passed by ``DataFrame.apply``.

    Returns
    -------
    list
        One ``1`` / ``0`` / ``np.nan`` flag per input element, in input order.
    """
    return [compare_alleles(allele_pair) for allele_pair in allele_vector]


# list(map(lambda allele_pair: is_heterozygous(allele_pair),
#          vcf_df['ERR3129939'].unique()))

In [132]:
# temp_df = vcf_snps_df.iloc[:10, :10]
# temp_df.to_csv('../data/interim/temp_vcf_snp_df.csv')
# temp_df = pd.read_csv('../data/interim/temp_vcf_snp_df.csv').rename(columns={'Unnamed: 0': 'SampleID'}).set_index('SampleID')


# Load the small test table, using its 'SampleID' column as the index.
temp_df = pd.read_csv('../data/interim/temp_vcf_snp_df.csv', index_col='SampleID')
temp_df

Unnamed: 0_level_0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ERR3129939,A/A,T/C,./.,*|*,C/.,T/T,A/A,C/C,G/G,G/G
ERR3148148,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G
ERR3148149,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G
ERR3148151,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G
ERR3148153,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G
ERR3148155,A/A,T/T,A/A,G/G,C/A,T/A,A/A,C/C,G/G,G/G
ERR3148159,A/A,T/A,A/T,G/C,C/C,T/T,A/A,C/C,G/G,G/G
ERR3148163,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/G,G/G,G/G
ERR3148164,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/C,G/G
ERR3148166,A/A,T/T,A/A,G/G,C/C,T/T,A/A,C/C,G/G,G/G


In [133]:
# Try the column-wise conversion on a single column of the test table.
temp_df.loc[:, ['NC000962_3.78']].apply(is_heterozygous_vector, axis='index')

Unnamed: 0_level_0,NC000962_3.78
SampleID,Unnamed: 1_level_1
ERR3129939,1
ERR3148148,0
ERR3148149,0
ERR3148151,0
ERR3148153,0
ERR3148155,0
ERR3148159,1
ERR3148163,0
ERR3148164,0
ERR3148166,0


In [134]:
# Same conversion applied to every column of the test table.
temp_df.apply(is_heterozygous_vector, axis='index')

Unnamed: 0_level_0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ERR3129939,0,1,,0,,0,0,0,0,0
ERR3148148,0,0,0.0,0,0.0,0,0,0,0,0
ERR3148149,0,0,0.0,0,0.0,0,0,0,0,0
ERR3148151,0,0,0.0,0,0.0,0,0,0,0,0
ERR3148153,0,0,0.0,0,0.0,0,0,0,0,0
ERR3148155,0,0,0.0,0,1.0,1,0,0,0,0
ERR3148159,0,1,1.0,1,0.0,0,0,0,0,0
ERR3148163,0,0,0.0,0,0.0,0,0,1,0,0
ERR3148164,0,0,0.0,0,0.0,0,0,0,1,0
ERR3148166,0,0,0.0,0,0.0,0,0,0,0,0


In [42]:

# Convert each genotype column of the filtered table into 0/1/NaN flags.
binary_unique_snps_df = vcf_unique_snps_df.apply(is_heterozygous_vector, axis='index')
binary_unique_snps_df.head(n=5)

Unnamed: 0_level_0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Save the flag matrix.
binary_unique_snps_df.to_csv(path_or_buf="../data/processed/binary_unique_snps_df.csv")

In [2]:

# Reload the saved flag matrix from disk.
binary_unique_snps_df = pd.read_csv(filepath_or_buffer="../data/processed/binary_unique_snps_df.csv")
binary_unique_snps_df.head(n=5)



KeyboardInterrupt: 

In [9]:
# Make the sample identifiers the index.
# `rename` must be given `columns=`: a bare mapping is applied to the row labels, so the
# intended 'Unnamed: 0' -> 'SampleID' column rename would never take place. When the CSV
# already carries a 'SampleID' header, the rename is simply a no-op.
binary_unique_snps_df = (
    binary_unique_snps_df
    .rename(columns={'Unnamed: 0': 'SampleID'})
    .set_index('SampleID')
)
binary_unique_snps_df.head()


Unnamed: 0_level_0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Spot check: sum of the flags in a single column.
binary_unique_snps_df.loc[:, 'NC000962_3.78'].sum()


1

In [11]:
# Preview the first rows of the flag matrix.
binary_unique_snps_df.head(n=5)

Unnamed: 0_level_0,NC000962_3.11,NC000962_3.78,NC000962_3.80,NC000962_3.82,NC000962_3.88,NC000962_3.102,NC000962_3.104,NC000962_3.108,NC000962_3.117,NC000962_3.120,...,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4410276,NC000962_3.4410309,NC000962_3.4410847,NC000962_3.4411128,NC000962_3.4411245,NC000962_3.4411286,NC000962_3.4411405,NC000962_3.4411518
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Keep only the columns in which at least one sample is flagged (column sum > 0),
# recording the value counts of each kept column.

binary_col_mutation_dict = {}

for col, flags in binary_unique_snps_df.items():
    if flags.sum() > 0:
        binary_col_mutation_dict[col] = dict(flags.value_counts())

binary_col_mutation_dict

{'NC000962_3.78': {0: 300, 1: 1},
 'NC000962_3.80': {0: 300, 1: 1},
 'NC000962_3.102': {0: 300, 1: 1},
 'NC000962_3.104': {0: 300, 1: 1},
 'NC000962_3.117': {0: 300, 1: 1},
 'NC000962_3.120': {0: 300, 1: 1},
 'NC000962_3.135': {0: 300, 1: 1},
 'NC000962_3.138': {0: 300, 1: 1},
 'NC000962_3.150': {0: 300, 1: 1},
 'NC000962_3.155': {0: 300, 1: 1},
 'NC000962_3.156': {0: 300, 1: 1},
 'NC000962_3.159': {0: 300, 1: 1},
 'NC000962_3.160': {0: 300, 1: 1},
 'NC000962_3.187': {0: 299, 1: 2},
 'NC000962_3.189': {0: 299, 1: 2},
 'NC000962_3.192': {0: 299, 1: 2},
 'NC000962_3.204': {0: 300, 1: 1},
 'NC000962_3.207': {0: 300, 1: 1},
 'NC000962_3.210': {0: 300, 1: 1},
 'NC000962_3.213': {0: 300, 1: 1},
 'NC000962_3.219': {0: 299, 1: 2},
 'NC000962_3.225': {0: 299, 1: 2},
 'NC000962_3.228': {0: 299, 1: 2},
 'NC000962_3.231': {0: 299, 1: 2},
 'NC000962_3.237': {0: 300, 1: 1},
 'NC000962_3.246': {0: 300, 1: 1},
 'NC000962_3.255': {0: 300, 1: 1},
 'NC000962_3.267': {0: 300, 1: 1},
 'NC000962_3.270': {0:

In [17]:

# Number of columns that were kept.
len(binary_col_mutation_dict)

118668

In [20]:
# Save only the kept columns of the flag matrix.
binary_unique_snps_df.loc[:, list(binary_col_mutation_dict)].to_csv(
    path_or_buf="../data/processed/hetero_binary_vcf_snps_with_mutations_df.csv"
)

In [10]:
# Reload the filtered flag matrix, indexed by its 'SampleID' column.
hetero_binary_vcf_snps_with_mutations_df = pd.read_csv(
    filepath_or_buffer="../data/processed/hetero_binary_vcf_snps_with_mutations_df.csv",
    index_col='SampleID',
)

# hetero_binary_vcf_snps_with_mutations_df = binary_unique_snps_df[list(binary_col_mutation_dict.keys())]

hetero_binary_vcf_snps_with_mutations_df.head(n=5)


Unnamed: 0_level_0,NC000962_3.78,NC000962_3.80,NC000962_3.102,NC000962_3.104,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,NC000962_3.150,NC000962_3.155,...,NC000962_3.4409993,NC000962_3.4409994,NC000962_3.4410001,NC000962_3.4410033,NC000962_3.4410043,NC000962_3.4410061,NC000962_3.4410065,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4411245
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# (rows, columns) of the filtered flag matrix.
hetero_binary_vcf_snps_with_mutations_df.shape



(301, 118668)