In [5]:
import pandas as pd
import numpy as np

In [6]:
# Local directory that holds the CRyPTIC data tables. The pickle reads in the
# loading cell are resolved against this path, so edit it to match your setup.
file_path = '../../cryptic_data_analysis/data_tables/cryptic-analysis-group'


In [7]:
# Read the CRyPTIC tables and move their index levels into ordinary columns.
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
# only point file_path at trusted copies of these tables.

# Genomes: keep only rows flagged BELONGS_GPI.
genomes = pd.read_pickle(file_path + "/GENOMES.pkl.gz").reset_index(level=0)
genomes = genomes.loc[genomes['BELONGS_GPI'] == True]

# Phenotypes: index level 1 is reset first, then level 0, as in the original
# cell, so the resulting column order is unchanged.
phenotypes = (
    pd.read_pickle(file_path + "/UKMYC_PHENOTYPES.pkl.gz")
    .reset_index(level=1)
    .reset_index(level=0)
)
# .copy() detaches the filtered rows from the unfiltered frame so that the
# in-place recode below is not a chained assignment on a slice
# (which can raise SettingWithCopyWarning).
phenotypes = phenotypes.loc[phenotypes['BELONGS_GPI'] == True].copy()

# Recode INH calls of 'I' as 'R'.
is_inh_intermediate = (phenotypes['DRUG'] == 'INH') & (phenotypes['BINARY_PHENOTYPE'] == 'I')
phenotypes.loc[is_inh_intermediate, 'BINARY_PHENOTYPE'] = 'R'

# Fluoroquinolone subset: levofloxacin (LEV) and moxifloxacin (MXF) rows only.
fq_phenotypes = phenotypes.loc[phenotypes.DRUG.isin(['LEV', 'MXF'])]

# Samples: index levels 2, 1 and 0 are reset in that order.
samples = (
    pd.read_pickle(file_path + "/SAMPLES.pkl.gz")
    .reset_index(level=2)
    .reset_index(level=1)
    .reset_index(level=0)
)

mutations = pd.read_pickle(file_path + "/MUTATIONS_GPI.pkl.gz").reset_index()

# These two CSVs are read from the notebook's working directory, not file_path.
hets = pd.read_csv('het_mutations_dp2.csv')

features = pd.read_csv("STRUCTURE_BASED_FEATURES_ALL.csv")

In [4]:
# Lineage-defining gyrase mutations that are excluded from the analysis.
# Reference for the lineage mutations: Miotto ERJ 2017.
# Labels are the gene name followed by the mutation; the order of the
# resulting list is gyrA entries first, then gyrB entries.
lin_muts = [
    'gyrA' + change
    for change in (
        'S95T', 'E21Q', 'T80A', 'A90G', 'G247S', 'A384V', 'G668D',
        'L712V', 'S250A', 'R252L', 'L398F', 'A463S', 'D639A', 'V742L',
    )
] + [
    'gyrB' + change
    for change in ('V301L', 'M291I', 'A403S')
]

In [5]:
# Build a dataframe giving the RIF / INH resistance background of each isolate.
#
# A single grouped pass counts, per UNIQUEID, how many phenotype rows fall in
# each (drug, call) combination. This replaces re-filtering the whole
# phenotype table once per isolate, which scaled quadratically.
is_rif = phenotypes['DRUG'] == 'RIF'
is_inh = phenotypes['DRUG'] == 'INH'
is_r = phenotypes['BINARY_PHENOTYPE'] == 'R'
is_s = phenotypes['BINARY_PHENOTYPE'] == 'S'

# Isolates in order of first appearance; this fixes the output row order.
uid = phenotypes['UNIQUEID'].unique()

call_counts = (
    pd.DataFrame({
        'UNIQUEID': phenotypes['UNIQUEID'].to_numpy(),
        'RIF_R': (is_rif & is_r).astype('int64').to_numpy(),
        'RIF_S': (is_rif & is_s).astype('int64').to_numpy(),
        'INH_R': (is_inh & is_r).astype('int64').to_numpy(),
        'INH_S': (is_inh & is_s).astype('int64').to_numpy(),
    })
    .groupby('UNIQUEID', sort=False)
    .sum()
    # Align to `uid`; an isolate with no countable rows gets zeros -> 'UNKNOWN'.
    .reindex(uid, fill_value=0)
)
rif_r = call_counts['RIF_R'].to_numpy()
rif_s = call_counts['RIF_S'].to_numpy()
inh_r = call_counts['INH_R'].to_numpy()
inh_s = call_counts['INH_S'].to_numpy()

# np.select returns the label of the FIRST condition that holds, so the list
# order below is the priority order of the classification.
# NOTE(review): 'MDR' requires exactly two resistant RIF/INH rows in total;
# this presumes at most one row per (UNIQUEID, DRUG) -- confirm upstream.
background = np.select(
    [
        (rif_r + inh_r) == 2,
        (rif_r == 1) & (inh_s == 1),
        (rif_s == 1) & (inh_r == 1),
        (rif_s == 1) & (inh_s == 1),
    ],
    ['MDR', 'RIF_MONOR', 'INH_MONOR', 'INH_AND_RIF_S'],
    default='UNKNOWN',
)
background_df = pd.DataFrame({'UNIQUEID': uid, 'BACKGROUND': background})

background_df.BACKGROUND.value_counts()

INH_AND_RIF_S    5788
MDR              4353
INH_MONOR        1470
UNKNOWN           445
RIF_MONOR         302
Name: BACKGROUND, dtype: int64

In [6]:
# Filter mutations down to those in DNA gyrase and label them homogeneous
# (IS_HET=0).
# Rows kept: gene is gyrA or gyrB, not synonymous, not in the promoter,
# a SNP, not null, and passing the filter flag.
is_gyrase_snp = (
    mutations.GENE.isin(['gyrA', 'gyrB'])
    & (mutations.IS_SYNONYMOUS == False)
    & (mutations.IN_PROMOTER == False)
    & (mutations.IS_SNP == True)
    & (mutations.IS_NULL == False)
    & (mutations.IS_FILTER_PASS == True)
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `mutations` (the cause of SettingWithCopyWarning).
gyrase_mutations = mutations.loc[is_gyrase_snp].copy()

# GENEMUT is the gene name concatenated with the mutation, e.g. 'gyrAA90V'.
gyrase_mutations["GENEMUT"] = gyrase_mutations['GENE'].astype('str') + gyrase_mutations['MUTATION']

# Drop lineage-defining mutations; `~` is the boolean-mask negation operator.
gyrase_mutations = gyrase_mutations.loc[~gyrase_mutations.GENEMUT.isin(lin_muts)].copy()
gyrase_mutations['IS_HET'] = 0

gyrase_mutations = gyrase_mutations[['UNIQUEID', 'GENE', 'MUTATION', 'POSITION', "GENEMUT", "IS_HET"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gyrase_mutations["GENEMUT"]=gyrase_mutations['GENE'].astype('str')+gyrase_mutations['MUTATION']


In [7]:
# Build the complement to the homogeneous mutation table from the `hets`
# table (described in the original note as gyrase mutations with FRS < 0.9),
# removing synonymous changes and stop codons, and labelling rows IS_HET=1.
#
# Labels are sliced positionally: characters 0-3 are taken as the gene,
# character 4 as the reference residue, the last character as the alternate
# residue, and the characters in between as the position.
# NOTE(review): this presumes a four-character gene prefix (gyrA / gyrB).
def _is_usable_het(text):
    """Return True when the HET_MUTATION label `text` should be kept.

    Checks run in the same order as the original cell, so a malformed
    (too short) label raises IndexError at the same point as before.
    """
    return (
        text != 'nan'                   # missing value stringified by str()
        and text[4] != text[-1]         # reference == alternate: synonymous
        and text[-1] != 'O'             # presumably a filter-fail call -- confirm
        and text[5] != '-'              # presumably a negative (promoter) position -- confirm
        and text[-1] != 'X'             # presumably a null call -- confirm
        and text[-4:] != 'STOP'         # mutation to a stop codon
        and text[4] != '!'              # presumably a stop-codon symbol -- confirm
        and text[4:8] != 'STOP'         # mutation from a stop codon
    )


# One pass over just the two columns that are used. Each label is stringified
# once instead of on every comparison.
het_records = []
for isolate, label in zip(hets['UNIQUEID'], hets['HET_MUTATION']):
    text = str(label)
    if _is_usable_het(text):
        het_records.append(
            (isolate, text[0:4], text[4:], int(text[5:-1]), text, 1)
        )

new_hets = pd.DataFrame(
    het_records,
    columns=['UNIQUEID', 'GENE', 'MUTATION', 'POSITION', 'GENEMUT', 'IS_HET'],
)

# Remove lineage-specific mutations; `~` is the boolean-mask negation operator.
new_hets = new_hets.loc[~new_hets.GENEMUT.isin(lin_muts)]

In [9]:
# Combine homogeneous mutations with the mutations that have FRS < 0.9.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; original row labels are kept, as append did.
# NOTE: this rebinds gyrase_mutations to itself plus new_hets, so re-running
# this cell alone stacks new_hets a second time.
gyrase_mutations = pd.concat([gyrase_mutations, new_hets])

In [10]:
# Remove low-quality phenotypes and keep only the columns needed downstream.
not_low_quality = fq_phenotypes['PHENOTYPE_QUALITY'] != 'LOW'
phenotype_columns = ['UNIQUEID', 'DRUG', 'MIC', 'LOG2MIC', 'BINARY_PHENOTYPE', 'PHENOTYPE_QUALITY']
fq_phenotypes = fq_phenotypes.loc[not_low_quality]
fq_phenotypes = fq_phenotypes[phenotype_columns]

In [25]:
# Assemble one row per LEV/MXF phenotype carrying the sample UNIQUEID,
# lineage, country of origin and RIF/INH background.
# Every join is a left join, so no row of the left-hand table is dropped.
gs = genomes.merge(samples, how='left', on=['SITEID', 'SUBJID', 'LABID'])
gs = gs[['UNIQUEID', 'LINEAGE_NAME', 'COUNTRY_WHERE_SAMPLE_TAKEN']]
gs = gs.merge(background_df, how='left', on=['UNIQUEID'])
pgs = fq_phenotypes.merge(gs, how='left', on=['UNIQUEID'])
pgs

Unnamed: 0,UNIQUEID,DRUG,MIC,LOG2MIC,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,LINEAGE_NAME,COUNTRY_WHERE_SAMPLE_TAKEN,BACKGROUND
0,site.06.subj.06TB_1032.lab.06MIL2037.iso.1,MXF,0.25,-2.00,S,HIGH,Lineage 4,,INH_MONOR
1,site.06.subj.06TB_0404.lab.06MIL1373.iso.1,MXF,0.25,-2.00,S,HIGH,Lineage 4,,INH_AND_RIF_S
2,site.06.subj.SSM_0197-R.lab.06MIL0117.iso.1,MXF,<=0.06,-4.06,S,HIGH,Lineage 3,PAK,INH_AND_RIF_S
3,site.04.subj.01328.lab.722926.iso.1,LEV,0.5,-1.00,S,HIGH,Lineage 2,IND,INH_AND_RIF_S
4,site.03.subj.T1069.lab.T1069.iso.1,LEV,0.5,-1.00,S,HIGH,Lineage 4,TJK,RIF_MONOR
...,...,...,...,...,...,...,...,...,...
17598,site.05.subj.CA-1473.lab.CO-03323-19.iso.1,MXF,0.5,-1.00,S,HIGH,Lineage 4,PER,INH_AND_RIF_S
17599,site.02.subj.0910.lab.22A144.iso.1,LEV,4.0,2.00,R,HIGH,Lineage 2,CHN,MDR
17600,site.05.subj.CA-0335.lab.CO-08338-18.iso.1,MXF,0.5,-1.00,S,HIGH,Lineage 4,PER,INH_AND_RIF_S
17601,site.02.subj.0016.lab.2014231005.iso.1,LEV,0.25,-2.00,S,HIGH,Lineage 2,CHN,INH_AND_RIF_S


In [26]:
# Attach the gyrase mutations found in each isolate (left join, so isolates
# without any mutation keep a row with missing mutation columns), then
# discard every row that has no MIC.
pgsm = pgs.merge(gyrase_mutations, how='left', on='UNIQUEID')
mic_nan = pgsm.index[pgsm['MIC'].isna().to_numpy()]
pgsm = pgsm.drop(index=mic_nan)

In [27]:
# Count the number of distinct DNA gyrase mutations per sample.
# groupby/transform computes the distinct-GENEMUT count once per UNIQUEID and
# broadcasts it back to every row of that isolate; this replaces filtering
# the whole frame once per row (quadratic in the number of rows).
# 'nunique' ignores missing values, so isolates with no mutation get 0.
# NOTE(review): assumes UNIQUEID is never missing in pgsm -- confirm.
pgsm['NUMBER_OF_MUTATIONS'] = pgsm.groupby('UNIQUEID', sort=False)['GENEMUT'].transform('nunique')
pgsm

Unnamed: 0,UNIQUEID,DRUG,MIC,LOG2MIC,BINARY_PHENOTYPE,PHENOTYPE_QUALITY,LINEAGE_NAME,COUNTRY_WHERE_SAMPLE_TAKEN,BACKGROUND,GENE,MUTATION,POSITION,GENEMUT,IS_HET,NUMBER_OF_MUTATIONS
0,site.06.subj.06TB_1032.lab.06MIL2037.iso.1,MXF,0.25,-2.00,S,HIGH,Lineage 4,,INH_MONOR,,,,,,0
1,site.06.subj.06TB_0404.lab.06MIL1373.iso.1,MXF,0.25,-2.00,S,HIGH,Lineage 4,,INH_AND_RIF_S,,,,,,0
2,site.06.subj.SSM_0197-R.lab.06MIL0117.iso.1,MXF,<=0.06,-4.06,S,HIGH,Lineage 3,PAK,INH_AND_RIF_S,,,,,,0
3,site.04.subj.01328.lab.722926.iso.1,LEV,0.5,-1.00,S,HIGH,Lineage 2,IND,INH_AND_RIF_S,,,,,,0
4,site.03.subj.T1069.lab.T1069.iso.1,LEV,0.5,-1.00,S,HIGH,Lineage 4,TJK,RIF_MONOR,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18767,site.05.subj.CA-1473.lab.CO-03323-19.iso.1,MXF,0.5,-1.00,S,HIGH,Lineage 4,PER,INH_AND_RIF_S,,,,,,0
18768,site.02.subj.0910.lab.22A144.iso.1,LEV,4.0,2.00,R,HIGH,Lineage 2,CHN,MDR,gyrA,A90V,90.0,gyrAA90V,0.0,1
18769,site.05.subj.CA-0335.lab.CO-08338-18.iso.1,MXF,0.5,-1.00,S,HIGH,Lineage 4,PER,INH_AND_RIF_S,,,,,,0
18770,site.02.subj.0016.lab.2014231005.iso.1,LEV,0.25,-2.00,S,HIGH,Lineage 2,CHN,INH_AND_RIF_S,,,,,,0


In [28]:
# Attach the structure-based features used for ML to every mutation row,
# matching on gene and mutation; rows without a match keep missing features.
final_df = pgsm.merge(features, how='left', on=['GENE', 'MUTATION'])
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18412 entries, 0 to 18411
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   UNIQUEID                         18412 non-null  object  
 1   DRUG                             18412 non-null  category
 2   MIC                              18412 non-null  category
 3   LOG2MIC                          18412 non-null  float64 
 4   BINARY_PHENOTYPE                 18412 non-null  category
 5   PHENOTYPE_QUALITY                18412 non-null  category
 6   LINEAGE_NAME                     18412 non-null  category
 7   COUNTRY_WHERE_SAMPLE_TAKEN       15147 non-null  category
 8   BACKGROUND                       18412 non-null  object  
 9   GENE                             6391 non-null   object  
 10  MUTATION                         6391 non-null   object  
 11  POSITION_x                       6391 non-null   float64 
 12  GENE

In [29]:
# Rename columns: strip the merge suffix from POSITION_x and relabel the
# MAPP_SCORE column as SNAP2_SCORE.
# NOTE(review): confirm that MAPP_SCORE really holds SNAP2 scores.
column_renames = {
    "POSITION_x": "POSITION",
    "MAPP_SCORE": "SNAP2_SCORE",
}
final_df = final_df.rename(columns=column_renames)

In [30]:
# Write the assembled ML table to the working directory, without the row index.
final_df.to_csv(path_or_buf='ML_DATA_HIGHQ.csv', index=False)