In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import os
from os import listdir
import random

# Constructing Gene Sets Using Shortest Path from an AMR gene to conserved gene

The following steps construct the gene sets used to run the model from Nguyen et al., based on the shortest-path length from each conserved gene to an AMR gene. We work with the following subspecies, chosen because they have the same RefSeq in the PATRIC database and in STRING:

- Klebsiella pneumoniae subsp. pneumoniae MGH 78578
- Mycobacterium tuberculosis H37Rv
- Salmonella enterica subsp. enterica serovar Typhimurium str. LT2
- Staphylococcus aureus subsp. aureus NCTC 8325

In [195]:
# Genera handled by this notebook; the position in this list is the index
# used to select which dataset directory is processed.
bacteria = [
    'Mycobacterium',   # index 0
    'Klebsiella',      # index 1
    'Salmonella',      # index 2
    'Staphylococcus',  # index 3
]

To construct gene sets for a specific bacterium, modify the index below according to the list above.

In [221]:
# Data directory of the selected genus: <current working directory>/<genus name>.
# os.path.join inserts the separator of the host OS; the previous hard-coded
# '\\' produced a valid path on Windows only (where the result is unchanged).
root = os.path.join(os.getcwd(), bacteria[3])

# Data From PATRIC

## Genomic Features

The table below contains a list of genomic features, including coding DNA.

Each feature is identified by a BRC ID from PATRIC and is associated with a protein family, referred to as a PATRIC genus-specific family (PLfam). Most of the genes have an associated Protein ID.

In [222]:
# Genomic feature table exported from PATRIC for the selected genus.
# The path is built with os.path.join instead of a hard-coded '\\' separator.
features = pd.read_csv(os.path.join(root, 'genome_features.csv'))

In [223]:
# Print the column names, non-null counts and dtypes of the feature table.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Genome                                   2755 non-null   object 
 1   Genome ID                                2755 non-null   float64
 2   Accession                                2755 non-null   object 
 3   BRC ID                                   2755 non-null   object 
 4   RefSeq Locus Tag                         2681 non-null   object 
 5   Alt Locus Tag                            2755 non-null   object 
 6   Feature ID                               2755 non-null   object 
 7   Annotation                               2755 non-null   object 
 8   Feature Type                             2755 non-null   object 
 9   Start                                    2755 non-null   int64  
 10  End                                      2755 no

From this table, we extract the data needed to map the protein families referred to by Nguyen et al.:

In [224]:
# BRC ID -> PLfam lookup table: both columns are cast to the pandas "string"
# dtype, renamed, and the BRC ID becomes the index.
plf = (
    features[['BRC ID', 'PATRIC genus-specific families (PLfams)']]
    .astype("string")
    .rename(columns={
        'BRC ID': 'BRC_ID',
        'PATRIC genus-specific families (PLfams)': 'PLFam',
    })
    .set_index('BRC_ID')
)
plf

Unnamed: 0_level_0,PLFam
BRC_ID,Unnamed: 1_level_1
fig|93061.5.peg.943,PLF_1279_00001903
fig|93061.5.peg.944,PLF_1279_00000989
fig|93061.5.peg.945,PLF_1279_00001254
fig|93061.5.peg.946,PLF_1279_00000620
fig|93061.5.peg.947,PLF_1279_00085250
...,...
fig|93061.5.peg.83,PLF_1279_00002111
fig|93061.5.peg.939,PLF_1279_00000867
fig|93061.5.peg.940,PLF_1279_00000994
fig|93061.5.peg.941,PLF_1279_00000907


## Specialty Genes

A table containing the AMR genes for this species according to PATRIC.

In [225]:
# RefSeq locus tags of the genes listed in the PATRIC specialty-gene table,
# kept as a one-column frame named 'AMR_RefSeq'.
# Only the needed column is read, and the path is built with os.path.join
# rather than a hard-coded '\\' separator.
AMR_refseq = (
    pd.read_csv(os.path.join(root, 'specialty_genes.csv'), usecols=['RefSeq Locus Tag'])
    .rename(columns={'RefSeq Locus Tag': 'AMR_RefSeq'})
)

## Data from Nguyen et. al.

In [226]:
# Collect every distinct FASTA header found in the per-strain files of the
# fasta.500.0 directory; the headers are used as protein family IDs.
datadir = os.path.join(root, 'Nguyen_et_al_2020', 'fasta.500.0')

plf_ids = set()
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line.startswith('>'):
                # Strip only the line terminator. Slicing off the last character
                # unconditionally would truncate the ID when the final line of a
                # file has no trailing newline.
                plf_ids.add(line[1:].rstrip('\n'))

# Sorted, de-duplicated IDs in a single-column frame.
plf_500 = pd.DataFrame({'Protein Family ID': sorted(plf_ids)})

## RefSeq Mapping

In [227]:
# BRC ID -> RefSeq locus tag lookup table, indexed by BRC ID.
refseq = (
    features[['BRC ID', 'RefSeq Locus Tag']]
    .rename(columns={'BRC ID': 'BRC_ID', 'RefSeq Locus Tag': 'RefSeq'})
    .set_index('BRC_ID')
)

In [228]:
# Keep only the features whose PLfam appears in plf_500, as (RefSeq, PLF) pairs.
# Rows with a missing value in either column are discarded and the index is
# renumbered from zero.
in_plf_500 = features['PATRIC genus-specific families (PLfams)'].isin(plf_500['Protein Family ID'])
plf_map_refseq = (
    features.loc[in_plf_500, ['RefSeq Locus Tag', 'PATRIC genus-specific families (PLfams)']]
    .rename(columns={
        'RefSeq Locus Tag': 'RefSeq',
        'PATRIC genus-specific families (PLfams)': 'PLF',
    })
    .dropna()
    .reset_index(drop=True)
)

In [229]:
# Display the RefSeq / PLF mapping table.
plf_map_refseq

Unnamed: 0,RefSeq,PLF
0,SAOUHSC_01030,PLF_1279_00001903
1,SAOUHSC_01038,PLF_1279_00000817
2,SAOUHSC_01044,PLF_1279_00002027
3,SAOUHSC_01045,PLF_1279_00062515
4,SAOUHSC_01047,PLF_1279_00000667
...,...,...
494,SAOUHSC_01011,PLF_1279_00000821
495,SAOUHSC_01016,PLF_1279_00000658
496,SAOUHSC_01019,PLF_1279_00001408
497,SAOUHSC_01021,PLF_1279_00000378


## Protein Interaction Network

In [230]:
# PATRIC protein-protein interactions: keep the two interactor ID columns,
# cast them to the pandas "string" dtype and give them underscore names.
# The path is built with os.path.join instead of a hard-coded '\\' separator.
ppi_patric = (
    pd.read_csv(os.path.join(root, 'ppi_patric.csv'))[['Interactor A ID', 'Interactor B ID']]
    .astype("string")
    .rename(columns={
        'Interactor A ID': 'Interactor_A_ID',
        'Interactor B ID': 'Interactor_B_ID',
    })
)

In [231]:
# Translate both interactor columns from BRC IDs to RefSeq locus tags and
# drop every interaction in which either end cannot be translated.
#
# Differences from the previous row-by-row loop:
#   * works on a copy, so ppi_patric keeps its BRC IDs and this cell can be
#     re-run without first reloading ppi_patric;
#   * a vectorised map replaces the per-row `in list(refseq.index)` scan;
#   * interactions whose BRC ID exists but has no RefSeq locus tag are dropped
#     instead of being kept with a missing value as interactor ID;
#   * a BRC ID listed more than once in refseq uses its first RefSeq tag.
brc_to_refseq = refseq['RefSeq']
brc_to_refseq = brc_to_refseq[~brc_to_refseq.index.duplicated(keep='first')]

ppi_refseq = ppi_patric.copy()
for interactor_column in ('Interactor_A_ID', 'Interactor_B_ID'):
    # IDs absent from the lookup map to a missing value; keep the "string" dtype.
    ppi_refseq[interactor_column] = (
        ppi_refseq[interactor_column].map(brc_to_refseq).astype("string")
    )

ppi_refseq = (
    ppi_refseq
    .dropna(subset=['Interactor_A_ID', 'Interactor_B_ID'])
    .reset_index(drop=True)
)

In [232]:
# STRING interaction table (space separated): two interactor IDs and a weight.
ppi_string = pd.read_csv(os.path.join(root, 'ppi_string.txt'), sep = ' ')
ppi_string.columns = ['Interactor_A_ID', 'Interactor_B_ID', 'weight']

# Remove the leading "<taxon id>." prefix from the interactor IDs so that they
# can be compared with RefSeq locus tags. The pattern is anchored and the dot is
# escaped; the previous pattern '93061.' only handled one taxon ID and its
# unescaped dot matched any character anywhere in the value.
string_id_columns = ['Interactor_A_ID', 'Interactor_B_ID']
ppi_string[string_id_columns] = ppi_string[string_id_columns].replace(
    r'^\d+\.', '', regex = True
)

In [233]:
# Stack the PATRIC and STRING interactions into a single edge list that holds
# only the two interactor columns, renumbered from zero.
stacked_interactions = pd.concat([ppi_refseq, ppi_string], axis = 0)
ppi = pd.DataFrame(
    stacked_interactions[['Interactor_A_ID', 'Interactor_B_ID']]
).reset_index(drop = True)

In [234]:
# Conserved genes (RefSeq tags from plf_map_refseq) that occur in the
# interaction table, either as interactor A or as interactor B.
conserved_tags = plf_map_refseq['RefSeq']
conserved_ppi_A = conserved_tags[conserved_tags.isin(ppi['Interactor_A_ID'])]
conserved_ppi_B = conserved_tags[conserved_tags.isin(ppi['Interactor_B_ID'])]

# A-side matches come first, then B-side matches; repeated tags keep their
# first occurrence.
conserved_ppi = (
    pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0)
    .drop_duplicates()
    .to_frame()
)

In [235]:
# AMR genes that occur in the interaction table, either as interactor A or as
# interactor B. Duplicates are still present at this point.
amr_tags = AMR_refseq['AMR_RefSeq']
AMR_ppi_A = amr_tags[amr_tags.isin(ppi['Interactor_A_ID'])]
AMR_ppi_B = amr_tags[amr_tags.isin(ppi['Interactor_B_ID'])]

AMR_ppi = (
    pd.concat([AMR_ppi_A, AMR_ppi_B], axis = 0)
    .to_frame()
    .reset_index(drop = True)
)

In [236]:
# Number of distinct (non-missing) AMR RefSeq tags collected in AMR_ppi.
AMR_ppi['AMR_RefSeq'].nunique()

43

In [237]:
# Keep a single row per AMR gene (first occurrence wins; index labels are kept).
AMR_ppi = AMR_ppi.drop_duplicates(keep = 'first')

In [238]:
# Result table: one row per conserved gene, plus a column that is filled later
# with the length of its shortest path to an AMR gene.
ppi_info = pd.DataFrame(
    columns = [
        'Conserved Gene',
        'Shortest Path to an AMR gene (length)',
    ]
)

# Rows are numbered 0..n-1 in the order of conserved_ppi.
ppi_info['Conserved Gene'] = conserved_ppi['RefSeq'].reset_index(drop = True)

In [239]:
# Build the interaction graph: one edge per row of ppi, joining interactor A
# and interactor B.
ppi_graph = nx.from_pandas_edgelist(
    ppi,
    source = 'Interactor_A_ID',
    target = 'Interactor_B_ID',
)

In [240]:
# Number of edges in the interaction graph.
ppi_graph.number_of_edges()

32881

In [241]:
# Number of nodes in the interaction graph.
ppi_graph.number_of_nodes()

1893

In [242]:
# For every conserved gene, store the length (number of edges) of the shortest
# path to its nearest AMR gene. Genes with no path to any AMR gene keep their
# missing value.
#
# One breadth-first search is run per AMR gene and the minimum distance per
# node is kept. This gives the same lengths as testing every
# (conserved gene, AMR gene) pair with has_path + shortest_path_length, but
# with far fewer graph traversals.
# NOTE(review): this relies on ppi_graph being undirected, so that the distance
# measured from the AMR gene equals the distance measured from the conserved gene.
nearest_amr_distance = {}
for amr_gene in AMR_ppi['AMR_RefSeq'].drop_duplicates():
    if amr_gene not in ppi_graph:
        continue  # not a node of the graph, so nothing is reachable from it
    reachable = nx.single_source_shortest_path_length(ppi_graph, amr_gene)
    for gene, distance in reachable.items():
        if distance < nearest_amr_distance.get(gene, float('inf')):
            nearest_amr_distance[gene] = distance

# Row labels of ppi_info follow the order of conserved_ppi (0..n-1).
# .at writes into ppi_info itself; the former chained assignment
# ppi_info[col][idx] = ... may write to a temporary copy.
for idx, conserved_gene in enumerate(conserved_ppi['RefSeq']):
    if conserved_gene in nearest_amr_distance:
        ppi_info.at[idx, 'Shortest Path to an AMR gene (length)'] = nearest_amr_distance[conserved_gene]

In [243]:
# Index the RefSeq -> PLF table by RefSeq tag so that PLFs can be looked up by
# gene. The guard makes the cell safe to re-run: once 'RefSeq' is the index it
# is no longer a column, and set_index would raise a KeyError.
if 'RefSeq' in plf_map_refseq.columns:
    plf_map_refseq = plf_map_refseq.set_index('RefSeq')

In [244]:
# For each distinct path length, publish a global Series named
# plf_length_<length> that holds the PLF of every conserved gene at that length.
path_length_by_gene = ppi_info['Shortest Path to an AMR gene (length)']
for path_length in path_length_by_gene.unique():
    genes_at_length = ppi_info.loc[path_length_by_gene == path_length, 'Conserved Gene']
    # Only genes that are present in the RefSeq -> PLF table can be looked up.
    known_genes = plf_map_refseq.index.intersection(genes_at_length)
    globals()[f'plf_length_{path_length}'] = plf_map_refseq.loc[known_genes.tolist(), 'PLF']

In [151]:
def _iter_fasta_records(handle):
    """Yield (header, sequence_text) for each FASTA record read from `handle`.

    `header` is the header line without the leading '>' and without the line
    terminator. `sequence_text` is the concatenation of the following lines,
    line terminators included. Lines before the first header are ignored.
    """
    header, chunks = None, []
    for line in handle:
        if line.startswith('>'):
            if header is not None:
                yield header, ''.join(chunks)
            header, chunks = line[1:].rstrip('\n'), []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)


# Output layout: <root>/GeneSets/PathLengths/Length_<n>/<strain file>.
# Each output file holds the records of the matching input strain file whose
# header is one of the PLFs stored in the global plf_length_<n>.
bacdir = os.path.join(root, 'GeneSets', 'PathLengths')
# makedirs also creates the missing 'GeneSets' parent and tolerates a re-run.
os.makedirs(bacdir, exist_ok = True)

sample = os.path.join(root, 'Nguyen_et_al_2020', 'fasta.500.0')

# Genes without any path to an AMR gene have a missing length; they are
# skipped, since they belong to no path-length group.
for i in ppi_info['Shortest Path to an AMR gene (length)'].dropna().unique():
    lendir = os.path.join(bacdir, f'Length_{i}')
    os.makedirs(lendir, exist_ok = True)

    # Build the lookup once per length instead of once per header line.
    wanted_plfams = set(globals()[f'plf_length_{i}'])

    for strain in listdir(sample):
        # 'w' instead of 'a': a re-run replaces the file rather than appending
        # the same records a second time.
        with open(os.path.join(sample, strain), 'r') as sequences, \
                open(os.path.join(lendir, strain), 'w') as mystrain:
            for plfam, seq in _iter_fasta_records(sequences):
                if plfam in wanted_plfams:
                    mystrain.write('>' + plfam + '\n')
                    mystrain.write(seq)

At this point, it is possible to use this new configuration of data to run the model

Klebsiella: we can take 25 genes from each strain for each of the path lengths (1, 2, 3 and 4), because the results in the paper are also derived from groups of 25 genes; hence, we can compare these results.

Salmonella: note that we have 10 genes with length zero. This means that there are 10 conserved genes in the PPI that are annotated as AMR genes today, but were not when the data from the paper was collected. Let's make groups of 10 genes to compare with the group of 10 AMR genes.

In [245]:
print(ppi_info.groupby(['Shortest Path to an AMR gene (length)']).size().reset_index(name='Count'))

   Shortest Path to an AMR gene (length)  Count
0                                      0      1
1                                      1    171
2                                      2    192
3                                      3     22


In [126]:
def _iter_raw_fasta_records(handle):
    """Yield (header_line, sequence_text) for each FASTA record in `handle`.

    `header_line` is the unmodified header line (leading '>' and line
    terminator included). `sequence_text` is the concatenation of the lines
    that follow it. Lines before the first header are ignored.
    """
    header_line, chunks = None, []
    for line in handle:
        if line.startswith('>'):
            if header_line is not None:
                yield header_line, ''.join(chunks)
            header_line, chunks = line, []
        elif header_line is not None:
            chunks.append(line)
    if header_line is not None:
        yield header_line, ''.join(chunks)


dir_25genes = os.path.join(bacdir, '25genes')
os.makedirs(dir_25genes, exist_ok = True)  # tolerate a re-run of this cell

# Number of records available in each path-length group and number of records
# to draw from each group.
# NOTE(review): these sizes are hard-coded per organism and must match the
# record counts of the Length_<n> files of the organism selected above.
#
# Klebsiella (draw 25 per group):
# GROUP_SIZES = [29, 52, 82, 36]
# GENES_PER_GROUP = 25
#
# Salmonella (draw 10 per group):
GROUP_SIZES = [10, 21, 42, 77, 49, 43, 15, 13]
GENES_PER_GROUP = 10

# Dedicated, seeded generator: the drawn indices are the same on every run and
# the state of the global `random` module is left untouched.
SAMPLING_SEED = 0
_sampling_rng = random.Random(SAMPLING_SEED)

# rand_idx[k] holds the sorted record positions kept for the k-th group.
rand_idx = [
    sorted(_sampling_rng.sample(range(group_size), GENES_PER_GROUP))
    for group_size in GROUP_SIZES
]

# NOTE(review): rand_idx[i-1] is applied to directory Length_<i>, starting at
# i = 1. If the first size refers to the length-0 group, this mapping is off by
# one -- confirm against the Length_<n> directories before relying on it.
for i in range(1, len(rand_idx)+1):

    lendir = os.path.join(bacdir, f'Length_{i}')
    repdir = os.path.join(dir_25genes, f'length.{i}')
    os.makedirs(repdir, exist_ok = True)

    kept_positions = set(rand_idx[i-1])

    for strain in listdir(lendir):
        # 'w' instead of 'a': a re-run replaces the file rather than appending
        # the same records a second time.
        with open(os.path.join(lendir, strain), 'r') as sequences, \
                open(os.path.join(repdir, strain), 'w') as mystrain:
            # c is the zero-based position of the record inside the strain file.
            for c, (plfam, seq) in enumerate(_iter_raw_fasta_records(sequences)):
                if c in kept_positions:
                    mystrain.write(plfam)
                    mystrain.write(seq)

# Randomizing AMR labels