In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import os
import random

In [5]:
features = pd.read_csv('genome_features.csv')

In [7]:
plf = features[['BRC ID', 'PATRIC genus-specific families (PLfams)']].astype("string")
plf.columns = ['BRC_ID', 'PLFam']
plf.set_index('BRC_ID', inplace = True)

In [8]:
specialty_genes = pd.read_csv('specialty_genes.csv')
specialty_genes = specialty_genes[['BRC ID', 'Property']]
specialty_genes.columns = ['BRC_ID', 'Property']
specialty_genes.set_index('BRC_ID', inplace = True)
specialty_genes.Property.unique()
AMR_genes = pd.DataFrame(specialty_genes.loc[specialty_genes.Property == 'Antibiotic Resistance'].reset_index()['BRC_ID'])

# Running the model with different groups of genes

Once we did not have a satisfactory result calculating correlation, we will try to look ate the model performance using groups of genes constructed according to the path length to an AMR gene. 

However, we can not use different replicates because each strain in a different replicate has different protein families associated. So, the separation considering different replicates will gives different samples and we need to have the same features on each strain.

## Constructing sets of genes according to the path length

From now one, we will consider the set of strains using 500 conserved genes.

Taking protein families from this experiment set:

In [3]:
import os
from os import listdir

plf_500 = []

#path to original dataset
datadir = 'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus/fasta.500.0'
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line[0] == '>':
                plf_500.append(line[1:len(line)-1])
                
plf_500 = pd.DataFrame(np.unique(plf_500))
plf_500.columns = ['Protein Family ID']

Once the information used in the paper from Nguyen is given in therms of protein families, we need to associate every feature for a corresponding protein family.

Constructing the PPI in terms of protein families for conserved genes:

In [9]:
ppi = pd.read_csv('ppi_patric.csv')
ppi = ppi[['Interactor A ID', 'Interactor B ID']].astype("string")
ppi.columns = ['Interactor_A_ID', 'Interactor_B_ID']
ppi.drop(2085, axis = 0, inplace = True)
ppi.reset_index(drop=True, inplace=True)

ppi_plfams = ppi

for i in range(len(ppi['Interactor_A_ID'])):
    if plf.loc[ppi['Interactor_A_ID'][i]].isin(plf_500['Protein Family ID']).bool():
        ppi_plfams.at[i, 'Interactor_A_ID'] = plf.loc[ppi['Interactor_A_ID'][i]].PLFam
        
for i in range(len(ppi['Interactor_B_ID'])):
    if plf.loc[ppi['Interactor_B_ID'][i]].isin(plf_500['Protein Family ID']).bool():
        ppi_plfams.at[i, 'Interactor_B_ID'] = plf.loc[ppi['Interactor_B_ID'][i]].PLFam        
        
ppi_plfams.drop_duplicates(subset=None, keep='first', inplace=True)

Checking which protein families are in PPI:

In [11]:
conserved_ppi_A = plf_500[plf_500['Protein Family ID'].isin(ppi['Interactor_A_ID'])]['Protein Family ID']
conserved_ppi_B = plf_500[plf_500['Protein Family ID'].isin(ppi['Interactor_B_ID'])]['Protein Family ID']

conserved_ppi = pd.DataFrame(pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0).drop_duplicates())

Checking which resistant genes are in PPI:

In [12]:
AMR_genes_ppi_A = AMR_genes[AMR_genes['BRC_ID'].isin(ppi['Interactor_A_ID'])]['BRC_ID']
AMR_genes_ppi_B = AMR_genes[AMR_genes['BRC_ID'].isin(ppi['Interactor_B_ID'])]['BRC_ID']

AMR_genes_ppi = pd.DataFrame(pd.concat([AMR_genes_ppi_A, AMR_genes_ppi_B], axis = 0))
AMR_genes_ppi.reset_index(drop=True, inplace=True)

Calculating the distance from a conserved gene to an AMR gene:

In [13]:
ppi_info = pd.DataFrame(columns = ['Conserved Gene', 'Shortest Path to an AMR gene (length)',])

ppi_info['Conserved Gene'] = conserved_ppi.reset_index(drop = True)['Protein Family ID']

In [14]:
ppi_graph = nx.from_pandas_edgelist(ppi_plfams, 'Interactor_A_ID', 'Interactor_B_ID')

idx = 0
for i in conserved_ppi['Protein Family ID']:
    lengths = []
    for j in AMR_genes_ppi['BRC_ID']:
        if nx.has_path(ppi_graph, i, j):
            lengths.append(nx.shortest_path_length(ppi_graph, i, j))
    if lengths:        
        ppi_info['Shortest Path to an AMR gene (length)'][idx] = min(lengths)
        
    idx += 1

In [16]:
print(ppi_info.groupby(['Shortest Path to an AMR gene (length)']).size().reset_index(name='Count'))

   Shortest Path to an AMR gene (length)  Count
0                                      1     55
1                                      2    152
2                                      3     87
3                                      4     39
4                                      5      6
5                                      6      1
6                                      7      1


To evaluate the model from Nguyen et. al. with genes according to the path until an AMR gene, we need to separate these families. Lets construct them:

In [17]:
for i in ppi_info['Shortest Path to an AMR gene (length)'].unique():
    globals()[f'plf_length_{i}'] = ppi_info[ppi_info['Shortest Path to an AMR gene (length)'] == i]['Conserved Gene']

The data used is available through the PATRIC FTP (ftp://ftp.patricbrc.org/datasets/) downloading the Nguyen_et_al_2020.tar.gz archive.

Separating genes in files accordin to the length to an AMR gene:

In [18]:
# mydir = 'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022/Staphylococcus'
# datadir = 'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus'

# for i in ppi_info2['Shortest Path to an AMR gene (length)'].unique():
#     newdir = f'length.{i}'
#     path = os.path.join(mydir, newdir)
#     os.mkdir(path)
    
#     sample = f'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus/fasta.500.0'
#     for strain in listdir(sample):
#         with open(os.path.join(path, strain), 'a') as mystrain:
#             with open(os.path.join(sample, strain), 'r') as sequences:
#                 first_loop = True
#                 for line in sequences:
#                     if line[0] == '>':
#                         if first_loop:
#                             plfam = line[1:len(line)-1]
#                             seq = ''
#                             first_loop = False
#                             continue
#                         if plfam in list((globals()[f'plf_length_{i}'])):
#                             mystrain.write('>' + plfam + '\n')
#                             mystrain.write(seq)
#                         plfam = line[1:len(line)-1]
#                         seq = ''
#                     else:
#                         seq += line
#                 if plfam in list((globals()[f'plf_length_{i}'])):
#                             mystrain.write('>' + plfam + '\n')
#                             mystrain.write(seq)

In [19]:
for i in ppi_info['Shortest Path to an AMR gene (length)'].unique():
    globals()[f'len_{i}'] = []
    for strain in listdir(f'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022/Staphylococcus/length.{i}'):
        with open(os.path.join(f'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022/Staphylococcus/length.{i}', strain), 'r') as sequence:
            genes = 0
            for line in sequence:
                if line[0] == '>':
                    genes += 1
            globals()[f'len_{i}'].append(genes)

For each strain considering different path lengths to an AMR gene, we have the following number of genes:

In [21]:
for i in ppi_info['Shortest Path to an AMR gene (length)'].unique():
    print(i, np.mean(globals()[f'len_{i}']))

1 36.018927444794954
3 69.05520504731861
nan 0.0
2 105.08832807570978
4 28.074132492113566
5 2.001577287066246
7 0.0
6 1.0


At this point, it is possible to use this new configuration of data to run the model. 
We can take 25 genes to each strain for lengths equals to 1, 2, 3 and 4, because the results from the paper also are derived from groups of 25 genes, hence, we can compare these results.

In [22]:
# mydir = 'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022/Staphylococcus'
# os.mkdir(os.path.join(mydir, '25genes'))
# dir_25genes = 'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022/Staphylococcus/25genes'

# for j in [0, 1, 2, 3, 4]:
    
#     rand_idx = [sorted(random.sample(range(1,36), 25)), 
#                 sorted(random.sample(range(1,105), 25)), 
#                 sorted(random.sample(range(1,69), 25)), 
#                 sorted(random.sample(range(1,28), 25))]
    
#     for i in [1, 2, 3, 4]:   

#         path = f'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022/Staphylococcus/length.{i}'
#         mydir = os.path.join(dir_25genes, f'length.{i}.{j}')
#         os.mkdir(mydir)

#         for strain in listdir(path):
#             with open(os.path.join(mydir, strain), 'a') as mystrain:
#                 with open(os.path.join(path, strain), 'r') as sequences:
#                     c = 0
#                     first_loop = True
#                     for line in sequences:
#                         if line[0] == '>':
#                             if first_loop:
#                                 plfam = line
#                                 seq = ''
#                                 c += 1
#                                 first_loop = False
#                                 continue
#                             if c in rand_idx[i-1]:
#                                 mystrain.write(plfam)
#                                 mystrain.write(seq)
#                             plfam = line
#                             seq = ''
#                             c += 1
#                         else:
#                             seq += line
#                     if c in rand_idx[i-1]:
#                             mystrain.write(plfam)
#                             mystrain.write(seq)

Sanity check: is there any gene anottated as AMR today inside this group?

In [24]:
list(set(plf.loc[AMR_genes['BRC_ID']]['PLFam']) & set(plf_500['Protein Family ID']))

['PLF_1279_00000145']

In [25]:
for i in (1, 2, 3, 4, 5, 6, 'nan'): 
    print('PLF_1279_00000145' in list(globals()[f'plf_length_{i}']))

False
False
False
False
False
False
False


The conserved gene annotated as AMR gene is not being used in any set.

In [26]:
# #virulence

# virulence_genes = sa_specialty_genes.loc[sa_specialty_genes.Property == 'Virulence Factor'].reset_index()
# virulence_set = list(set(plf.loc[virulence_genes['BRC_ID']]['PLFam']) & set(plf_500['Protein Family ID']))
# len(virulence_set)

In [27]:
# Completing the set with more 10 random genes:

In [28]:
# virulence_test = virulence_set

# while len(virulence_test) < 25:
#     idx =  random.randint(0,499)
#     if plf_500['Protein Family ID'].iloc[idx] not in virulence_test:
#         virulence_test.append(plf_500['Protein Family ID'].iloc[idx])

In [29]:
# mydir = 'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022/Staphylococcus'
# datadir = 'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus'

# newdir = 'virulence'
# path = os.path.join(mydir, newdir)
# os.mkdir(path)
    
# sample = 'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus/fasta.500.0'
# for strain in listdir(sample):
#     with open(os.path.join(path, strain), 'a') as mystrain:
#         with open(os.path.join(sample, strain), 'r') as sequences:
#             first_loop = True
#             for line in sequences:
#                 if line[0] == '>':
#                     if first_loop:
#                         plfam = line[1:len(line)-1]
#                         seq = ''
#                         first_loop = False
#                         continue
#                     if plfam in virulence_test:
#                         mystrain.write('>' + plfam + '\n')
#                         mystrain.write(seq)
#                     plfam = line[1:len(line)-1]
#                     seq = ''
#                 else:
#                     seq += line
#             if plfam in virulence_test:
#                         mystrain.write('>' + plfam + '\n')
#                         mystrain.write(seq)