# Repeating the experiments using a PPI network provided by the STRING database

In [162]:
import pandas as pd
import numpy as np
import networkx as nx
import random

In [132]:
# Load the genome feature table. Later cells read its 'BRC ID',
# 'RefSeq Locus Tag' and 'PATRIC genus-specific families (PLfams)' columns.
features = pd.read_csv('genome_features.csv')

In [133]:
# Lookup table indexed by BRC ID with a single 'RefSeq' column (the RefSeq locus tag).
refseq = (
    features[['BRC ID', 'RefSeq Locus Tag']]
    .rename(columns = {'BRC ID': 'BRC_ID', 'RefSeq Locus Tag': 'RefSeq'})
    .set_index('BRC_ID')
)

In [134]:
# Load the STRING interaction list (space-separated) and give its three columns
# the same names used for the PATRIC interaction table.
ppi_string = pd.read_csv('ppi_string.txt', sep = ' ')
ppi_string.columns = ['Interactor_A_ID', 'Interactor_B_ID', 'weight']

# Strip the leading '93061.' prefix from the protein identifiers so they can be
# compared with RefSeq locus tags (presumably the STRING taxon prefix — confirm).
# The dot is escaped and the pattern anchored so that only a literal leading
# '93061.' is removed, and only the identifier columns are touched.
id_columns = ['Interactor_A_ID', 'Interactor_B_ID']
ppi_string[id_columns] = ppi_string[id_columns].replace(r'^93061\.', '', regex = True)

In [135]:
# Specialty-gene annotations, indexed by BRC ID with a single 'Property' column.
specialty_genes = (
    pd.read_csv('specialty_genes.csv')[['BRC ID', 'Property']]
    .rename(columns = {'BRC ID': 'BRC_ID'})
    .set_index('BRC_ID')
)

# One-column frame ('BRC_ID') holding the genes annotated as antibiotic resistance.
is_amr = specialty_genes['Property'] == 'Antibiotic Resistance'
AMR_genes = specialty_genes[is_amr].reset_index()[['BRC_ID']]

In [136]:
import os
from os import listdir

# Directory with one FASTA-style file per strain; every '>' header names a protein family.
datadir = 'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus/fasta.500.0'

family_ids = set()
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        # Keep the header text without the leading '>' and without its last
        # character (the line break).
        family_ids.update(line[1:-1] for line in sequences if line[0] == '>')

# Sorted, de-duplicated family identifiers seen in any strain file.
plf_500 = pd.DataFrame(np.unique(list(family_ids)), columns = ['Protein Family ID'])

In [137]:
# Map each protein family found in the strain files (plf_500) to a RefSeq locus tag.
plf_column = 'PATRIC genus-specific families (PLfams)'
in_plf_500 = features[plf_column].isin(plf_500['Protein Family ID'])

plf_map_refseq = (
    features.loc[in_plf_500, ['RefSeq Locus Tag', plf_column]]
    .reset_index(drop = True)
    .rename(columns = {'RefSeq Locus Tag': 'RefSeq', plf_column: 'PLF'})
    .dropna()
)

# Keep one locus tag per family (the last occurrence). Everything is chained so
# each step returns a new frame; mutating the result of drop_duplicates in place
# is what raised pandas' "set on a copy of a slice" warning.
# NOTE(review): the first three rows are discarded by position with no stated
# reason — presumably they are not SAOUHSC locus tags; confirm against the data.
plf_SAOUHSC = (
    plf_map_refseq
    .drop_duplicates(subset = 'PLF', keep = 'last')
    .reset_index(drop = True)
    .drop([0, 1, 2])
    .reset_index(drop = True)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [138]:
# RefSeq locus tags of the features whose BRC ID is listed in AMR_genes.
is_amr_feature = features['BRC ID'].isin(AMR_genes['BRC_ID'])
AMR_refseq = (
    features.loc[is_amr_feature, ['RefSeq Locus Tag']]
    .reset_index(drop = True)
    .rename(columns = {'RefSeq Locus Tag': 'AMR_RefSeq'})
)

In [139]:
# PATRIC interaction table: keep only the two interactor ID columns, as strings.
ppi_patric = pd.read_csv('ppi_patric.csv')
ppi_patric = ppi_patric[['Interactor A ID', 'Interactor B ID']].astype("string")
ppi_patric.columns = ['Interactor_A_ID', 'Interactor_B_ID']

# NOTE(review): row 2085 is removed by label with no stated reason — presumably
# one of its IDs has no entry in `refseq`; confirm against the data.
ppi_patric = ppi_patric.drop(2085, axis = 0).reset_index(drop = True)

# Translate BRC IDs to RefSeq locus tags on a copy, so `ppi_patric` keeps the
# original PATRIC identifiers instead of being overwritten through an alias.
# `.loc` with a list of labels raises KeyError if any ID is missing from
# `refseq`, matching the per-row lookup this replaces.
ppi_refseq = ppi_patric.copy()
for column in ['Interactor_A_ID', 'Interactor_B_ID']:
    brc_ids = ppi_patric[column].tolist()
    ppi_refseq[column] = pd.Series(
        refseq.loc[brc_ids, 'RefSeq'].to_numpy(),
        index = ppi_refseq.index,
        dtype = "string",
    )

In [140]:
# Stack the PATRIC-derived and STRING interactions and drop fully identical rows.
# ppi_refseq has no 'weight' column, so its rows carry NaN there.
ppi = pd.concat([ppi_refseq, ppi_string], axis = 0).drop_duplicates()

In [141]:
# Genes of plf_SAOUHSC that occur as interactor A / interactor B in the PPI.
# This needs 'RefSeq' to still be a regular column of plf_SAOUHSC.
is_interactor_a = plf_SAOUHSC['RefSeq'].isin(ppi['Interactor_A_ID'])
is_interactor_b = plf_SAOUHSC['RefSeq'].isin(ppi['Interactor_B_ID'])
conserved_ppi_A = plf_SAOUHSC.loc[is_interactor_a, 'RefSeq']
conserved_ppi_B = plf_SAOUHSC.loc[is_interactor_b, 'RefSeq']

# Union of both sides, each gene listed once (original row labels are kept).
conserved_ppi = (
    pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0)
    .drop_duplicates()
    .to_frame()
)

In [142]:
# Display the plf_SAOUHSC genes that appear as an interactor in `ppi`.
conserved_ppi

Unnamed: 0,RefSeq
0,SAOUHSC_01030
1,SAOUHSC_01038
2,SAOUHSC_01044
3,SAOUHSC_01045
4,SAOUHSC_01047
...,...
338,SAOUHSC_02906
346,SAOUHSC_02935
411,SAOUHSC_00565
438,SAOUHSC_00717


In [143]:
# AMR genes that occur as interactor A / interactor B in the PPI.
amr_is_interactor_a = AMR_refseq['AMR_RefSeq'].isin(ppi['Interactor_A_ID'])
amr_is_interactor_b = AMR_refseq['AMR_RefSeq'].isin(ppi['Interactor_B_ID'])
AMR_ppi_A = AMR_refseq.loc[amr_is_interactor_a, 'AMR_RefSeq']
AMR_ppi_B = AMR_refseq.loc[amr_is_interactor_b, 'AMR_RefSeq']

# Both sides stacked and renumbered; a gene found on both sides is listed twice.
AMR_ppi = (
    pd.concat([AMR_ppi_A, AMR_ppi_B], axis = 0)
    .to_frame()
    .reset_index(drop = True)
)

In [144]:
# Display the AMR genes that appear as an interactor in `ppi`.
AMR_ppi

Unnamed: 0,AMR_RefSeq
0,SAOUHSC_00099
1,SAOUHSC_01071
2,SAOUHSC_01159
3,SAOUHSC_01260
4,SAOUHSC_01351
...,...
88,SAOUHSC_00694
89,SAOUHSC_00703
90,SAOUHSC_00006
91,SAOUHSC_00921


In [145]:
# Result table: one row per conserved gene plus a column for the length of its
# shortest path to an AMR gene, which a later cell fills in.
ppi_info = pd.DataFrame(columns = ['Conserved Gene', 'Shortest Path to an AMR gene (length)',])

# Assigning the gene column to the empty frame gives it one row per gene; the
# path-length column holds no values yet.
# NOTE(review): that column appears to stay object-dtype (later output prints
# integer lengths next to nan, and those values are used in directory names) —
# confirm before changing how it is created or filled.
ppi_info['Conserved Gene'] = conserved_ppi.reset_index(drop = True)['RefSeq']

In [146]:
# Build the interaction graph from the edge list; the 'weight' column is not
# passed along, so edges carry no attributes.
ppi_graph = nx.from_pandas_edgelist(ppi, 'Interactor_A_ID', 'Interactor_B_ID')

In [147]:
# Interaction graph (edges carry no attributes, so every edge counts as one hop).
ppi_graph = nx.from_pandas_edgelist(ppi, 'Interactor_A_ID', 'Interactor_B_ID')

path_length_column = 'Shortest Path to an AMR gene (length)'

# One multi-source search from all AMR genes gives, for every reachable node,
# the hop count to its nearest AMR gene. That equals the minimum of the pairwise
# shortest-path lengths, without one search per (gene, AMR gene) pair.
amr_sources = set(AMR_ppi['AMR_RefSeq'])
if amr_sources:
    distance_to_amr = nx.multi_source_dijkstra_path_length(
        ppi_graph, amr_sources, weight = lambda u, v, data: 1
    )
else:
    # No AMR gene in the network: nothing is reachable.
    distance_to_amr = {}

# Genes with no path to any AMR gene stay NaN. The column is written in a single
# assignment (no chained indexing) and kept as object dtype so the lengths
# remain Python ints: later cells embed these values in variable and directory
# names such as 'length.2'.
nearest_amr = [distance_to_amr.get(gene, np.nan) for gene in conserved_ppi['RefSeq']]
ppi_info[path_length_column] = pd.Series(nearest_amr, index = ppi_info.index, dtype = object)

In [148]:
# Number of conserved genes at each shortest-path length (genes with no path are not counted).
length_counts = (
    ppi_info
    .groupby('Shortest Path to an AMR gene (length)')
    .size()
    .reset_index(name='Count')
)
print(length_counts)

   Shortest Path to an AMR gene (length)  Count
0                                      0      1
1                                      1    176
2                                      2    216
3                                      3     39
4                                      4      1


In [None]:
# Index the family table by RefSeq locus tag so families can be looked up by gene.
# Guarded so that re-running the cell is a no-op instead of a KeyError once
# 'RefSeq' has already become the index.
if 'RefSeq' in plf_SAOUHSC.columns:
    plf_SAOUHSC = plf_SAOUHSC.set_index('RefSeq')

In [155]:
# For every distinct path length, store the protein families of the genes at that
# length in a module-level variable named plf_length_<length>.
# plf_SAOUHSC must already be indexed by RefSeq locus tag here.
path_length_column = 'Shortest Path to an AMR gene (length)'
for i in ppi_info[path_length_column].unique():
    genes_at_length = ppi_info.loc[ppi_info[path_length_column] == i, 'Conserved Gene']
    globals()[f'plf_length_{i}'] = plf_SAOUHSC.loc[genes_at_length.tolist(), 'PLF']

In [158]:
# For every path length, write a copy of each strain file that contains only the
# records whose protein family belongs to that length group (plf_length_<length>).
mydir = 'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022_2.0/Staphylococcus'
datadir = 'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus'
sample = f'{datadir}/fasta.500.0'

# unique() also yields NaN (genes with no path to an AMR gene); that iteration
# produces a 'length.nan' directory of empty files. It is kept because the
# following cells iterate over the same values and expect the directory to exist.
for i in ppi_info['Shortest Path to an AMR gene (length)'].unique():
    path = os.path.join(mydir, f'length.{i}')
    # exist_ok + write mode make the cell safe to re-run: no crash on an existing
    # directory and no records appended twice to an existing file.
    os.makedirs(path, exist_ok = True)

    # Built once per length; membership is tested for every header line.
    wanted_plfams = set(globals()[f'plf_length_{i}'])

    for strain in listdir(sample):
        with open(os.path.join(sample, strain), 'r') as sequences:
            with open(os.path.join(path, strain), 'w') as mystrain:
                # Reset for every file so nothing carries over from the previous strain.
                keep_record = False
                for line in sequences:
                    if line[0] == '>':
                        # Header line: decide whether this record is copied.
                        plfam = line[1:].rstrip('\n')
                        keep_record = plfam in wanted_plfams
                        if keep_record:
                            mystrain.write('>' + plfam + '\n')
                    elif keep_record:
                        # Sequence line of a selected record.
                        mystrain.write(line)

In [159]:
# For every path length, count the records ('>' header lines) in each strain file
# and keep the per-strain counts in a module-level list named len_<length>.
for i in ppi_info['Shortest Path to an AMR gene (length)'].unique():
    length_dir = f'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022_2.0/Staphylococcus/length.{i}'
    genes_per_strain = []
    globals()[f'len_{i}'] = genes_per_strain
    for strain in listdir(length_dir):
        with open(os.path.join(length_dir, strain), 'r') as sequence:
            genes_per_strain.append(sum(1 for line in sequence if line[0] == '>'))

For each shortest-path length to an AMR gene, the mean number of genes per strain is:

In [160]:
# Print each path length followed by the mean number of genes per strain file.
for i in ppi_info['Shortest Path to an AMR gene (length)'].unique():
    genes_per_strain = globals()[f'len_{i}']
    print(i, np.mean(genes_per_strain))

2 216.58359621451103
1 177.18927444794951
3 39.03785488958991
0 1.0063091482649842
nan 0.0
4 1.0


At this point, we can use this new configuration of the data to run the model. We take 25 genes per strain for path lengths 1, 2 and 3 (lengths 0 and 4 have only about one gene per strain, so 25 genes cannot be drawn from them) and repeat the random selection five times. The results in the paper are also derived from groups of 25 genes, so the two sets of results can be compared.

In [164]:
# Draw five random subsets of 25 genes for each of the path lengths 1, 2 and 3,
# and write them to 25genes/length.<length>.<replicate>/<strain>.
mydir = 'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022_2.0/Staphylococcus'
dir_25genes = 'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022_2.0/Staphylococcus/25genes'
# exist_ok + write mode below make the cell safe to re-run.
os.makedirs(dir_25genes, exist_ok = True)

N_GENES = 25
SAMPLING_SEED = 0
# Dedicated, seeded generator so the selected gene positions are reproducible.
rng = random.Random(SAMPLING_SEED)

lengths_to_sample = [1, 2, 3]

# Genes are selected by their position (1-based) inside each strain file, and
# the same positions are used for every strain. The positions are therefore
# drawn from 1..(smallest record count over all strain files of that length), so
# that each strain receives exactly N_GENES records. The previous hard-coded
# ranges (35, 104 and 68) did not match the gene counts reported earlier in
# this notebook (about 177, 216 and 39).
n_available = {}
for i in lengths_to_sample:
    path = f'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022_2.0/Staphylococcus/length.{i}'
    record_counts = []
    for strain in listdir(path):
        with open(os.path.join(path, strain), 'r') as sequences:
            record_counts.append(sum(1 for line in sequences if line[0] == '>'))
    n_available[i] = min(record_counts, default = 0)
    if n_available[i] < N_GENES:
        raise ValueError(
            f'length.{i}: only {n_available[i]} genes are available in every strain, '
            f'cannot sample {N_GENES}'
        )

for j in [0, 1, 2, 3, 4]:

    # One sorted list of selected positions per length, in the order of lengths_to_sample.
    rand_idx = [sorted(rng.sample(range(1, n_available[i] + 1), N_GENES))
                for i in lengths_to_sample]

    for i in lengths_to_sample:

        path = f'E:/User/bruna.fistarol/Documents/GitHub/Fistarol_2022_2.0/Staphylococcus/length.{i}'
        outdir = os.path.join(dir_25genes, f'length.{i}.{j}')
        os.makedirs(outdir, exist_ok = True)
        selected = set(rand_idx[i-1])

        for strain in listdir(path):
            with open(os.path.join(path, strain), 'r') as sequences:
                with open(os.path.join(outdir, strain), 'w') as mystrain:
                    c = 0                # position of the current record in the file
                    keep_record = False
                    for line in sequences:
                        if line[0] == '>':
                            c += 1
                            keep_record = c in selected
                        if keep_record:
                            # Header and sequence lines of a selected record are copied verbatim.
                            mystrain.write(line)