In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import os
from os import listdir
import random

# Staphylococcus aureus subsp. aureus NCTC 8325

This subspecies is used so that the same RefSeq identifiers are available in both databases.

In [68]:
root = os.getcwd()

# Data From PATRIC

## Genomic Features

The table below contains a list of genomic features, including coding DNA.

Each feature is uniquely identified by its BRC ID and is associated with a protein family, referred to as a PATRIC genus-specific family (PLfam). Most of the genes have an associated RefSeq locus tag.

In [70]:
features = pd.read_csv('genome_features.csv')

In [71]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Genome                                   2755 non-null   object 
 1   Genome ID                                2755 non-null   float64
 2   Accession                                2755 non-null   object 
 3   BRC ID                                   2755 non-null   object 
 4   RefSeq Locus Tag                         2681 non-null   object 
 5   Alt Locus Tag                            2755 non-null   object 
 6   Feature ID                               2755 non-null   object 
 7   Annotation                               2755 non-null   object 
 8   Feature Type                             2755 non-null   object 
 9   Start                                    2755 non-null   int64  
 10  End                                      2755 no

From this table, we extract the data needed to map the protein families referred to by Nguyen et al.:

In [72]:
# Lookup table from BRC ID (index) to PLfam identifier, with both stored as strings.
plf = (
    features[['BRC ID', 'PATRIC genus-specific families (PLfams)']]
    .astype("string")
    .rename(columns={
        'BRC ID': 'BRC_ID',
        'PATRIC genus-specific families (PLfams)': 'PLFam',
    })
    .set_index('BRC_ID')
)
plf

Unnamed: 0_level_0,PLFam
BRC_ID,Unnamed: 1_level_1
fig|93061.5.peg.943,PLF_1279_00001903
fig|93061.5.peg.944,PLF_1279_00000989
fig|93061.5.peg.945,PLF_1279_00001254
fig|93061.5.peg.946,PLF_1279_00000620
fig|93061.5.peg.947,PLF_1279_00085250
...,...
fig|93061.5.peg.83,PLF_1279_00002111
fig|93061.5.peg.939,PLF_1279_00000867
fig|93061.5.peg.940,PLF_1279_00000994
fig|93061.5.peg.941,PLF_1279_00000907


## Specialty Genes

The specialty genes table relates genomic features to a property of interest. Here, the table is restricted to genes with the antibiotic resistance property.

In [73]:
# One-column frame holding the RefSeq locus tag of every specialty gene.
# NOTE(review): no filtering happens in this cell; presumably the CSV export
# was already restricted to antibiotic-resistance genes -- TODO confirm.
AMR_refseq = (
    pd.read_csv('specialty_genes.csv')[['RefSeq Locus Tag']]
    .rename(columns={'RefSeq Locus Tag': 'AMR_RefSeq'})
)

In [74]:
AMR_refseq['AMR_RefSeq'].nunique()

49

## Data from Nguyen et al.

In [75]:
# Collect every protein family ID that appears as a FASTA header ('>ID')
# in any strain file of the Nguyen et al. data set.
# The path is built with os.path.join so the notebook also runs outside Windows.
datadir = os.path.join(root, 'Nguyen_et_al_2020', 'fasta.500.0')

family_ids = set()
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line.startswith('>'):
                # Strip only the line terminator; slicing off the last character
                # would truncate a header on a final line without a newline.
                family_ids.add(line[1:].rstrip('\n'))

# Sorted unique IDs, one per row (well-formed even when no header was found).
plf_500 = pd.DataFrame({'Protein Family ID': sorted(family_ids)})

## RefSeq Mapping

In [76]:
# Lookup table from BRC ID (index) to RefSeq locus tag.
refseq = (
    features[['BRC ID', 'RefSeq Locus Tag']]
    .rename(columns={'BRC ID': 'BRC_ID', 'RefSeq Locus Tag': 'RefSeq'})
    .set_index('BRC_ID')
)

In [77]:
# Keep only the features whose PLfam occurs in the Nguyen et al. families,
# then pair each RefSeq locus tag with its PLfam. Rows with a missing value
# in either column are discarded and the index is renumbered from 0.
in_nguyen_families = features['PATRIC genus-specific families (PLfams)'].isin(
    plf_500['Protein Family ID']
)
plf_map_refseq = (
    features.loc[
        in_nguyen_families,
        ['RefSeq Locus Tag', 'PATRIC genus-specific families (PLfams)'],
    ]
    .rename(columns={
        'RefSeq Locus Tag': 'RefSeq',
        'PATRIC genus-specific families (PLfams)': 'PLF',
    })
    .dropna()
    .reset_index(drop=True)
)

In [78]:
plf_map_refseq

Unnamed: 0,RefSeq,PLF
0,SAOUHSC_01030,PLF_1279_00001903
1,SAOUHSC_01038,PLF_1279_00000817
2,SAOUHSC_01044,PLF_1279_00002027
3,SAOUHSC_01045,PLF_1279_00062515
4,SAOUHSC_01047,PLF_1279_00000667
...,...,...
494,SAOUHSC_01011,PLF_1279_00000821
495,SAOUHSC_01016,PLF_1279_00000658
496,SAOUHSC_01019,PLF_1279_00001408
497,SAOUHSC_01021,PLF_1279_00000378


## Protein Interaction Network

In [79]:
# PATRIC protein-protein interactions: keep only the two interactor ID
# columns, stored as strings, under underscore-separated names.
ppi_patric = (
    pd.read_csv('ppi_patric.csv')[['Interactor A ID', 'Interactor B ID']]
    .astype("string")
    .rename(columns={
        'Interactor A ID': 'Interactor_A_ID',
        'Interactor B ID': 'Interactor_B_ID',
    })
)

In [80]:
# Translate the PATRIC interactor IDs (BRC IDs) into RefSeq locus tags.
#
# An interaction is kept only when BOTH interactors are present in the
# `refseq` lookup table. The work is done on a copy, so `ppi_patric` is left
# untouched and this cell can be re-run safely (the previous version aliased
# `ppi_patric` and rewrote it in place, so a second run dropped every row).
brc_to_refseq = dict(zip(refseq.index, refseq['RefSeq']))
interactor_columns = ['Interactor_A_ID', 'Interactor_B_ID']

both_known = (
    ppi_patric['Interactor_A_ID'].isin(refseq.index)
    & ppi_patric['Interactor_B_ID'].isin(refseq.index)
)
ppi_refseq = ppi_patric.loc[both_known, interactor_columns].copy()

for column in interactor_columns:
    # NOTE(review): a BRC ID whose RefSeq locus tag is missing in `refseq`
    # maps to <NA> here, exactly as before; consider dropping such rows.
    ppi_refseq[column] = ppi_refseq[column].map(brc_to_refseq).astype("string")

ppi_refseq = ppi_refseq.reset_index(drop=True)

In [81]:
# STRING protein-protein interactions (space-separated file with a header row).
ppi_string = pd.read_csv('ppi_string.txt', sep = ' ')
ppi_string.columns = ['Interactor_A_ID', 'Interactor_B_ID', 'weight']

# Remove the leading '93061.' prefix from both interactor IDs. The dot is
# escaped and the pattern anchored so that only a literal leading '93061.'
# is removed, never '93061' followed by an arbitrary character elsewhere.
string_id_columns = ['Interactor_A_ID', 'Interactor_B_ID']
ppi_string[string_id_columns] = ppi_string[string_id_columns].replace(
    r'^93061\.', '', regex = True
)

In [82]:
# Stack the PATRIC-derived and STRING interactions into a single edge list
# holding only the two interactor columns, renumbered from 0.
edge_columns = ['Interactor_A_ID', 'Interactor_B_ID']
ppi = pd.concat(
    [ppi_refseq[edge_columns], ppi_string[edge_columns]],
    ignore_index = True,
)

In [83]:
# Save the combined edge list next to the notebook. os.path.join keeps the
# path valid on every platform (a literal '\\' is only a separator on Windows).
ppi.to_csv(path_or_buf = os.path.join(root, 'ppi.csv'))

In [84]:
# RefSeq locus tags from the PLfam map that occur in the interaction network,
# either as interactor A or as interactor B.
mapped_refseq = plf_map_refseq['RefSeq']
conserved_ppi_A = mapped_refseq[mapped_refseq.isin(ppi['Interactor_A_ID'])]
conserved_ppi_B = mapped_refseq[mapped_refseq.isin(ppi['Interactor_B_ID'])]

# A-side matches come first, followed by B-side matches not already seen.
conserved_ppi = (
    pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0)
    .drop_duplicates()
    .to_frame()
)

In [85]:
conserved_ppi

Unnamed: 0,RefSeq
0,SAOUHSC_01030
1,SAOUHSC_01038
3,SAOUHSC_01045
4,SAOUHSC_01047
7,SAOUHSC_00097
...,...
495,SAOUHSC_01016
496,SAOUHSC_01019
497,SAOUHSC_01021
498,SAOUHSC_00093


In [86]:
# AMR locus tags that occur in the interaction network, on either side of an
# edge. Duplicates are kept at this stage.
amr_tags = AMR_refseq['AMR_RefSeq']
AMR_ppi_A = amr_tags[amr_tags.isin(ppi['Interactor_A_ID'])]
AMR_ppi_B = amr_tags[amr_tags.isin(ppi['Interactor_B_ID'])]

AMR_ppi = pd.concat([AMR_ppi_A, AMR_ppi_B], axis = 0, ignore_index = True).to_frame()

In [87]:
AMR_ppi['AMR_RefSeq'].nunique()

43

In [88]:
# Keep a single row per AMR locus tag (first occurrence wins).
AMR_ppi = AMR_ppi.drop_duplicates()

In [89]:
# Build the interaction graph: one edge per row of the edge list.
ppi_graph = nx.from_pandas_edgelist(
    ppi,
    source = 'Interactor_A_ID',
    target = 'Interactor_B_ID',
)

In [90]:
ppi_graph.number_of_edges()

32881

In [91]:
ppi_graph.number_of_nodes()

1893

In [92]:
# Index the PLfam map by RefSeq locus tag so gene sets can be looked up with
# .loc further down. The guard makes the cell safe to re-run: once 'RefSeq'
# is the index it is no longer a column and set_index would raise KeyError.
# NOTE: cells that read plf_map_refseq['RefSeq'] as a column must run before
# this one.
if 'RefSeq' in plf_map_refseq.columns:
    plf_map_refseq = plf_map_refseq.set_index('RefSeq')

## pStep Kernel

After running the R code to compute the kernel matrix with p = 1 or p = 2, run the following steps:

In [98]:
p = '1'

In [99]:
pStepKernel = np.load(f'pStepKernel_{p}.npy')

In [100]:
# Read the raw lines of the protein-name file written for this value of p.
with open(f'protein_names_{p}step.txt') as names_file:
    protein_names = list(names_file)

In [101]:
# Build the cleaned name list directly from the file rather than from the
# previous value of `protein_names`: slicing the variable in place dropped
# one more entry every time this cell was re-run.
# The first line is skipped (presumably a header row -- TODO confirm) and the
# newline is removed from every remaining line.
with open(f'protein_names_{p}step.txt') as f:
    protein_names = [name.replace('\n', '') for name in f.readlines()[1:]]

In [102]:
AMR_bin = np.zeros(shape=(len(protein_names), 1))

In [103]:
# Indicator column: 1 where the protein is an AMR gene, 0 otherwise.
# The AMR tags are collected once into a set (the list was rebuilt on every
# iteration before), and the whole column is rewritten so that no stale 1s
# survive when AMR_ppi is replaced and this cell is run again.
amr_tags = set(AMR_ppi['AMR_RefSeq'])
AMR_bin[:, 0] = [name in amr_tags for name in protein_names]

In [104]:
# Score of each protein: product of the kernel matrix with the AMR indicator
# column. Names and scores are placed side by side in a two-column frame.
kernel_scores = np.dot(pStepKernel, AMR_bin)
kernel = pd.concat(
    [
        pd.DataFrame(protein_names),
        pd.DataFrame.from_records(kernel_scores),
    ],
    axis = 1,
)

In [105]:
kernel.columns = ['protein', 'kernel']

In [106]:
# Rank proteins by decreasing kernel score, keep only those listed in
# `conserved_ppi`, and renumber the rows from 0.
kernel = (
    kernel.sort_values(by = 'kernel', ascending = False)
    .loc[lambda ranked: ranked['protein'].isin(conserved_ppi['RefSeq'])]
    .reset_index(drop = True)
)

In [107]:
kernel

Unnamed: 0,protein,kernel
0,SAOUHSC_02402,1.073826
1,SAOUHSC_02484,1.054325
2,SAOUHSC_01785,1.048231
3,SAOUHSC_00519,1.044491
4,SAOUHSC_00875,1.019842
...,...,...
382,SAOUHSC_01692,0.000000
383,SAOUHSC_00540,0.000000
384,SAOUHSC_00848,0.000000
385,SAOUHSC_00097,0.000000


Create sets of 25 genes in descending order of kernel score:

In [108]:
# Cut the ranked proteins into consecutive groups of 25 and publish the PLfams
# of group i as the global variable `kernel_top{i}`. Proteins left over after
# the last full group of 25 are not assigned to any set.
n_gene_sets = len(kernel) // 25
for i in range(n_gene_sets):
    first, last = 25 * i, 25 * (i + 1)
    ranked_proteins = kernel['protein'].iloc[first:last].tolist()
    globals()[f'kernel_top{i}'] = plf_map_refseq.loc[ranked_proteins, 'PLF']

In [109]:
kernel_top1

RefSeq
SAOUHSC_00901    PLF_1279_00001960
SAOUHSC_01100    PLF_1279_00000833
SAOUHSC_01153    PLF_1279_00000698
SAOUHSC_02349    PLF_1279_00000632
SAOUHSC_00755    PLF_1279_00000734
SAOUHSC_00575    PLF_1279_00000708
SAOUHSC_01629    PLF_1279_00000905
SAOUHSC_01670    PLF_1279_00001230
SAOUHSC_01278    PLF_1279_00000345
SAOUHSC_00533    PLF_1279_00001548
SAOUHSC_00963    PLF_1279_00000303
SAOUHSC_02457    PLF_1279_00001496
SAOUHSC_02505    PLF_1279_00000352
SAOUHSC_02482    PLF_1279_00001360
SAOUHSC_02495    PLF_1279_00000758
SAOUHSC_01492    PLF_1279_00000553
SAOUHSC_01249    PLF_1279_00000188
SAOUHSC_00518    PLF_1279_00000467
SAOUHSC_01839    PLF_1279_00000917
SAOUHSC_02460    PLF_1279_00001447
SAOUHSC_00836    PLF_1279_00000897
SAOUHSC_02478    PLF_1279_00000812
SAOUHSC_00712    PLF_1279_00001448
SAOUHSC_02512    PLF_1279_00000388
SAOUHSC_03055    PLF_1279_00001126
Name: PLF, dtype: object

In [110]:
def _copy_selected_families(source_path, target_path, wanted_families):
    """Copy the FASTA records of selected protein families to a new file.

    Parameters
    ----------
    source_path : str
        FASTA file to read. Each record starts with a '>ID' header line
        followed by its sequence lines.
    target_path : str
        File to write. It receives, in source order, every record whose
        header ID is in `wanted_families`.
    wanted_families : set of str
        Protein family IDs to keep.
    """
    with open(source_path, 'r') as sequences, open(target_path, 'w') as selected:
        # State is local to this file, so an empty or header-less file can
        # never inherit a record from the previously processed strain.
        plfam = None
        seq_lines = []
        for line in sequences:
            if line.startswith('>'):
                # A new header closes the record accumulated so far.
                if plfam in wanted_families:
                    selected.write('>' + plfam + '\n')
                    selected.writelines(seq_lines)
                plfam = line[1:].rstrip('\n')
                seq_lines = []
            else:
                seq_lines.append(line)
        # The last record is closed by end-of-file rather than by a header.
        if plfam in wanted_families:
            selected.write('>' + plfam + '\n')
            selected.writelines(seq_lines)


bacdir = os.path.join(root, 'GeneSets', f'{p}StepKernel')
#bacdir = os.path.join(root, 'GeneSets', f'{p}StepKernel_RandomAMR_1')
#bacdir = os.path.join(root, 'GeneSets', f'{p}StepKernel_RandomAMR_2')
#bacdir = os.path.join(root, 'GeneSets', f'{p}StepKernel_RandomAMR_3')

# Creates the missing 'GeneSets' parent if needed, but still raises if
# `bacdir` itself already exists, so an existing gene set is never
# silently overwritten.
os.makedirs(bacdir)

for i in range(len(kernel)//25):
    rankdir = os.path.join(bacdir, f'top.{i}')
    os.mkdir(rankdir)

    # Built once per gene set instead of once per header line.
    wanted_families = set(globals()[f'kernel_top{i}'])

    for strain in listdir(datadir):
        _copy_selected_families(
            os.path.join(datadir, strain),
            os.path.join(rankdir, strain),
            wanted_families,
        )

# Constructing gene sets with randomized AMR genes

In [93]:
# Executes GuneyDistance.py so that the names it defines (GuneyDistance is used
# below) become available in this notebook.
# NOTE(review): hardcoded absolute Windows path -- this cell fails on any other
# machine; point it at the location of the script in your own checkout.
%run E:\\User\\bruna.fistarol\\Documents\\MasterProject\\GuneyDistance.py

In [94]:
# Weighted edge list (unit weight on every edge) with the column names p1/p2/w.
# It is built as a separate frame: renaming the columns of `ppi` in place
# broke every earlier cell that reads 'Interactor_A_ID'/'Interactor_B_ID'
# when re-run.
ppi_weighted = (
    ppi[['Interactor_A_ID', 'Interactor_B_ID']]
    .rename(columns={'Interactor_A_ID': 'p1', 'Interactor_B_ID': 'p2'})
    .assign(w=1.0)
)

G = nx.from_pandas_edgelist(ppi_weighted, 'p1', 'p2', edge_attr='w')

s1 = set(AMR_ppi['AMR_RefSeq'])                        # AMR genes
s2 = set(ppi_weighted['p1']) | set(ppi_weighted['p2'])  # every protein in the network
s3 = s1 & s2                                            # AMR genes present in the network

# Original author's note: this creates 1000 sets of possible new fake AMR genes.
# NOTE(review): s3 is a set; if get_random_nodes depends on its iteration
# order, results may vary between sessions despite seed=0 -- TODO confirm.
samples = GuneyDistance(G).get_random_nodes(s3, seed=0)
df = pd.DataFrame(samples)

In [95]:
# Take one of the sampled sets (row 12) as the fake AMR gene list, in the same
# one-column layout as AMR_ppi. To create another group, change the row index.
AMR_ppi_random = df.iloc[12].to_frame()
AMR_ppi_random.columns = ['AMR_RefSeq']

We can check whether a set of fake AMR genes contains any real AMR genes:

In [96]:
set(AMR_ppi['AMR_RefSeq']) & set(AMR_ppi_random['AMR_RefSeq']) 

{'SAOUHSC_00529'}

Now replace `AMR_ppi` with `AMR_ppi_random` and re-run everything from the "pStep Kernel" section onward.

In [97]:
# From here on AMR_ppi refers to the randomized gene set (same object as
# AMR_ppi_random, not a copy); the real AMR list is no longer reachable under
# this name until the cells that build it are run again.
AMR_ppi = AMR_ppi_random

Remember to change the output directory name (`bacdir`) before creating the new gene-set files from the random AMR genes.