In [1]:
import datetime
import os
import random
from datetime import date
from os import listdir

import networkx as nx
import numpy as np
import pandas as pd

# Salmonella enterica subsp. enterica serovar Typhimurium str. LT2

To obtain concrete Protein IDs that map the protein families onto a specific PPI network, we chose a single reference bacterium. This should be reasonable, since we are concerned with conserved sequences.

In [2]:
# Run configuration: organism folder name and project root used to build the output paths below.
# NOTE(review): `root` is an absolute, machine-specific path — consider making it configurable.
bacteria = 'Salmonella'
root = 'E:/User/bruna.fistarol/Documents/MasterProject'

In [3]:
# Show the working directory; the relative file names read below are resolved against it.
os.getcwd()

'E:\\User\\bruna.fistarol\\Documents\\MasterProject\\Salmonella'

# Data From PATRIC

## Genomic Features

The table below contains a list of genomic features, including coding DNA.

Each feature is identified by its BRC ID from PATRIC and is associated with a protein family, referred to as a PATRIC genus-specific family (PLfam). Most of the genes have an associated Protein ID.

In [4]:
# Load the PATRIC genomic-features table (path is relative to the working directory).
features = pd.read_csv('genome_features.csv')

In [5]:
# Summarise the columns, non-null counts and dtypes of the features table.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970 entries, 0 to 4969
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Genome                                   4970 non-null   object 
 1   Genome ID                                4970 non-null   float64
 2   Accession                                4970 non-null   object 
 3   BRC ID                                   4970 non-null   object 
 4   RefSeq Locus Tag                         4460 non-null   object 
 5   Alt Locus Tag                            4970 non-null   object 
 6   Feature ID                               4970 non-null   object 
 7   Annotation                               4970 non-null   object 
 8   Feature Type                             4970 non-null   object 
 9   Start                                    4970 non-null   int64  
 10  End                                      4970 no

From this table, we extract the data needed to map the protein families referred to by Nguyen et al.:

In [6]:
# Keep only the identifier columns, as strings, under short names and keyed by BRC ID.
short_names = {
    'BRC ID': 'BRC_ID',
    'PATRIC genus-specific families (PLfams)': 'PLFam',
    'Protein ID': 'Prot_ID',
}
prot_info = (
    features[list(short_names)]
    .astype("string")
    .rename(columns=short_names)
    .set_index('BRC_ID')
)
prot_info

Unnamed: 0_level_0,PLFam,Prot_ID
BRC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
fig|99287.12.peg.977,PLF_590_00000012,NP_459904.1
fig|99287.12.peg.978,PLF_590_00014201,NP_459905.1
fig|99287.12.peg.979,PLF_590_00019426,
fig|99287.12.peg.980,PLF_590_00015851,
fig|99287.12.peg.981,PLF_590_00000025,NP_459907.1
...,...,...
fig|99287.12.peg.4975,PLF_590_00006047,NP_490593.1
fig|99287.12.peg.4976,PLF_590_00005031,NP_490594.1
fig|99287.12.peg.4844,PLF_590_00005081,NP_490493.1
fig|99287.12.peg.4864,PLF_590_00005446,NP_490507.1


## Specialty Genes

A table containing the AMR genes for this species according to PATRIC.

In [7]:
# BRC IDs of the genes PATRIC lists as specialty (AMR) genes.
specialty_genes = pd.read_csv('specialty_genes.csv')
AMR = specialty_genes['BRC ID']

# Look up the protein ID of each AMR gene; the result is a one-column frame.
AMR_ID = prot_info.loc[AMR, 'Prot_ID'].to_frame()

## Data from Nguyen et al.

In [8]:
# Collect every distinct PLfam identifier that appears as a FASTA header
# ('>' line) in the per-strain files of the Nguyen et al. data set.
# The path is assembled with os.path.join so it does not depend on the
# Windows path separator.
datadir = os.path.join(os.getcwd(), 'Nguyen_et_al_2020', 'fasta.500.0')

# A set de-duplicates while reading instead of storing every header of every strain.
plf_seen = set()
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line.startswith('>'):
                # Strip only the newline, so a header on an unterminated last
                # line does not lose its final character.
                plf_seen.add(line[1:].rstrip('\n'))

# One sorted column of unique family identifiers.
plf_500 = pd.DataFrame({'PLFam': sorted(plf_seen)})

## Mapping PLF from Nguyen to Protein ID

In [9]:
# Families from the Nguyen et al. set that have a protein ID in this genome:
# filter by family, drop rows with a missing value, and re-number the rows from 0.
in_nguyen_set = prot_info['PLFam'].isin(plf_500['PLFam'])
plf2id = (
    prot_info.loc[in_nguyen_set, ['PLFam', 'Prot_ID']]
    .dropna()
    .reset_index(drop=True)
)

## Protein Interaction Network

In [10]:
# The interolog file is tab-separated and has no header row, so the column names are supplied here.
interolog_columns = [
    'interactor1', 'interactor2', 'source1', 'source2',
    'interolog_quality', 'evalue', 'source_org_id', 'det_type',
    'int_type', 'pubmed',
]
transferred_links = pd.read_csv('salmonella.interolog', sep='\t', names=interolog_columns)

# Edge list: just the two interacting protein IDs.
ppi = transferred_links[['interactor1', 'interactor2']].copy()

In [11]:
# Display the interolog table (pandas truncates long frames when rendering).
transferred_links

Unnamed: 0,interactor1,interactor2,source1,source2,interolog_quality,evalue,source_org_id,det_type,int_type,pubmed
0,NP_459018.1,NP_459428.1,P08622,Q46948,94.810460,1.490000e-135,83333,MI:0004,MI:0915,22157000
1,NP_459428.1,NP_460031.1,P04825,Q46948,94.632629,1.490000e-135,83333,MI:0004,MI:0915,22157000
2,NP_459428.1,NP_460284.1,P21179,Q46948,91.798379,1.490000e-135,83333,MI:0004,MI:0915,22157000
3,NP_459428.1,NP_461748.1,P00957,Q46948,95.165623,1.490000e-135,83333,MI:0004,MI:0915,22157000
4,NP_459051.1,NP_459428.1,P00956,Q46948,90.910213,1.490000e-135,83333,MI:0004,MI:0915,22157000
...,...,...,...,...,...,...,...,...,...,...
26393,NP_459733.1,NP_461210.1,P02932,P0A855,73.952374,1.630000e-138,83333,MI:0404,MI:0915,9393690
26394,NP_459733.1,NP_463096.1,P02943,P0A855,87.587772,0.000000e+00,83333,MI:0404,MI:0915,9393690
26395,NP_460851.1,NP_460852.1,P0A809,P0A812,95.749103,2.180000e-142,83333,MI:0020,MI:0915,7885479
26396,NP_460912.1,NP_461793.1,P06179,P41785,100.000000,1.390000e-68,99287,MI:0096,MI:0915,26449475


In [21]:
# Display the two-column edge list of interacting protein IDs.
ppi

Unnamed: 0,interactor1,interactor2
0,NP_459018.1,NP_459428.1
1,NP_459428.1,NP_460031.1
2,NP_459428.1,NP_460284.1
3,NP_459428.1,NP_461748.1
4,NP_459051.1,NP_459428.1
...,...,...
26376,NP_459733.1,NP_460490.1
26377,NP_459733.1,NP_460531.1
26378,NP_459733.1,NP_460946.1
26379,NP_459733.1,NP_461210.1


In [13]:
# Conserved proteins that occur on either side of an interaction.
conserved_ids = plf2id['Prot_ID']
conserved_ppi_1 = conserved_ids[conserved_ids.isin(ppi['interactor1'])]
conserved_ppi_2 = conserved_ids[conserved_ids.isin(ppi['interactor2'])]

# Stack both selections and keep the first occurrence of each protein ID.
conserved_ppi = (
    pd.concat([conserved_ppi_1, conserved_ppi_2], axis=0)
    .drop_duplicates()
    .to_frame()
)

In [14]:
# Display the conserved proteins found in the interaction edge list.
conserved_ppi

Unnamed: 0,Prot_ID
0,NP_459911.1
1,NP_459913.1
2,NP_459919.1
3,NP_459937.1
4,NP_459944.1
...,...
432,NP_463414.1
433,NP_463423.1
434,NP_463426.1
435,NP_463435.1


In [15]:
# AMR proteins that occur on either side of an interaction.
amr_ids = AMR_ID['Prot_ID']
AMR_ppi_1 = amr_ids[amr_ids.isin(ppi['interactor1'])]
AMR_ppi_2 = amr_ids[amr_ids.isin(ppi['interactor2'])]

# Stack both selections into one frame with rows re-numbered from 0.
AMR_ppi = (
    pd.concat([AMR_ppi_1, AMR_ppi_2], axis=0)
    .to_frame()
    .reset_index(drop=True)
)

In [16]:
# Keep each AMR protein once (a protein can appear as interactor1 and as interactor2).
AMR_ppi = AMR_ppi.drop_duplicates()

In [17]:
# Result table: one row per conserved gene plus a path-length column.
# The length column is created empty here and is filled in by a later cell.
ppi_info = pd.DataFrame(columns = ['Conserved Gene', 'Shortest Path to an AMR gene (length)',])

# Re-number the rows from 0 so the row labels are consecutive integers.
ppi_info['Conserved Gene'] = conserved_ppi.reset_index(drop = True)['Prot_ID']

In [18]:
# Build the interaction graph: one edge per row of the edge list, between interactor1 and interactor2.
ppi_graph = nx.from_pandas_edgelist(ppi, 'interactor1', 'interactor2')

In [19]:
# Number of edges in the interaction graph.
ppi_graph.number_of_edges()

16219

In [20]:
# Number of nodes (distinct proteins) in the interaction graph.
ppi_graph.number_of_nodes()

2639

In [22]:
# For every conserved gene, record the length of the shortest path to the
# nearest AMR gene in the interaction graph. Genes that cannot reach any AMR
# gene keep the missing value the column was created with.
length_col = 'Shortest Path to an AMR gene (length)'

# One multi-source search started from all AMR genes yields, for each
# reachable node, its distance to the closest AMR gene. This replaces two
# graph searches per (conserved gene, AMR gene) pair. The graph built by
# nx.from_pandas_edgelist is undirected by default and carries no edge
# weights, so the distances are hop counts.
amr_nodes = set(AMR_ppi['Prot_ID'])
nearest_amr = (
    nx.multi_source_dijkstra_path_length(ppi_graph, amr_nodes) if amr_nodes else {}
)

# ppi_info rows are numbered 0..n-1 in the same order as conserved_ppi.
for idx, gene in enumerate(conserved_ppi['Prot_ID']):
    if gene in nearest_amr:
        # .at writes into ppi_info itself. The previous chained form
        # ppi_info[col][idx] = ... assigns through an intermediate object,
        # which raises SettingWithCopyWarning and may not update the frame.
        ppi_info.at[idx, length_col] = nearest_amr[gene]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
# Protein IDs that are both AMR genes and members of the mapped conserved families.
# Built from sets, so the order of the resulting list is arbitrary.
# Requires plf2id to still have a 'Prot_ID' column (a later cell moves it into the index).
list(set(AMR_ID['Prot_ID']) & set(plf2id['Prot_ID']))

['NP_463158.1',
 'NP_459051.1',
 'NP_459919.1',
 'NP_461740.1',
 'NP_462101.2',
 'NP_462772.2',
 'NP_460659.1',
 'NP_463157.1',
 'NP_462300.1',
 'NP_461214.1',
 'NP_462345.1']

In [24]:
# Conserved genes that interact directly with an AMR gene (path length 1).
ppi_info.loc[ppi_info['Shortest Path to an AMR gene (length)'] == 1, 'Conserved Gene']

3      NP_459937.1
9      NP_460033.1
10     NP_460035.1
13     NP_460124.1
16     NP_460166.1
          ...     
333    NP_462908.1
337    NP_463034.1
339    NP_463044.1
350    NP_463220.1
351    NP_463242.1
Name: Conserved Gene, Length: 83, dtype: string

In [25]:
# Key the family table by protein ID so families can be looked up with .loc.
# The guard makes the cell safe to re-run: once 'Prot_ID' is the index it is
# no longer a column, and calling set_index again would raise KeyError.
if 'Prot_ID' in plf2id.columns:
    plf2id = plf2id.set_index('Prot_ID', drop = True)

In [26]:
# For each distinct path length, publish the families of the genes at that
# distance as a module-level variable named plf_length_<length>.
length_col = 'Shortest Path to an AMR gene (length)'
for path_len in ppi_info[length_col].unique():
    genes_at_len = ppi_info.loc[ppi_info[length_col] == path_len, 'Conserved Gene']
    globals()[f'plf_length_{path_len}'] = plf2id.loc[genes_at_len.tolist(), 'PLFam']

In [27]:
# Today's date as YYYY.MM.DD; it names this run's output directory.
# The value is read through the `datetime` module rather than the imported
# `date` class, because this assignment rebinds the name `date` to a string:
# calling date.today() a second time would fail when the cell is re-run.
date = datetime.date.today().strftime("%Y.%m.%d")

In [29]:
# For every path length, write one FASTA file per strain that contains only
# the sequences whose family is at that distance from an AMR gene.
# Output layout: <root>/<bacteria>/<date>/length.<length>/<strain file>
length_col = 'Shortest Path to an AMR gene (length)'

bacdir = f'{root}/{bacteria}/{date}'
# exist_ok together with write mode below lets the cell be re-run: the files
# are regenerated instead of the cell failing or appending duplicates.
os.makedirs(bacdir, exist_ok=True)

sample = f'{root}/{bacteria}/Nguyen_et_al_2020/fasta.500.0'

# dropna(): genes with no path to an AMR gene have a missing length and do
# not form a group of their own.
for i in ppi_info[length_col].dropna().unique():
    lendir = os.path.join(bacdir, f'length.{i}')
    os.makedirs(lendir, exist_ok=True)

    # Families selected for this length, as a set built once per length
    # (the variable is created by the cell that defines plf_length_<length>).
    wanted = set(globals()[f'plf_length_{i}'])

    for strain in listdir(sample):
        with open(os.path.join(sample, strain), 'r') as sequences:
            with open(os.path.join(lendir, strain), 'w') as mystrain:
                # `keep` is reset for every file, so nothing carries over from
                # the previous strain when a file is empty.
                keep = False
                for line in sequences:
                    if line.startswith('>'):
                        # Header line: decide whether this record is copied.
                        plfam = line[1:].rstrip('\n')
                        keep = plfam in wanted
                        if keep:
                            mystrain.write('>' + plfam + '\n')
                    elif keep:
                        # Sequence line of a selected record.
                        mystrain.write(line)

In [70]:
# Leave a short note in the run directory describing how this data set was produced.
readme_path = f'{bacdir}/readme.txt'
readme_text = "first try using mateo's ppi's\napproach to select genes: naive\ninterolog quality not considered"
with open(readme_path, 'w') as readme:
    readme.write(readme_text)

In [28]:
# How many conserved genes sit at each distance from an AMR gene.
length_col = 'Shortest Path to an AMR gene (length)'
length_counts = ppi_info.groupby([length_col]).size().reset_index(name='Count')
print(length_counts)

   Shortest Path to an AMR gene (length)  Count
0                                      0      9
1                                      1     83
2                                      2    219
3                                      3     38
4                                      4      5
5                                      5      3


At this point, the data in this new layout can be used to run the model. We can take 25 genes for each strain for path lengths equal to 1, 2 and 3; because the results in the paper are also derived from groups of 25 genes, the two sets of results can be compared. (Note: the cells below build groups of 9 genes.)

In [30]:
# Create the output folders for the 9-gene groups: 9genes/length.<length>.<group>
dir_genes = os.path.join(bacdir, '9genes')
os.mkdir(dir_genes)

# Gene count assumed for path lengths 0, 1, 2 and 3 (position in the list = length).
num_genes = [9, 81, 216, 36]

for i, gene_count in enumerate(num_genes):
    # One folder per full group of 9, plus one extra.
    group_total = gene_count // 9 + 1
    for j in range(1, group_total + 1):
        repdir = os.path.join(dir_genes, f'length.{i}.{j}')
        os.mkdir(repdir)

In [32]:
# Split each per-length strain file into consecutive groups of 9 records,
# writing group j of length i to 9genes/length.<i>.<j>/<strain file>.
genes_per_group = 9

for i in [0, 1, 2, 3]:
    lendir = os.path.join(bacdir, f'length.{i}')
    for strain in listdir(lendir):
        j = 1  # number of the group currently being written
        c = 0  # records already placed in the current group
        repdir = os.path.join(dir_genes, f'length.{i}.{j}')
        with open(os.path.join(lendir, strain), 'r') as sequences:
            # Write mode: re-running the cell regenerates the files instead of
            # appending a second copy of every record.
            mystrain = open(os.path.join(repdir, strain), 'w')
            try:
                for line in sequences:
                    if line.startswith('>'):
                        # A new record starts; move on to the next group first
                        # if the current one is already full.
                        if c == genes_per_group:
                            mystrain.close()
                            j += 1
                            repdir = os.path.join(dir_genes, f'length.{i}.{j}')
                            mystrain = open(os.path.join(repdir, strain), 'w')
                            c = 0
                        c += 1
                    # Lines are copied as they are read, so the last record of
                    # the file is written too (buffering a record until the
                    # next header appears would drop it).
                    mystrain.write(line)
            finally:
                # Close the output even if reading or writing fails part-way.
                mystrain.close()

In [40]:
# Protein IDs of the AMR genes that also belong to the mapped conserved families.
AMR_prot_ID = [
    'NP_463158.1',
    'NP_459051.1',
    'NP_459919.1',
    'NP_461740.1',
    'NP_462101.2',
    'NP_462772.2',
    'NP_460659.1',
    'NP_463157.1',
    'NP_462300.1',
    'NP_461214.1',
    'NP_462345.1',
]

In [45]:
# Family of each listed AMR protein (plf2id is keyed by protein ID at this point).
AMR_plf = plf2id.loc[AMR_prot_ID, 'PLFam']

In [46]:
# Display the PLfam looked up for each AMR protein ID.
AMR_plf

Prot_ID
NP_463158.1    PLF_590_00024086
NP_459051.1    PLF_590_00000236
NP_459919.1    PLF_590_00000215
NP_461740.1    PLF_590_00002421
NP_462101.2    PLF_590_00001611
NP_462772.2    PLF_590_00001982
NP_460659.1    PLF_590_00002841
NP_463157.1    PLF_590_00002480
NP_462300.1    PLF_590_00000543
NP_461214.1    PLF_590_00000441
NP_462345.1    PLF_590_00003204
Name: PLFam, dtype: string

In [49]:
# Create one output folder per AMR family: AMR/<PLfam>
dir_genes = os.path.join(bacdir, 'AMR')
os.mkdir(dir_genes)

# Not used in this cell; kept so the notebook's global state is unchanged.
num_genes = [9, 81, 216, 36]

# Iterate over the values directly. AMR_plf is indexed by protein ID, so
# AMR_plf[i] with an integer i relies on positional fallback, which pandas
# has deprecated for label-indexed Series.
for plf in AMR_plf:
    repdir = os.path.join(dir_genes, f'{plf}')
    os.mkdir(repdir)

In [52]:
# For every AMR family, write one FASTA file per strain containing only the
# records of that family: AMR/<PLfam>/<strain file>
sample = f'{root}/{bacteria}/Nguyen_et_al_2020/fasta.500.0'

# Iterate over the values directly: AMR_plf is indexed by protein ID, so
# integer subscripts would rely on deprecated positional fallback.
for target_plf in AMR_plf:
    # os.path.join instead of hard-coded '\\', so the folder matches the one
    # created for this family on any operating system.
    setdir = os.path.join(bacdir, 'AMR', target_plf)
    for strain in listdir(sample):
        with open(os.path.join(sample, strain), 'r') as sequences:
            # Write mode: re-running the cell regenerates the file instead of
            # appending a duplicate copy.
            with open(os.path.join(setdir, strain), 'w') as mystrain:
                # Reset per file so an empty strain file cannot inherit the
                # last record of the previous strain.
                keep = False
                for line in sequences:
                    if line.startswith('>'):
                        # Header line: copy the record only if it is the target family.
                        plfam = line[1:].rstrip('\n')
                        keep = plfam == target_plf
                        if keep:
                            mystrain.write('>' + plfam + '\n')
                    elif keep:
                        mystrain.write(line)