In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import os
from os import listdir
import random
from datetime import date

# Salmonella enterica subsp. enterica serovar Typhimurium str. LT2

In order to have a specific Protein ID used to map protein families in a specific PPI, we chose a specific bacteria. But this should be resonable once we are carrying about conserved sequences.

In [2]:
bacteria = 'Salmonella'
root = 'E:/User/bruna.fistarol/Documents/MasterProject'

In [3]:
os.getcwd()

'E:\\User\\bruna.fistarol\\Documents\\MasterProject\\Salmonella'

# Data From PATRIC

## Genomic Features

The table below contains a list of genomic features, including coding DNA.

Each feature is identified by BRC ID from PATRIC and associated to a protein family referred as PATRIC genus-specific families (PLfams). Most of the genes has the associated Protein ID.

In [4]:
features = pd.read_csv('genome_features.csv')

In [5]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970 entries, 0 to 4969
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Genome                                   4970 non-null   object 
 1   Genome ID                                4970 non-null   float64
 2   Accession                                4970 non-null   object 
 3   BRC ID                                   4970 non-null   object 
 4   RefSeq Locus Tag                         4460 non-null   object 
 5   Alt Locus Tag                            4970 non-null   object 
 6   Feature ID                               4970 non-null   object 
 7   Annotation                               4970 non-null   object 
 8   Feature Type                             4970 non-null   object 
 9   Start                                    4970 non-null   int64  
 10  End                                      4970 no

Through this table, we extract useful data to map protein families referred by Nguyen et. al.:

In [6]:
prot_info = features[['BRC ID', 'PATRIC genus-specific families (PLfams)', 'Protein ID']].astype("string")
prot_info.columns = ['BRC_ID', 'PLFam', 'Prot_ID']
prot_info.set_index('BRC_ID', inplace = True)
prot_info

Unnamed: 0_level_0,PLFam,Prot_ID
BRC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
fig|99287.12.peg.977,PLF_590_00000012,NP_459904.1
fig|99287.12.peg.978,PLF_590_00014201,NP_459905.1
fig|99287.12.peg.979,PLF_590_00019426,
fig|99287.12.peg.980,PLF_590_00015851,
fig|99287.12.peg.981,PLF_590_00000025,NP_459907.1
...,...,...
fig|99287.12.peg.4975,PLF_590_00006047,NP_490593.1
fig|99287.12.peg.4976,PLF_590_00005031,NP_490594.1
fig|99287.12.peg.4844,PLF_590_00005081,NP_490493.1
fig|99287.12.peg.4864,PLF_590_00005446,NP_490507.1


## Specialty Genes

A table containing AMR genes for this specie according to PATRIC.

In [7]:
AMR = pd.read_csv('specialty_genes.csv')['BRC ID']

AMR_ID = pd.DataFrame(prot_info.loc[AMR, 'Prot_ID'])

## Data from Nguyen et. al.

In [8]:
plf_500 = []

datadir = os.getcwd() + '\\Nguyen_et_al_2020\\fasta.500.0'
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line[0] == '>':
                plf_500.append(line[1:len(line)-1])
                
plf_500 = pd.DataFrame(np.unique(plf_500))
plf_500.columns = ['PLFam']

## Mapping PLF from Nguyen to Protein ID

In [9]:
plf2id = prot_info[['PLFam', 'Prot_ID']][prot_info['PLFam'].isin(plf_500['PLFam'])]
plf2id.columns = ['PLFam', 'Prot_ID']
plf2id.dropna(inplace = True)
plf2id = plf2id.reset_index(drop = True)

## Protein Interaction Network

In [10]:
transferred_links = pd.read_csv('salmonella.interolog', sep = '\t', names = ['interactor1', 'interactor2', 'source1', 'source2', 
                                                         'interolog_quality', 'evalue', 'source_org_id', 'det_type', 
                                                         'int_type', 'pubmed'])
ppi = pd.DataFrame(transferred_links[['interactor1', 'interactor2']])

In [11]:
transferred_links

Unnamed: 0,interactor1,interactor2,source1,source2,interolog_quality,evalue,source_org_id,det_type,int_type,pubmed
0,NP_459018.1,NP_459428.1,P08622,Q46948,94.810460,1.490000e-135,83333,MI:0004,MI:0915,22157000
1,NP_459428.1,NP_460031.1,P04825,Q46948,94.632629,1.490000e-135,83333,MI:0004,MI:0915,22157000
2,NP_459428.1,NP_460284.1,P21179,Q46948,91.798379,1.490000e-135,83333,MI:0004,MI:0915,22157000
3,NP_459428.1,NP_461748.1,P00957,Q46948,95.165623,1.490000e-135,83333,MI:0004,MI:0915,22157000
4,NP_459051.1,NP_459428.1,P00956,Q46948,90.910213,1.490000e-135,83333,MI:0004,MI:0915,22157000
...,...,...,...,...,...,...,...,...,...,...
26393,NP_459733.1,NP_461210.1,P02932,P0A855,73.952374,1.630000e-138,83333,MI:0404,MI:0915,9393690
26394,NP_459733.1,NP_463096.1,P02943,P0A855,87.587772,0.000000e+00,83333,MI:0404,MI:0915,9393690
26395,NP_460851.1,NP_460852.1,P0A809,P0A812,95.749103,2.180000e-142,83333,MI:0020,MI:0915,7885479
26396,NP_460912.1,NP_461793.1,P06179,P41785,100.000000,1.390000e-68,99287,MI:0096,MI:0915,26449475


In [12]:
ppi

Unnamed: 0,interactor1,interactor2
0,NP_459018.1,NP_459428.1
1,NP_459428.1,NP_460031.1
2,NP_459428.1,NP_460284.1
3,NP_459428.1,NP_461748.1
4,NP_459051.1,NP_459428.1
...,...,...
26393,NP_459733.1,NP_461210.1
26394,NP_459733.1,NP_463096.1
26395,NP_460851.1,NP_460852.1
26396,NP_460912.1,NP_461793.1


In [13]:
ppi.to_csv(path_or_buf = os.getcwd() + '\\ppi.csv')

In [18]:
conserved_ppi_1 = plf2id[plf2id['Prot_ID'].isin(ppi['interactor1'])]['Prot_ID']
conserved_ppi_2 = plf2id[plf2id['Prot_ID'].isin(ppi['interactor2'])]['Prot_ID']

conserved_ppi = pd.DataFrame(pd.concat([conserved_ppi_1, conserved_ppi_2], axis = 0).drop_duplicates())

In [19]:
conserved_ppi

Unnamed: 0,Prot_ID
0,NP_459911.1
1,NP_459913.1
2,NP_459919.1
3,NP_459937.1
4,NP_459944.1
...,...
432,NP_463414.1
433,NP_463423.1
434,NP_463426.1
435,NP_463435.1


In [20]:
AMR_ppi_1 = AMR_ID[AMR_ID['Prot_ID'].isin(ppi['interactor1'])]['Prot_ID']
AMR_ppi_2 = AMR_ID[AMR_ID['Prot_ID'].isin(ppi['interactor2'])]['Prot_ID']

AMR_ppi = pd.DataFrame(pd.concat([AMR_ppi_1, AMR_ppi_2], axis = 0))
AMR_ppi.reset_index(drop=True, inplace=True)

In [21]:
AMR_ppi.drop_duplicates(inplace = True)

In [22]:
ppi_info = pd.DataFrame(columns = ['Conserved Gene', 'Shortest Path to an AMR gene (length)',])

ppi_info['Conserved Gene'] = conserved_ppi.reset_index(drop = True)['Prot_ID']

In [23]:
ppi_graph = nx.from_pandas_edgelist(ppi, 'interactor1', 'interactor2')

In [24]:
ppi_graph.number_of_edges()

16219

In [25]:
ppi_graph.number_of_nodes()

2639

In [26]:
idx = 0
for i in conserved_ppi['Prot_ID']:
    lengths = []
    for j in AMR_ppi['Prot_ID']:
        if nx.has_path(ppi_graph, i, j):
            lengths.append(nx.shortest_path_length(ppi_graph, i, j))
    if lengths:        
        ppi_info['Shortest Path to an AMR gene (length)'][idx] = min(lengths)
        
    idx += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
list(set(AMR_ID['Prot_ID']) & set(plf2id['Prot_ID']))

['NP_460659.1',
 'NP_462772.2',
 'NP_459919.1',
 'NP_461214.1',
 'NP_463158.1',
 'NP_462345.1',
 'NP_461740.1',
 'NP_459051.1',
 'NP_463157.1',
 'NP_462300.1',
 'NP_462101.2']

In [28]:
plf2id.set_index('Prot_ID', drop = True, inplace = True)

## pStep Kernel, p = 2

In [14]:
pStepKernel_2 = np.load('pStepKernel_2.npy')

In [15]:
with open ('protein_names.txt') as f:
    protein_names = f.readlines()

In [16]:
protein_names = protein_names[1:]
protein_names = [name.replace('\n', '') for name in protein_names]

In [32]:
AMR_bin = np.zeros(shape=(len(protein_names), 1))
for i in range(len(protein_names)):
    if protein_names[i] in AMR_ppi['Prot_ID'].tolist():
        AMR_bin[i] = 1

In [34]:
kernel = pd.concat([pd.DataFrame(protein_names), pd.DataFrame.from_records(np.dot(pStepKernel_2, AMR_bin))], axis = 1)
kernel.columns = ['protein', 'kernel']
kernel.sort_values(by = 'kernel', ascending = False, inplace = True)
kernel = kernel[kernel['protein'].isin(conserved_ppi['Prot_ID'])]
kernel.reset_index(drop = True, inplace = True)
kernel

Unnamed: 0,protein,kernel
0,NP_461740.1,2.004706
1,NP_462300.1,1.952091
2,NP_459919.1,1.655862
3,NP_461214.1,1.468215
4,NP_463157.1,1.389946
...,...,...
356,NP_460869.1,0.000000
357,NP_460779.1,0.000000
358,NP_460675.1,0.000000
359,NP_461398.1,0.000000


Creating sets of 25 genes according to kernel descend order:

In [38]:
for i in range(14):
    gene_set = kernel.iloc[range(25*i, 25*(i+1))]['protein']
    globals()[f'kernel_top{i}'] = plf2id.loc[list(gene_set)]['PLFam']

In [40]:
bacdir = f'{root}/Fistarol_2022/{bacteria}'
os.mkdir(bacdir)

for i in range(14):
    newdir = f'top.{i}'
    rankdir = os.path.join(bacdir, newdir)
    os.mkdir(rankdir)
    
    sample = f'{root}/{bacteria}/Nguyen_et_al_2020/fasta.500.0'
    for strain in listdir(sample):
        with open(os.path.join(rankdir, strain), 'a') as mystrain:
            with open(os.path.join(sample, strain), 'r') as sequences:
                first_loop = True
                for line in sequences:
                    if line[0] == '>':
                        if first_loop:
                            plfam = line[1:len(line)-1]
                            seq = ''
                            first_loop = False
                            continue
                        if plfam in list((globals()[f'kernel_top{i}'])):
                            mystrain.write('>' + plfam + '\n')
                            mystrain.write(seq)
                        plfam = line[1:len(line)-1]
                        seq = ''
                    else:
                        seq += line
                if plfam in list((globals()[f'kernel_top{i}'])):
                            mystrain.write('>' + plfam + '\n')
                            mystrain.write(seq)