# Mapping from PATRIC to RefSeq Locus Tag

To work with another PPI, we need gene identifiers that are compatible with other databases. Since the table containing the feature information includes the RefSeq Locus Tag, we will use that column for the mapping.

In [2]:
import os
from os import listdir

import numpy as np
import pandas as pd

In [40]:
# Load the PATRIC genome feature table; later cells read its 'BRC ID',
# 'RefSeq Locus Tag' and PLfam columns.
# NOTE(review): the info() output shows 'Genome ID' parsed as float64 --
# confirm no identifier precision is lost before relying on that column.
features = pd.read_csv(filepath_or_buffer='genome_features.csv')
# Summarise column names, non-null counts and dtypes.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Genome                                   10999 non-null  object 
 1   Genome ID                                10999 non-null  float64
 2   Accession                                10999 non-null  object 
 3   BRC ID                                   10999 non-null  object 
 4   RefSeq Locus Tag                         10703 non-null  object 
 5   Alt Locus Tag                            5488 non-null   object 
 6   Feature ID                               10999 non-null  object 
 7   Annotation                               10999 non-null  object 
 8   Feature Type                             10999 non-null  object 
 9   Start                                    10999 non-null  int64  
 10  End                                      10999

In [41]:
# Lookup table indexed by BRC ID with a single 'RefSeq' column holding the
# RefSeq Locus Tag of each feature.
refseq = (
    features[['BRC ID', 'RefSeq Locus Tag']]
    .rename(columns={'BRC ID': 'BRC_ID', 'RefSeq Locus Tag': 'RefSeq'})
    .set_index('BRC_ID')
)

In [43]:
# Load the specialty-gene table and keep only the 'Property' annotation,
# indexed by BRC ID.
specialty_genes = (
    pd.read_csv('specialty_genes.csv')[['BRC ID', 'Property']]
    .rename(columns={'BRC ID': 'BRC_ID'})
    .set_index('BRC_ID')
)

In [44]:
# One-column frame ('BRC_ID') listing the features whose Property is
# 'Antibiotic Resistance', renumbered from 0.
AMR_genes = (
    specialty_genes
    .loc[lambda genes: genes['Property'] == 'Antibiotic Resistance']
    .reset_index()[['BRC_ID']]
)

In [45]:
# Display the BRC IDs of the antibiotic-resistance genes selected above.
AMR_genes

Unnamed: 0,BRC_ID
0,fig|1413510.3.peg.2169
1,fig|93061.5.peg.1154
2,fig|93061.5.peg.2089
3,fig|93061.5.peg.842
4,fig|158879.11.peg.1813
...,...
264,fig|158879.11.peg.2331
265,fig|1241616.6.peg.1396
266,fig|158879.11.peg.647
267,fig|158879.11.peg.2107


In [46]:
# Collect the protein family IDs that appear as FASTA headers ('>' lines) in
# every file of the fasta.500.0 directory.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# DATA_DIR so the notebook can run on other machines.
datadir = 'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/Staphylococcus/fasta.500.0'

plf_500 = []
for strain in os.listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line.startswith('>'):
                # Strip only the line terminator. The previous slice
                # line[1:len(line)-1] also removed the last character of the ID
                # when the final header line had no trailing newline.
                plf_500.append(line[1:].rstrip('\r\n'))

# Unique IDs (sorted by np.unique) as a single-column frame.
plf_500 = pd.DataFrame({'Protein Family ID': np.unique(plf_500)})

In [47]:
# Map each protein family (PLF) found in plf_500 to a RefSeq Locus Tag:
#   1. keep the features whose PLfam is listed in plf_500,
#   2. drop rows with a missing locus tag or family,
#   3. keep one row per PLF (the last occurrence in `features`).
plf_map_ncbi = (
    features.loc[
        features['PATRIC genus-specific families (PLfams)'].isin(plf_500['Protein Family ID']),
        ['RefSeq Locus Tag', 'PATRIC genus-specific families (PLfams)'],
    ]
    .reset_index(drop=True)
    .rename(columns={
        'RefSeq Locus Tag': 'RefSeq',
        'PATRIC genus-specific families (PLfams)': 'PLF',
    })
    .dropna()
    .drop_duplicates(subset='PLF', keep='last')
)

# The original assigned the return value of drop_duplicates(..., inplace=True),
# which is always None. plf_SAOUHSC now holds the deduplicated mapping.
plf_SAOUHSC = plf_map_ncbi.copy()

In [48]:
# RefSeq Locus Tags of the features whose BRC ID is listed in AMR_genes,
# renumbered from 0 and exposed under the column name 'AMR_RefSeq'.
AMR_map_ncbi = (
    features.loc[features['BRC ID'].isin(AMR_genes['BRC_ID']), ['RefSeq Locus Tag']]
    .reset_index(drop=True)
    .rename(columns={'RefSeq Locus Tag': 'AMR_RefSeq'})
)
AMR_map_ncbi

Unnamed: 0,AMR_RefSeq
0,AA076_00455
1,AA076_05535
2,AA076_05935
3,AA076_06405
4,AA076_06830
...,...
200,SAOUHSC_00694
201,SAOUHSC_00703
202,SAOUHSC_00006
203,SAOUHSC_00921


In [42]:
# ppi = pd.read_csv('ppi_patric.csv')
# ppi = ppi[['Interactor A ID', 'Interactor B ID']].astype("string")
# ppi.columns = ['Interactor_A_ID', 'Interactor_B_ID']
# ppi

Unnamed: 0,Interactor_A_ID,Interactor_B_ID
0,fig|93061.5.peg.452,fig|93061.5.peg.713
1,fig|93061.5.peg.1920,fig|93061.5.peg.1921
2,fig|93061.5.peg.111,fig|93061.5.peg.119
3,fig|93061.5.peg.112,fig|93061.5.peg.121
4,fig|93061.5.peg.1069,fig|93061.5.peg.1071
...,...,...
4995,fig|93061.5.peg.544,fig|93061.5.peg.856
4996,fig|93061.5.peg.1411,fig|93061.5.peg.1412
4997,fig|93061.5.peg.1112,fig|93061.5.peg.2465
4998,fig|93061.5.peg.2477,fig|93061.5.peg.2638


In [57]:
# Let's check whether every genomic feature in the PPI is present in the table 'features':

In [49]:
# alist = list(ppi['Interactor_A_ID'].isin(features['BRC ID']))
# [(i, alist.count(i)) for i in set(alist)]

[(True, 5000)]

In [50]:
# alist = list(ppi['Interactor_B_ID'].isin(features['BRC ID']))
# [(i, alist.count(i)) for i in set(alist)]

[(False, 1), (True, 4999)]

In [51]:
# ppi['Interactor_B_ID'][ppi['Interactor_B_ID'].isin(features['BRC ID']) == False]

2085    fig|93061.5.peg.894
Name: Interactor_B_ID, dtype: string

In [52]:
# ppi.drop(2085, axis = 0, inplace = True)
# ppi.reset_index(drop=True, inplace=True)

In [56]:
# Creating a new PPI by substituting each feature with its RefSeq Locus Tag:

In [53]:
# ppi_refseq = ppi

# for i in range(len(ppi['Interactor_A_ID'])):
#     ppi_refseq.at[i, 'Interactor_A_ID'] = refseq.loc[ppi['Interactor_A_ID'][i]].RefSeq
        
# for i in range(len(ppi['Interactor_B_ID'])):
#     ppi_refseq.at[i, 'Interactor_B_ID'] = refseq.loc[ppi['Interactor_B_ID'][i]].RefSeq  

In [54]:
# ppi_refseq.to_excel('ppi_refseq.xlsx')

In [55]:
# ppi_refseq  # is in terms of the SAOUHSC subspecies

Unnamed: 0,Interactor_A_ID,Interactor_B_ID
0,SAOUHSC_00505,SAOUHSC_00790
1,SAOUHSC_02116,SAOUHSC_02117
2,SAOUHSC_00119,SAOUHSC_00127
3,SAOUHSC_00120,SAOUHSC_00129
4,SAOUHSC_01166,SAOUHSC_01169
...,...,...
4994,SAOUHSC_00607,SAOUHSC_00935
4995,SAOUHSC_01550,SAOUHSC_01551
4996,SAOUHSC_01213,SAOUHSC_02722
4997,SAOUHSC_02739,SAOUHSC_02919
