In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import os
from os import listdir
import random

# Klebsiella pneumoniae subsp. pneumoniae MGH 78578
In order to have the same RefSeq identifiers in both databases, this subspecies is used.

In [2]:
# Genus name; interpolated into the input and output directory paths further down.
bacteria = 'Klebsiella'
# Project root directory used to build the output paths.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
root = 'E:/User/bruna.fistarol/Documents/MasterProject'

# Data From PATRIC

## Genomic Features

The table below contains a list of genomic features, including coding DNA.

Each feature is uniquely identified by its BRC ID and is associated with a protein family, referred to as a PATRIC genus-specific family (PLfam). Most of the genes have an associated RefSeq locus tag.

In [3]:
# Load the genomic-features table from the current working directory.
features = pd.read_csv('genome_features.csv')

In [4]:
# Summarise the feature table: column names, non-null counts and dtypes.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5267 entries, 0 to 5266
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Genome                                   5267 non-null   object 
 1   Genome ID                                5267 non-null   float64
 2   Accession                                5267 non-null   object 
 3   BRC ID                                   5267 non-null   object 
 4   RefSeq Locus Tag                         4942 non-null   object 
 5   Alt Locus Tag                            5267 non-null   object 
 6   Feature ID                               5267 non-null   object 
 7   Annotation                               5267 non-null   object 
 8   Feature Type                             5267 non-null   object 
 9   Start                                    5267 non-null   int64  
 10  End                                      5267 no

From this table, we extract the data needed to map the protein families referred to by Nguyen et al.:

In [5]:
# BRC ID -> PLfam lookup: both columns are cast to pandas' "string" dtype,
# renamed to short identifiers, and the BRC ID becomes the index.
plf = (
    features[['BRC ID', 'PATRIC genus-specific families (PLfams)']]
    .astype("string")
    .rename(columns={
        'BRC ID': 'BRC_ID',
        'PATRIC genus-specific families (PLfams)': 'PLFam',
    })
    .set_index('BRC_ID')
)
plf

Unnamed: 0_level_0,PLFam
BRC_ID,Unnamed: 1_level_1
fig|272620.9.peg.923,PLF_570_00001646
fig|272620.9.peg.924,PLF_570_00000827
fig|272620.9.peg.925,PLF_570_00002014
fig|272620.9.peg.10,PLF_570_00001421
fig|272620.9.peg.926,PLF_570_00002244
...,...
fig|272620.9.peg.5264,PLF_570_00006439
fig|272620.9.peg.5266,PLF_570_00023239
fig|272620.9.peg.5267,PLF_570_00005744
fig|272620.9.peg.5268,PLF_570_00116871


## Specialty Genes

The table containing specialty genes relates several genomic features to a relevant property. Here, the table is filtered by the antibiotic resistance property.

In [6]:
# Keep only the RefSeq locus tags of the specialty-genes table, as a
# one-column frame named 'AMR_RefSeq'.
AMR_refseq = (
    pd.read_csv('specialty_genes.csv')[['RefSeq Locus Tag']]
    .rename(columns={'RefSeq Locus Tag': 'AMR_RefSeq'})
)

In [7]:
# Number of distinct locus tags (nunique() skips missing values by default).
AMR_refseq['AMR_RefSeq'].nunique()

83

## Data from Nguyen et al.

In [8]:
# Collect every distinct protein-family ID that appears as a FASTA header
# (a line starting with '>') in any strain file of the fasta.500.0 directory.
# NOTE(review): this absolute path does not use `root` and orders the folders
# differently from the `sample` path built in the FASTA export cell — confirm
# which layout is the right one.
datadir = f'E:/User/bruna.fistarol/Documents/GitHub/Nguyen_et_al_2020/{bacteria}/fasta.500.0'

# A set de-duplicates while reading, so the headers of all strains are never
# held in memory at once.
family_ids = set()
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line.startswith('>'):
                # Remove only the line terminator; slicing off the last
                # character would truncate the ID of a header that ends the
                # file without a newline.
                family_ids.add(line[1:].rstrip('\n'))

# Sorted, unique IDs in a single-column frame.
plf_500 = pd.DataFrame({'Protein Family ID': sorted(family_ids)})

## RefSeq Mapping

In [9]:
# BRC ID -> RefSeq locus tag lookup, indexed by BRC ID.
refseq = (
    features[['BRC ID', 'RefSeq Locus Tag']]
    .rename(columns={'BRC ID': 'BRC_ID', 'RefSeq Locus Tag': 'RefSeq'})
    .set_index('BRC_ID')
)

In [10]:
# Features whose PLfam is one of the families collected in plf_500.
in_plf_500 = features['PATRIC genus-specific families (PLfams)'].isin(plf_500['Protein Family ID'])

# (RefSeq locus tag, PLfam) pairs for those features; rows with a missing
# value in either column are discarded and the index is renumbered from 0.
plf_map_refseq = (
    features.loc[in_plf_500, ['RefSeq Locus Tag', 'PATRIC genus-specific families (PLfams)']]
    .rename(columns={
        'RefSeq Locus Tag': 'RefSeq',
        'PATRIC genus-specific families (PLfams)': 'PLF',
    })
    .dropna()
    .reset_index(drop = True)
)

In [11]:
# Preview the (RefSeq locus tag, PLfam) pairs.
plf_map_refseq

Unnamed: 0,RefSeq,PLF
0,KPN_00896,PLF_570_00000827
1,KPN_00900,PLF_570_00002125
2,KPN_00902,PLF_570_00000651
3,KPN_00909,PLF_570_00001323
4,KPN_00917,PLF_570_00001430
...,...,...
479,KPN_00081,PLF_570_00000905
480,KPN_00859,PLF_570_00002832
481,KPN_00868,PLF_570_00002022
482,KPN_00884,PLF_570_00000996


## Protein Interaction Network

In [12]:
# Load the interaction table and keep only the two interactor ID columns,
# cast to the "string" dtype and renamed to identifier-friendly labels.
ppi_patric = (
    pd.read_csv('ppi_patric.csv')[['Interactor A ID', 'Interactor B ID']]
    .astype("string")
    .rename(columns={
        'Interactor A ID': 'Interactor_A_ID',
        'Interactor B ID': 'Interactor_B_ID',
    })
)

In [13]:
# Translate both interactor columns from BRC IDs to RefSeq locus tags and keep
# only the interactions whose two partners could be translated.
#
# Differences from the previous row-by-row loop:
#   * works on a copy, so `ppi_patric` is left untouched and the cell can be
#     re-run (plain assignment only aliased the same frame);
#   * the lookup is built once instead of converting `refseq.index` to a list
#     for every row;
#   * BRC IDs that have no RefSeq locus tag are treated as unmapped and the
#     interaction is dropped, instead of inserting a missing value as an
#     interactor ID.
brc_to_refseq = refseq['RefSeq'].dropna().to_dict()

ppi_refseq = ppi_patric.copy()
for column in ['Interactor_A_ID', 'Interactor_B_ID']:
    # IDs absent from the lookup become missing values here ...
    ppi_refseq[column] = ppi_refseq[column].map(brc_to_refseq).astype("string")

# ... and any interaction with a missing partner is removed in one pass.
ppi_refseq = ppi_refseq.dropna().reset_index(drop = True)

In [14]:
# Load the space-separated interaction list and name its three columns.
ppi_string = pd.read_csv('ppi_string.txt', sep = ' ')
ppi_string.columns = ['Interactor_A_ID', 'Interactor_B_ID', 'weight']

# Strip the leading "<taxon id>." prefix from both ID columns.
# The previous pattern '93061.' hard-coded one taxon id (the BRC IDs in this
# notebook carry 272620), left the dot unescaped so it matched any character,
# was not anchored to the start of the ID, and was applied to the whole frame.
# NOTE(review): assumes the IDs look like "<digits>.<locus tag>" — confirm
# against ppi_string.txt.
id_columns = ['Interactor_A_ID', 'Interactor_B_ID']
ppi_string[id_columns] = ppi_string[id_columns].replace(r'^\d+\.', '', regex = True)

In [15]:
# Stack both interaction tables, keep only the two interactor columns and
# renumber the rows from 0.
edge_columns = ['Interactor_A_ID', 'Interactor_B_ID']
stacked_ppi = pd.concat([ppi_refseq, ppi_string], axis = 0)
ppi = stacked_ppi[edge_columns].reset_index(drop = True)

In [16]:
# Save the merged edge list as ppi.csv in the current working directory.
# os.path.join inserts the platform's separator; the hard-coded '\\' only
# produced a valid path on Windows.
ppi.to_csv(path_or_buf = os.path.join(os.getcwd(), 'ppi.csv'))

In [17]:
# Locus tags from plf_map_refseq that occur in the interaction table, first as
# interactor A, then as interactor B.
mapped_tags = plf_map_refseq['RefSeq']
conserved_ppi_A = mapped_tags[mapped_tags.isin(ppi['Interactor_A_ID'])]
conserved_ppi_B = mapped_tags[mapped_tags.isin(ppi['Interactor_B_ID'])]

# Union of both selections (first occurrence kept), as a one-column frame.
conserved_ppi = (
    pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0)
    .drop_duplicates()
    .to_frame()
)

In [18]:
# Preview the locus tags found both in plf_map_refseq and in the interaction table.
conserved_ppi

Unnamed: 0,RefSeq
1,KPN_00900
5,KPN_00923
7,KPN_00932
9,KPN_00092
11,KPN_00970
...,...
420,KPN_04655
422,KPN_04666
432,KPN_04788
466,KPN_00730


In [19]:
# AMR locus tags that occur in the interaction table, as interactor A and as
# interactor B respectively.
amr_tags = AMR_refseq['AMR_RefSeq']
AMR_ppi_A = amr_tags[amr_tags.isin(ppi['Interactor_A_ID'])]
AMR_ppi_B = amr_tags[amr_tags.isin(ppi['Interactor_B_ID'])]

# Both selections stacked into a one-column frame with a fresh 0..n-1 index
# (duplicates are still present at this point).
AMR_ppi = (
    pd.concat([AMR_ppi_A, AMR_ppi_B], axis = 0)
    .to_frame()
    .reset_index(drop=True)
)

In [20]:
# Keep the first occurrence of each AMR locus tag (the index is not renumbered).
AMR_ppi = AMR_ppi.drop_duplicates()

In [21]:
# Build a graph with one edge per row of the interaction table.
ppi_graph = nx.from_pandas_edgelist(
    ppi,
    source = 'Interactor_A_ID',
    target = 'Interactor_B_ID',
)

In [22]:
# Number of edges in the interaction graph.
ppi_graph.number_of_edges()

60016

In [23]:
# Number of nodes in the interaction graph.
ppi_graph.number_of_nodes()

5410

In [24]:
# Index the mapping by RefSeq locus tag so PLfams can be looked up with .loc.
# The guard makes the cell safe to re-run: once 'RefSeq' is the index, it is
# no longer a column and a second set_index('RefSeq') would raise KeyError.
if plf_map_refseq.index.name != 'RefSeq':
    plf_map_refseq = plf_map_refseq.set_index('RefSeq')

## pStep Kernel, p = 1

In [25]:
# Load the precomputed kernel array from disk.
# NOTE(review): the variable is named pStepKernel_2 but the file is
# pStepKernel_1.npy and this section is headed "p = 1" — the name looks like a
# leftover; confirm before renaming.
pStepKernel_2 = np.load('pStepKernel_1.npy')

In [26]:
# Read the names file into a list with one raw line (newline included) per entry.
with open('protein_names1.txt') as names_file:
    protein_names = list(names_file)

In [27]:
# Skip the first line of the file (presumably a header — confirm) and remove
# the newline characters from every remaining entry.
protein_names = [raw_name.replace('\n', '') for raw_name in protein_names[1:]]

In [28]:
# Column vector of zeros with one row per protein name.
n_proteins = len(protein_names)
AMR_bin = np.zeros((n_proteins, 1))

In [29]:
# Set the entry of AMR_bin to 1 for every protein that is listed in AMR_ppi.
# The membership set is built once; previously the column was converted to a
# list and scanned linearly on every iteration of the loop.
amr_in_network = set(AMR_ppi['AMR_RefSeq'])
for i, name in enumerate(protein_names):
    if name in amr_in_network:
        AMR_bin[i] = 1

In [30]:
# Multiply the kernel array by the AMR indicator vector, then put each product
# next to the protein name at the same position.
kernel_scores = np.dot(pStepKernel_2, AMR_bin)
kernel = pd.concat(
    [
        pd.DataFrame(protein_names),
        pd.DataFrame.from_records(kernel_scores),
    ],
    axis = 1,
)

In [31]:
# Label the two columns: protein name and its kernel score.
kernel = kernel.set_axis(['protein', 'kernel'], axis = 'columns')

In [32]:
# Rank proteins by kernel score (highest first), keep only those present in
# conserved_ppi, and renumber the rows so the index equals the rank.
kernel = (
    kernel
    .sort_values(by = 'kernel', ascending = False)
    .loc[lambda ranked: ranked['protein'].isin(conserved_ppi['RefSeq'])]
    .reset_index(drop = True)
)

In [33]:
# Preview the ranked proteins and their kernel scores.
kernel

Unnamed: 0,protein,kernel
0,KPN_00364,0.358447
1,KPN_00372,0.316228
2,KPN_02472,0.282843
3,KPN_01236,0.267261
4,KPN_03326,0.250000
...,...,...
214,KPN_02295,0.000000
215,KPN_02696,0.000000
216,KPN_01196,0.000000
217,KPN_03432,0.000000


Creating sets of 25 genes in descending order of kernel score:

In [34]:
# Cut the ranking into 8 consecutive groups of 25 proteins and store the PLfams
# of each group in a module-level variable kernel_top0 ... kernel_top7.
for i in range(8):
    first, last = 25 * i, 25 * (i + 1)
    ranked_proteins = kernel['protein'].iloc[list(range(first, last))].tolist()
    # Later cells read these dynamically created names (kernel_top{i}).
    globals()[f'kernel_top{i}'] = plf_map_refseq.loc[ranked_proteins, 'PLF']

In [35]:
# Inspect the second group (positions 25-49 of the ranking).
kernel_top1

RefSeq
KPN_04233    PLF_570_00000838
KPN_03410    PLF_570_00003223
KPN_01535    PLF_570_00001512
KPN_00433    PLF_570_00000583
KPN_03735    PLF_570_00001835
KPN_00312    PLF_570_00002960
KPN_04788    PLF_570_00001299
KPN_04630    PLF_570_00000859
KPN_01000    PLF_570_00001600
KPN_00204    PLF_570_00001269
KPN_04655    PLF_570_00000265
KPN_04600    PLF_570_00000377
KPN_03956    PLF_570_00000808
KPN_00900    PLF_570_00002125
KPN_03159    PLF_570_00000712
KPN_00144    PLF_570_00000798
KPN_01071    PLF_570_00001153
KPN_01063    PLF_570_00000246
KPN_01745    PLF_570_00004265
KPN_03800    PLF_570_00000723
KPN_03426    PLF_570_00002375
KPN_04421    PLF_570_00001144
KPN_00230    PLF_570_00001685
KPN_02198    PLF_570_00002274
KPN_02429    PLF_570_00001259
Name: PLF, dtype: object

In [36]:
def _iter_fasta_records(handle):
    """Yield ``(header, sequence_text)`` pairs from an open FASTA file.

    ``header`` is the text after ``>`` without its line terminator.
    ``sequence_text`` is the concatenation of the raw lines (newlines kept)
    that follow the header, up to the next header or the end of the file.
    Lines that precede the first header are ignored; an empty file yields
    nothing.
    """
    header, chunks = None, []
    for line in handle:
        if line.startswith('>'):
            if header is not None:
                yield header, ''.join(chunks)
            # Strip only the newline so a header on an unterminated final
            # line keeps its last character.
            header, chunks = line[1:].rstrip('\n'), []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)


# For each of the 8 ranked groups, write one FASTA file per strain containing
# only the records whose header is a PLfam of that group.
bacdir = f'{root}/Fistarol_2022/1StepKernel{bacteria}'
# NOTE(review): this input path orders the folders differently from the
# `datadir` used when the protein-family IDs were collected — confirm both
# point at the same fasta.500.0 directory.
sample = f'{root}/{bacteria}/Nguyen_et_al_2020/fasta.500.0'

# makedirs creates missing parents and tolerates existing directories, so the
# cell survives a re-run (os.mkdir raised in both situations).
os.makedirs(bacdir, exist_ok = True)
strains = listdir(sample)

for i in range(8):
    rankdir = os.path.join(bacdir, f'top.{i}')
    os.makedirs(rankdir, exist_ok = True)

    # Build the membership set once per group instead of once per header line.
    wanted = set(globals()[f'kernel_top{i}'])

    for strain in strains:
        # 'w' rather than 'a': now that an existing directory is accepted,
        # appending would duplicate every record on a re-run.
        with open(os.path.join(sample, strain), 'r') as sequences, \
                open(os.path.join(rankdir, strain), 'w') as mystrain:
            # Parsing state lives inside the iterator, so nothing carries over
            # from the previous strain when a file is empty.
            for plfam, seq in _iter_fasta_records(sequences):
                if plfam in wanted:
                    mystrain.write('>' + plfam + '\n')
                    mystrain.write(seq)