In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import os
from os import listdir
import random

# Constructing Gene Sets Using Kernel Scores

This following steps allow to construct gene sets to run the model from Nguyen et. al. based on kernel scores of conserved genes obtained considering the distance from a conserved gene to an AMR gene. We are working with the following subspecies choosed to have the same RefSeq on PATRIC database and STRING:

- Salmonella enterica subsp. enterica serovar Typhimurium str. LT2


In [3]:
bacteria = 'Salmonella'

To construct gene sets for a specific bacteria, modify the index below according to the list above

In [4]:
root = os.getcwd() + '\\' + bacteria

# Data From PATRIC

## Genomic Features

The table below contains a list of genomic features, including coding DNA.

Each feature is solely identified by BRC ID and associated to a protein family referred as PATRIC genus-specific families (PLfams). Most of the genes has the associated RefSeq.

In [5]:
features = pd.read_csv(f'{root}\\genome_features.csv')

In [6]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970 entries, 0 to 4969
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Genome                                   4970 non-null   object 
 1   Genome ID                                4970 non-null   float64
 2   Accession                                4970 non-null   object 
 3   BRC ID                                   4970 non-null   object 
 4   RefSeq Locus Tag                         4460 non-null   object 
 5   Alt Locus Tag                            4970 non-null   object 
 6   Feature ID                               4970 non-null   object 
 7   Annotation                               4970 non-null   object 
 8   Feature Type                             4970 non-null   object 
 9   Start                                    4970 non-null   int64  
 10  End                                      4970 no

Through this table, we extract useful data to map protein families referred by Nguyen et. al.:

In [7]:
plf = features[['BRC ID', 'PATRIC genus-specific families (PLfams)']].astype("string")
plf.columns = ['BRC_ID', 'PLFam']
plf.set_index('BRC_ID', inplace = True)
plf

Unnamed: 0_level_0,PLFam
BRC_ID,Unnamed: 1_level_1
fig|99287.12.peg.977,PLF_590_00000012
fig|99287.12.peg.978,PLF_590_00014201
fig|99287.12.peg.979,PLF_590_00019426
fig|99287.12.peg.980,PLF_590_00015851
fig|99287.12.peg.981,PLF_590_00000025
...,...
fig|99287.12.peg.4975,PLF_590_00006047
fig|99287.12.peg.4976,PLF_590_00005031
fig|99287.12.peg.4844,PLF_590_00005081
fig|99287.12.peg.4864,PLF_590_00005446


In [108]:
prot_info = features[['BRC ID', 'PATRIC genus-specific families (PLfams)', 'RefSeq Locus Tag']].astype("string")
prot_info.columns = ['BRC_ID', 'PLFam', 'Prot_ID']
prot_info.set_index('BRC_ID', inplace = True)
prot_info

Unnamed: 0_level_0,PLFam,Prot_ID
BRC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
fig|99287.12.peg.977,PLF_590_00000012,STM0927
fig|99287.12.peg.978,PLF_590_00014201,STM0928
fig|99287.12.peg.979,PLF_590_00019426,
fig|99287.12.peg.980,PLF_590_00015851,
fig|99287.12.peg.981,PLF_590_00000025,STM0930
...,...,...
fig|99287.12.peg.4975,PLF_590_00006047,PSLT110
fig|99287.12.peg.4976,PLF_590_00005031,PSLT111
fig|99287.12.peg.4844,PLF_590_00005081,PSLT001
fig|99287.12.peg.4864,PLF_590_00005446,PSLT015


## Specialty Genes

The table containing specialty genes relates several genomic features to a relevant property. Here the table is filtered by the antibiotic resistance property

In [118]:
AMR = pd.read_csv(f'{root}\\specialty_genes.csv')['BRC ID']

AMR_refseq = pd.DataFrame(prot_info.loc[AMR, 'Prot_ID'])

In [119]:
AMR_refseq['Prot_ID'].nunique()

72

## Data from Nguyen et. al.

In [10]:
plf_500 = []

datadir = root + '\\Nguyen_et_al_2020\\fasta.500.0'
for strain in listdir(datadir):
    with open(os.path.join(datadir, strain), 'r') as sequences:
        for line in sequences:
            if line[0] == '>':
                plf_500.append(line[1:len(line)-1])
                
plf_500 = pd.DataFrame(np.unique(plf_500))
plf_500.columns = ['Protein Family ID']

## RefSeq Mapping

In [109]:
refseq = features[['BRC ID', 'RefSeq Locus Tag']]
refseq.columns = ['BRC_ID', 'Prot_ID']
refseq.set_index('BRC_ID', inplace = True)

In [110]:
plf_map_refseq = features[['RefSeq Locus Tag', 'PATRIC genus-specific families (PLfams)']][features['PATRIC genus-specific families (PLfams)'].isin(plf_500['Protein Family ID'])].reset_index(drop = True)
plf_map_refseq.columns = ['Prot_ID', 'PLF']
plf_map_refseq.dropna(inplace = True)
plf_map_refseq.reset_index(drop = True, inplace = True)

In [111]:
plf_map_refseq

Unnamed: 0,Prot_ID,PLF
0,STM0934,PLF_590_00001917
1,STM0936,PLF_590_00003070
2,STM0942,PLF_590_00000215
3,STM0961,PLF_590_00001933
4,STM0969,PLF_590_00001385
...,...,...
487,STM0869,PLF_590_00002654
488,STM0874,PLF_590_00000196
489,STM0876,PLF_590_00001271
490,STM0889,PLF_590_00000747


## Protein Interaction Network

In [21]:
#FROM TRANSFERRED LINKS

transferred_links = pd.read_csv(f'{root}\\salmonella.interolog', sep = '\t', names = ['interactor1', 'interactor2', 'source1', 'source2', 
                                                         'interolog_quality', 'evalue', 'source_org_id', 'det_type', 
                                                         'int_type', 'pubmed'])
ppi = pd.DataFrame(transferred_links[['interactor1', 'interactor2']])
ppi.columns = ['Interactor_A_ID', 'Interactor_B_ID']
#ppi.to_csv(path_or_buf = root + '\\ppi_transferred.csv')

In [107]:
ppi

Unnamed: 0,Interactor_A_ID,Interactor_B_ID,weight
0,STM0002,STM0003,323
1,STM0003,STM0002,323
2,STM0002,STM0067,411
3,STM0067,STM0002,411
4,STM0002,STM0097,436
...,...,...,...
105069,STM4585,STM4578,403
105070,STM4588,STM4589,627
105071,STM4589,STM4588,627
105072,STM4592,STM4593,173


In [97]:
#FROM STRING DB

ppi = pd.read_csv(f'{root}\\ppi_string.txt', sep = ' ')
ppi.columns = ['Interactor_A_ID', 'Interactor_B_ID', 'weight']

for i in ppi.index:
    ppi['Interactor_A_ID'][i] = ppi['Interactor_A_ID'][i].split('.')[1]
    ppi['Interactor_B_ID'][i] = ppi['Interactor_B_ID'][i].split('.')[1]
    
#ppi.to_csv(path_or_buf = root + '\\ppi_string.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [112]:
conserved_ppi_A = plf_map_refseq[plf_map_refseq['Prot_ID'].isin(ppi['Interactor_A_ID'])]['Prot_ID']
conserved_ppi_B = plf_map_refseq[plf_map_refseq['Prot_ID'].isin(ppi['Interactor_B_ID'])]['Prot_ID']

conserved_ppi = pd.DataFrame(pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0).drop_duplicates())

In [113]:
conserved_ppi.reset_index(inplace = True, drop = True)

In [114]:
conserved_ppi.drop_duplicates(inplace = True)

In [115]:
conserved_ppi

Unnamed: 0,Prot_ID
0,STM0934
1,STM0936
2,STM0942
3,STM0961
4,STM0969
...,...
434,STM0869
435,STM0874
436,STM0876
437,STM0889


In [120]:
AMR_ppi_A = AMR_refseq[AMR_refseq['Prot_ID'].isin(ppi['Interactor_A_ID'])]['Prot_ID']
AMR_ppi_B = AMR_refseq[AMR_refseq['Prot_ID'].isin(ppi['Interactor_B_ID'])]['Prot_ID']

AMR_ppi = pd.DataFrame(pd.concat([AMR_ppi_A, AMR_ppi_B], axis = 0))
AMR_ppi.reset_index(drop=True, inplace=True)

In [123]:
AMR_ppi['Prot_ID'].nunique()

62

In [122]:
AMR_ppi.drop_duplicates(inplace = True)

In [124]:
ppi_graph = nx.from_pandas_edgelist(ppi, 'Interactor_A_ID', 'Interactor_B_ID')

In [125]:
ppi_graph.number_of_edges()

52537

In [126]:
ppi_graph.number_of_nodes()

3498

In [128]:
plf_map_refseq = plf_map_refseq.set_index('Prot_ID')

## Removing AMR anotated genes from conserved genes list

As we saw in the previous experiment using path lengths, there are 10 conserved genes anotated as AMR genes. Let's remove them from the conserved genes list:

In [127]:
conserved_ppi[conserved_ppi['Prot_ID'].isin(list(set(AMR_ppi['Prot_ID']) & set(conserved_ppi['Prot_ID'])))]

Unnamed: 0,Prot_ID
2,STM0942
89,STM1700
146,STM2272
198,STM2814
236,STM3186
259,STM3390
262,STM3441
357,STM4292
358,STM4293
389,STM0046


In [129]:
conserved_amr_idx = conserved_ppi[conserved_ppi['Prot_ID'].isin(list(set(AMR_ppi['Prot_ID']) & set(conserved_ppi['Prot_ID'])))].index

In [130]:
conserved_ppi.drop(labels=conserved_amr_idx, axis=0, inplace = True)

## pStep Kernel

After running the R code to get kernel matrix with p = 1 or p = 2, do the following steps:

In [131]:
p = '1'

In [132]:
pStepKernel = np.load(f'{root}\\pStepKernel_{p}_string.npy')

In [133]:
with open (f'{root}\\protein_names_string.txt') as f:
    protein_names = f.readlines()

In [134]:
protein_names = protein_names[1:]
protein_names = [name.replace('\n', '') for name in protein_names]

In [135]:
AMR_bin = np.zeros(shape=(len(protein_names), 1))

for i in range(len(protein_names)):
    if protein_names[i] in AMR_ppi['Prot_ID'].tolist():
        AMR_bin[i] = 1
        
kernel = pd.concat([pd.DataFrame(protein_names), pd.DataFrame.from_records(np.dot(pStepKernel, AMR_bin))], axis = 1)

kernel.columns = ['protein', 'kernel']

kernel.sort_values(by = 'kernel', ascending = False, inplace = True)
kernel = kernel[kernel['protein'].isin(conserved_ppi['Prot_ID'])]
kernel.reset_index(drop = True, inplace = True)

In [136]:
gene_set = kernel.iloc[range(20)]['protein']
globals()[f'kernel_set_0'] = plf_map_refseq.loc[list(gene_set)]['PLF']

bacdir = f'{root}/GeneSets/String/{p}StepKernel_20genes'

os.mkdir(bacdir)

newdir = f'set_0'
setdir = os.path.join(bacdir, newdir)
os.mkdir(setdir)

sample = datadir
for strain in listdir(sample):
    with open(os.path.join(setdir, strain), 'a') as mystrain:
        with open(os.path.join(sample, strain), 'r') as sequences:
            first_loop = True
            for line in sequences:
                if line[0] == '>':
                    if first_loop:
                        plfam = line[1:len(line)-1]
                        seq = ''
                        first_loop = False
                        continue
                    if plfam in list((globals()[f'kernel_set_0'])):
                        mystrain.write('>' + plfam + '\n')
                        mystrain.write(seq)
                    plfam = line[1:len(line)-1]
                    seq = ''
                else:
                    seq += line
            if plfam in list((globals()[f'kernel_set_0'])):
                        mystrain.write('>' + plfam + '\n')
                        mystrain.write(seq)

# Constructing gene sets randomizing AMR

In [84]:
%run E:\\User\\bruna.fistarol\\Documents\\MasterProject\\GuneyDistance.py

In [137]:
#ppi['w'] = 1.0
ppi.columns = ['p1', 'p2', 'w']

G = nx.from_pandas_edgelist(ppi, 'p1', 'p2', edge_attr='w')

s1 = set(AMR_ppi['Prot_ID'])
s2 = set(ppi['p1']) | set(ppi['p2'])
s3 = s1 & s2
samples = GuneyDistance(G).get_random_nodes(s3, seed=0) #it creates 1000 sets of possible new fake AMR genes
random_AMR = pd.DataFrame(samples)

In [172]:
AMR_ppi_random = pd.DataFrame(random_AMR.iloc[98]) #to create another group, change the index
AMR_ppi_random.columns = ['AMR_RefSeq']

We can check if a set of fake AMR genes has some real AMR gene:

In [118]:
set(AMR_ppi['AMR_RefSeq']) & set(AMR_ppi_random['AMR_RefSeq']) 

set()

Now, substitute AMR_ppi for AMR_ppi_random to create randomized groups:

In [138]:
for j in range(1, 31):
    
    AMR_ppi_random = pd.DataFrame(random_AMR.iloc[j])
    AMR_ppi_random.columns = ['Prot_ID']

    AMR_bin = np.zeros(shape=(len(protein_names), 1))

    for i in range(len(protein_names)):
        if protein_names[i] in AMR_ppi_random['Prot_ID'].tolist():
            AMR_bin[i] = 1

    kernel = pd.concat([pd.DataFrame(protein_names), pd.DataFrame.from_records(np.dot(pStepKernel, AMR_bin))], axis = 1)

    kernel.columns = ['protein', 'kernel']

    kernel.sort_values(by = 'kernel', ascending = False, inplace = True)
    kernel = kernel[kernel['protein'].isin(conserved_ppi['Prot_ID'])]
    kernel.reset_index(drop = True, inplace = True)

    gene_set = kernel.iloc[range(20)]['protein']
    globals()[f'kernel_set_{j}'] = plf_map_refseq.loc[list(gene_set)]['PLF']

    bacdir = f'{root}/GeneSets/String/{p}StepKernel_20genes'

    newdir = f'set_{j}'
    setdir = os.path.join(bacdir, newdir)
    os.mkdir(setdir)

    sample = datadir
    for strain in listdir(sample):
        with open(os.path.join(setdir, strain), 'a') as mystrain:
            with open(os.path.join(sample, strain), 'r') as sequences:
                first_loop = True
                for line in sequences:
                    if line[0] == '>':
                        if first_loop:
                            plfam = line[1:len(line)-1]
                            seq = ''
                            first_loop = False
                            continue
                        if plfam in list((globals()[f'kernel_set_{j}'])):
                            mystrain.write('>' + plfam + '\n')
                            mystrain.write(seq)
                        plfam = line[1:len(line)-1]
                        seq = ''
                    else:
                        seq += line
                if plfam in list((globals()[f'kernel_set_{j}'])):
                            mystrain.write('>' + plfam + '\n')
                            mystrain.write(seq)

In [174]:
for j in range(1, 21):
    
    AMR_ppi_random = pd.DataFrame(random_AMR.iloc[j])
    AMR_ppi_random.columns = ['AMR_RefSeq']

    AMR_bin = np.zeros(shape=(len(protein_names), 1))

    for i in range(len(protein_names)):
        if protein_names[i] in AMR_ppi_random['AMR_RefSeq'].tolist():
            AMR_bin[i] = 1

    kernel = pd.concat([pd.DataFrame(protein_names), pd.DataFrame.from_records(np.dot(pStepKernel, AMR_bin))], axis = 1)

    kernel.columns = ['protein', 'kernel']

    kernel.sort_values(by = 'kernel', ascending = False, inplace = True)
    kernel = kernel[kernel['protein'].isin(conserved_ppi['RefSeq'])]
    kernel.reset_index(drop = True, inplace = True)

    gene_set = kernel.iloc[range(20)]['protein']
    globals()[f'kernel_set_{j}'] = plf_map_refseq.loc[list(gene_set)]['PLF']

    bacdir = f'{root}/GeneSets/{p}StepKernel_20genes'

    newdir = f'set_{j}'
    setdir = os.path.join(bacdir, newdir)
    os.mkdir(setdir)

    sample = datadir
    for strain in listdir(sample):
        with open(os.path.join(setdir, strain), 'a') as mystrain:
            with open(os.path.join(sample, strain), 'r') as sequences:
                first_loop = True
                for line in sequences:
                    if line[0] == '>':
                        if first_loop:
                            plfam = line[1:len(line)-1]
                            seq = ''
                            first_loop = False
                            continue
                        if plfam in list((globals()[f'kernel_set_{j}'])):
                            mystrain.write('>' + plfam + '\n')
                            mystrain.write(seq)
                        plfam = line[1:len(line)-1]
                        seq = ''
                    else:
                        seq += line
                if plfam in list((globals()[f'kernel_set_{j}'])):
                            mystrain.write('>' + plfam + '\n')
                            mystrain.write(seq)