In [1]:
import pandas as pd
import numpy as np
import networkx as nx

We are going try to recalculate the correlation between the feature importance and distance between conserved genes and AMR genes once we have a more complete PPI

## Conserved Genes used for prediction in Nguyen et. al. 2020

The table below is listening protein families of 10 experiments (each one with 100 non overlapping protein families) selected from a set of conserved genes and used in the paper of Nguyen et. al.

Each protein family has a feature importance value derived from XGBoost, which means a contribution degree from a protein family given to classificate in resistant/susceptible phenotype.

In [2]:
feature_importance = pd.read_excel('saureus_feature_importance.xlsx')

In [3]:
feature_importance

Unnamed: 0,Protein Family ID,Model,Total Feature Importance,Annotation
0,PLF_1279_00001080,1,162.412577,hypothetical protein
1,PLF_1279_00001505,1,81.039855,ABC transporter-like sensor ATP-binding protei...
2,PLF_1279_00001583,1,67.782436,Polysaccharide intercellular adhesin (PIA) bio...
3,PLF_1279_00001118,1,60.701992,"Nickel ABC transporter, substrate-binding prot..."
4,PLF_1279_00001691,1,54.623888,Activator of the mannose operon (transcription...
...,...,...,...,...
995,PLF_1279_00007034,10,0.000000,Cold shock protein of CSP family
996,PLF_1279_00001353,10,0.000000,UPF0398 protein YpsA
997,PLF_1279_00000861,10,0.000000,LSU ribosomal protein L15p (L27Ae)
998,PLF_1279_00000601,10,0.000000,LSU ribosomal protein L30p (L7e)


## Data required

In [4]:
features = pd.read_csv('genome_features.csv')

In [5]:
refseq = features[['BRC ID', 'RefSeq Locus Tag']]
refseq.columns = ['BRC_ID', 'RefSeq']
refseq.set_index('BRC_ID', inplace = True)

In [6]:
ppi_string = pd.read_csv('ppi_string.txt', sep = ' ')
ppi_string.columns = ['Interactor_A_ID', 'Interactor_B_ID', 'weight']
ppi_string.replace('93061.', '', regex = True, inplace = True)

In [7]:
specialty_genes = pd.read_csv('specialty_genes.csv')
specialty_genes = specialty_genes[['BRC ID', 'Property']]
specialty_genes.columns = ['BRC_ID', 'Property']
specialty_genes.set_index('BRC_ID', inplace = True)
AMR_genes = pd.DataFrame(specialty_genes.loc[specialty_genes.Property == 'Antibiotic Resistance'].reset_index()['BRC_ID'])

In [8]:
plf_map_refseq = features[['RefSeq Locus Tag', 'PATRIC genus-specific families (PLfams)']][features['PATRIC genus-specific families (PLfams)'].isin(feature_importance['Protein Family ID'])].reset_index(drop = True)
plf_map_refseq.columns = ['RefSeq', 'PLF']
plf_map_refseq.dropna(inplace = True)
plf_SAOUHSC = plf_map_refseq.drop_duplicates(subset = 'PLF', keep = 'last')
plf_SAOUHSC.reset_index(drop = True, inplace = True)


In [11]:
AMR_refseq = features[['RefSeq Locus Tag']][features['BRC ID'].isin(AMR_genes['BRC_ID'])].reset_index(drop = True)
AMR_refseq.columns = ['AMR_RefSeq']

In [12]:
ppi_patric = pd.read_csv('ppi_patric.csv')
ppi_patric = ppi_patric[['Interactor A ID', 'Interactor B ID']].astype("string")
ppi_patric.columns = ['Interactor_A_ID', 'Interactor_B_ID']
ppi_patric.drop(2085, axis = 0, inplace = True)
ppi_patric.reset_index(drop=True, inplace=True)
ppi_refseq = ppi_patric

for i in range(len(ppi_patric['Interactor_A_ID'])):
    ppi_refseq.at[i, 'Interactor_A_ID'] = refseq.loc[ppi_patric['Interactor_A_ID'][i]].RefSeq
        
for i in range(len(ppi_patric['Interactor_B_ID'])):
    ppi_refseq.at[i, 'Interactor_B_ID'] = refseq.loc[ppi_patric['Interactor_B_ID'][i]].RefSeq 

In [13]:
ppi = pd.DataFrame(pd.concat([ppi_refseq, ppi_string], axis = 0).drop_duplicates())

In [14]:
conserved_ppi_A = plf_SAOUHSC[plf_SAOUHSC['RefSeq'].isin(ppi['Interactor_A_ID'])]['RefSeq']
conserved_ppi_B = plf_SAOUHSC[plf_SAOUHSC['RefSeq'].isin(ppi['Interactor_B_ID'])]['RefSeq']

conserved_ppi = pd.DataFrame(pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0).drop_duplicates())

In [15]:
conserved_ppi

Unnamed: 0,RefSeq
5,SAOUHSC_01030
6,SAOUHSC_01032
7,SAOUHSC_01036
8,SAOUHSC_01039
9,SAOUHSC_01044
...,...
823,SAOUHSC_00565
829,SAOUHSC_00605
855,SAOUHSC_00659
883,SAOUHSC_00754


In [16]:
AMR_ppi_A = AMR_refseq[AMR_refseq['AMR_RefSeq'].isin(ppi['Interactor_A_ID'])]['AMR_RefSeq']
AMR_ppi_B = AMR_refseq[AMR_refseq['AMR_RefSeq'].isin(ppi['Interactor_B_ID'])]['AMR_RefSeq']

AMR_ppi = pd.DataFrame(pd.concat([AMR_ppi_A, AMR_ppi_B], axis = 0))
AMR_ppi.reset_index(drop=True, inplace=True)

In [17]:
AMR_ppi

Unnamed: 0,AMR_RefSeq
0,SAOUHSC_00099
1,SAOUHSC_01071
2,SAOUHSC_01159
3,SAOUHSC_01260
4,SAOUHSC_01351
...,...
88,SAOUHSC_00694
89,SAOUHSC_00703
90,SAOUHSC_00006
91,SAOUHSC_00921


In [18]:
ppi_info = pd.DataFrame(columns = ['Conserved Gene', 'Shortest Path to an AMR gene (length)',])

ppi_info['Conserved Gene'] = conserved_ppi.reset_index(drop = True)['RefSeq']

In [19]:
ppi_graph = nx.from_pandas_edgelist(ppi, 'Interactor_A_ID', 'Interactor_B_ID')

In [20]:
ppi_graph = nx.from_pandas_edgelist(ppi, 'Interactor_A_ID', 'Interactor_B_ID')

idx = 0
for i in conserved_ppi['RefSeq']:
    lengths = []
    for j in AMR_ppi['AMR_RefSeq']:
        if nx.has_path(ppi_graph, i, j):
            lengths.append(nx.shortest_path_length(ppi_graph, i, j))
    if lengths:        
        ppi_info['Shortest Path to an AMR gene (length)'][idx] = min(lengths)
        
    idx += 1

In [23]:
ppi_info

Unnamed: 0,Conserved Gene,Shortest Path to an AMR gene (length)
0,SAOUHSC_01030,2
1,SAOUHSC_01032,2
2,SAOUHSC_01036,1
3,SAOUHSC_01039,2
4,SAOUHSC_01044,2
...,...,...
869,SAOUHSC_00565,3
870,SAOUHSC_00605,2
871,SAOUHSC_00659,2
872,SAOUHSC_00754,3


In [36]:
ppi_info['Feature Score'] = pd.Series(dtype='float')
for i in range(len(ppi_info['Conserved Gene'])):
    ppi_info['Feature Score'][i] = feature_importance[feature_importance['Protein Family ID'] == (plf_map_refseq[plf_map_refseq['RefSeq'] == ppi_info['Conserved Gene'][i]]['PLF'].iloc[0])]['Total Feature Importance']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
ppi_info

Unnamed: 0,Conserved Gene,Shortest Path to an AMR gene (length),Feature Score
0,SAOUHSC_01030,2,3.044391
1,SAOUHSC_01032,2,8.875277
2,SAOUHSC_01036,1,0.000000
3,SAOUHSC_01039,2,1.987799
4,SAOUHSC_01044,2,0.790515
...,...,...,...
869,SAOUHSC_00565,3,8.636960
870,SAOUHSC_00605,2,4.103818
871,SAOUHSC_00659,2,16.589901
872,SAOUHSC_00754,3,9.519017
