In [27]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import os

We are going try to recalculate the correlation between the feature importance and distance between conserved genes and AMR genes once we have a more complete PPI

## Conserved Genes used for prediction in Nguyen et. al. 2020

The table below is listening protein families of 10 experiments (each one with 100 non overlapping protein families) selected from a set of conserved genes and used in the paper of Nguyen et. al.

Each protein family has a feature importance value derived from XGBoost, which means a contribution degree from a protein family given to classificate in resistant/susceptible phenotype.

In [5]:
feature_importance = pd.read_excel('saureus_feature_importance.xlsx')

In [6]:
feature_importance

Unnamed: 0,Protein Family ID,Model,Total Feature Importance,Annotation
0,PLF_1279_00001080,1,162.412577,hypothetical protein
1,PLF_1279_00001505,1,81.039855,ABC transporter-like sensor ATP-binding protei...
2,PLF_1279_00001583,1,67.782436,Polysaccharide intercellular adhesin (PIA) bio...
3,PLF_1279_00001118,1,60.701992,"Nickel ABC transporter, substrate-binding prot..."
4,PLF_1279_00001691,1,54.623888,Activator of the mannose operon (transcription...
...,...,...,...,...
995,PLF_1279_00007034,10,0.000000,Cold shock protein of CSP family
996,PLF_1279_00001353,10,0.000000,UPF0398 protein YpsA
997,PLF_1279_00000861,10,0.000000,LSU ribosomal protein L15p (L27Ae)
998,PLF_1279_00000601,10,0.000000,LSU ribosomal protein L30p (L7e)


## Data required

In [7]:
features = pd.read_csv('genome_features.csv')

In [8]:
refseq = features[['BRC ID', 'RefSeq Locus Tag']]
refseq.columns = ['BRC_ID', 'RefSeq']
refseq.set_index('BRC_ID', inplace = True)

In [9]:
ppi_string = pd.read_csv('ppi_string.txt', sep = ' ')
ppi_string.columns = ['Interactor_A_ID', 'Interactor_B_ID', 'weight']
ppi_string.replace('93061.', '', regex = True, inplace = True)

In [10]:
specialty_genes = pd.read_csv('specialty_genes.csv')
specialty_genes = specialty_genes[['BRC ID', 'Property']]
specialty_genes.columns = ['BRC_ID', 'Property']
specialty_genes.set_index('BRC_ID', inplace = True)
AMR_genes = pd.DataFrame(specialty_genes.loc[specialty_genes.Property == 'Antibiotic Resistance'].reset_index()['BRC_ID'])

In [11]:
plf_map_refseq = features[['RefSeq Locus Tag', 'PATRIC genus-specific families (PLfams)']][features['PATRIC genus-specific families (PLfams)'].isin(feature_importance['Protein Family ID'])].reset_index(drop = True)
plf_map_refseq.columns = ['RefSeq', 'PLF']
plf_map_refseq.dropna(inplace = True)
plf_SAOUHSC = plf_map_refseq.drop_duplicates(subset = 'PLF', keep = 'last')
plf_SAOUHSC.reset_index(drop = True, inplace = True)


In [12]:
AMR_refseq = features[['RefSeq Locus Tag']][features['BRC ID'].isin(AMR_genes['BRC_ID'])].reset_index(drop = True)
AMR_refseq.columns = ['AMR_RefSeq']

In [17]:
ppi_refseq = ppi_patric
for i in range(len(ppi_refseq['Interactor_A_ID'])):
    if ppi_refseq['Interactor_A_ID'][i] in list(refseq.index):
        ppi_refseq.at[i, 'Interactor_A_ID'] = refseq.loc[ppi_refseq['Interactor_A_ID'][i]].RefSeq
    else:
        ppi_refseq.drop(inplace = True, labels = i)
        
ppi_refseq.reset_index(inplace = True, drop = True)
        
for i in range(len(ppi_refseq['Interactor_B_ID'])):
    if ppi_refseq['Interactor_B_ID'][i]in (refseq.index):
        ppi_refseq.at[i, 'Interactor_B_ID'] = refseq.loc[ppi_refseq['Interactor_B_ID'][i]].RefSeq
    else:
        ppi_refseq.drop(inplace = True, labels = i)

In [18]:
ppi = pd.DataFrame(pd.concat([ppi_refseq, ppi_string], axis = 0).drop_duplicates())

In [19]:
conserved_ppi_A = plf_SAOUHSC[plf_SAOUHSC['RefSeq'].isin(ppi['Interactor_A_ID'])]['RefSeq']
conserved_ppi_B = plf_SAOUHSC[plf_SAOUHSC['RefSeq'].isin(ppi['Interactor_B_ID'])]['RefSeq']

conserved_ppi = pd.DataFrame(pd.concat([conserved_ppi_A, conserved_ppi_B], axis = 0).drop_duplicates())

In [20]:
conserved_ppi

Unnamed: 0,RefSeq
0,SAOUHSC_01030
1,SAOUHSC_01032
3,SAOUHSC_01039
5,SAOUHSC_00096
6,SAOUHSC_01045
...,...
497,SAOUHSC_02448
546,SAOUHSC_02576
647,SAOUHSC_02883
678,SAOUHSC_02937


In [21]:
AMR_ppi_A = AMR_refseq[AMR_refseq['AMR_RefSeq'].isin(ppi['Interactor_A_ID'])]['AMR_RefSeq']
AMR_ppi_B = AMR_refseq[AMR_refseq['AMR_RefSeq'].isin(ppi['Interactor_B_ID'])]['AMR_RefSeq']

AMR_ppi = pd.DataFrame(pd.concat([AMR_ppi_A, AMR_ppi_B], axis = 0))
AMR_ppi.reset_index(drop=True, inplace=True)

In [22]:
AMR_ppi

Unnamed: 0,AMR_RefSeq
0,SAOUHSC_00099
1,SAOUHSC_01071
2,SAOUHSC_01159
3,SAOUHSC_01260
4,SAOUHSC_01351
...,...
78,SAOUHSC_00668
79,SAOUHSC_00694
80,SAOUHSC_00006
81,SAOUHSC_00921


In [24]:
ppi_graph = nx.from_pandas_edgelist(ppi, 'Interactor_A_ID', 'Interactor_B_ID')

In [28]:
ppi.to_csv(path_or_buf = os.getcwd() + '\\ppi.csv')

In [30]:
pStepKernel_2 = np.load('pStepKernel_2.npy')

with open ('protein_names.txt') as f:
    protein_names = f.readlines()

protein_names = protein_names[1:]
protein_names = [name.replace('\n', '') for name in protein_names]
    
AMR_bin = np.zeros(shape=(len(protein_names), 1))

for i in range(len(protein_names)):
    if protein_names[i] in AMR_ppi['AMR_RefSeq'].tolist():
        AMR_bin[i] = 1

kernel = pd.concat([pd.DataFrame(protein_names), pd.DataFrame.from_records(np.dot(pStepKernel_2, AMR_bin))], axis = 1)
kernel.columns = ['protein', 'kernel']
kernel.sort_values(by = 'kernel', ascending = False, inplace = True)
kernel = kernel[kernel['protein'].isin(conserved_ppi['RefSeq'])]
kernel.reset_index(drop = True, inplace = True)

kernel

Unnamed: 0,protein,kernel
0,SAOUHSC_01420,1.351133
1,SAOUHSC_00315,1.171818
2,SAOUHSC_02003,1.035017
3,SAOUHSC_00316,0.979634
4,SAOUHSC_00831,0.643670
...,...,...
756,SAOUHSC_02608,0.000000
757,SAOUHSC_00479,0.000000
758,SAOUHSC_01141,0.000000
759,SAOUHSC_00014,0.000000


In [34]:
kernel['feature_score'] = pd.Series(dtype='float')

Unnamed: 0,protein,kernel,feature_score
0,SAOUHSC_01420,1.351133,
1,SAOUHSC_00315,1.171818,
2,SAOUHSC_02003,1.035017,
3,SAOUHSC_00316,0.979634,
4,SAOUHSC_00831,0.643670,
...,...,...,...
756,SAOUHSC_02608,0.000000,
757,SAOUHSC_00479,0.000000,
758,SAOUHSC_01141,0.000000,
759,SAOUHSC_00014,0.000000,


In [40]:
kernel

Unnamed: 0,protein,kernel,feature_score
0,SAOUHSC_01420,1.351133,3.348671
1,SAOUHSC_00315,1.171818,10.657664
2,SAOUHSC_02003,1.035017,17.447789
3,SAOUHSC_00316,0.979634,6.811600
4,SAOUHSC_00831,0.643670,9.962976
...,...,...,...
756,SAOUHSC_02608,0.000000,2.647835
757,SAOUHSC_00479,0.000000,61.892540
758,SAOUHSC_01141,0.000000,31.476515
759,SAOUHSC_00014,0.000000,10.350048


In [39]:
for i in range(len(kernel['protein'])):
    kernel['feature_score'][i] = feature_importance[feature_importance['Protein Family ID'] == (plf_map_refseq[plf_map_refseq['RefSeq'] == kernel['protein'][i]]['PLF'].iloc[0])]['Total Feature Importance']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## What is the correlaton between the feature score and the length of the path?

In [41]:
kernel['kernel'].astype('int').corr(kernel['feature_score'].astype('float64'))

-0.0022867033061357567