Create a table storing the DockQ metrics of the best predicted antibody-antigen complexes

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import numpy as np
import os

# Directory scanned by list_files() below; every regular file found in it is
# passed to get_top_metrics() and read as a metrics CSV.
# os.path.expanduser replaces the leading '~' with the current user's home directory.
parent_dir = os.path.expanduser('~/Analysis_scenarios/Metrics_files_Bound')

def get_top_metrics(metrics_file):
    """Extract the best-DockQ row and the first acceptable-or-better row of a metrics CSV.

    Parameters
    ----------
    metrics_file : str or path-like
        CSV file read with ``pd.read_csv``. It must provide at least the
        'Quality' and 'DockQ' columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_best``: one-row frame holding the row with the highest 'DockQ'.
        ``df_acceptable``: one-row frame holding the first row (lowest row
        index) whose 'Quality' is 'Acceptable', 'Medium' or 'High'. When no
        row qualifies, a single all-NaN row with the summary columns is
        returned instead so callers always get exactly one row per file.
        Both frames carry an added 'Index' column with the row's index in the
        file.
    """
    df = pd.read_csv(metrics_file)
    df['Index'] = df.index

    # Row labels of every model rated Acceptable or better.
    is_acceptable = df['Quality'].isin(['Acceptable', 'Medium', 'High'])
    acceptable_models = df.index[is_acceptable]

    best_model = df['DockQ'].idxmax()
    df_best = df.iloc[[best_model], :]

    # Branch on whether any acceptable model exists instead of using a magic
    # index (100) as the "none found" marker: a real hit at row 100 would
    # otherwise be silently reported as missing.
    if len(acceptable_models) > 0:
        acceptable_model = acceptable_models.min()
        df_acceptable = df.iloc[[acceptable_model], :]
    else:
        cols2 = ['Structure_file','Haddock_score','DockQ','Quality','iRMS','lRMS','Fnat', 'Index']
        empty_df = pd.DataFrame(columns=cols2)
        # Placeholder row of NaNs keeps the per-file row count at one.
        empty_df.loc[len(empty_df)] = [np.nan] * len(empty_df.columns)
        df_acceptable = empty_df
    return df_best, df_acceptable

def list_files(parent_dir):
    """Return the paths of the regular files located directly inside parent_dir.

    Sub-directories are skipped (no recursion). Each returned path is
    ``parent_dir`` joined with the entry name, in the order reported by
    ``os.listdir``.
    """
    candidate_paths = (os.path.join(parent_dir, name) for name in os.listdir(parent_dir))
    return [path for path in candidate_paths if os.path.isfile(path)]

# Build the per-file summary table: for every metrics file, one row for the
# best-DockQ model and one row for the first acceptable-or-better model.
metrics_files = list_files(parent_dir)

cols = ['Structure_file','Haddock_score','DockQ','Quality','iRMS','lRMS','Fnat', 'Index']


def _stack_rows(frames, base_cols):
    """Stack one-row frames into one table whose leading columns are base_cols."""
    if not frames:
        # No input files: keep the empty, correctly-labelled frame.
        return pd.DataFrame(columns=base_cols)
    stacked = pd.concat(frames, axis=0, ignore_index=True)
    # Keep base_cols first (in their declared order), then any additional
    # columns in order of appearance, so positional column numbers stay stable.
    extra_cols = [c for c in stacked.columns if c not in base_cols]
    return stacked.reindex(columns=list(base_cols) + extra_cols)


# Collect the rows first and concatenate once, rather than re-concatenating
# onto a growing (and initially empty) DataFrame on every iteration.
best_rows = []
acceptable_rows = []
for file in metrics_files:
    df_best, df_acceptable = get_top_metrics(file)
    best_rows.append(df_best)
    acceptable_rows.append(df_acceptable)

df_best_all = _stack_rows(best_rows, cols)
df_acceptable_all = _stack_rows(acceptable_rows, cols)

# Side-by-side layout: best-model columns first, acceptable-model columns after.
# ignore_index=True on axis=1 replaces the column names with 0..N-1, which is
# what the downstream code indexes by (e.g. column 0 = structure file name).
df_summary = pd.concat([df_best_all, df_acceptable_all], axis=1, ignore_index=True)
print(df_summary)
df_summary.to_csv('ranks_metrics.csv', index=False)

                           0         1      2           3       4       5   \
0     3HI6_hv_epi_af_131w.pdb  -30.9779  0.105   Incorrect  13.575  23.376   
1        3HI6_hv_epi_137w.pdb  -12.0275  0.173   Incorrect   4.981  14.008   
2       3MXW_AF_REAL_100w.pdb -154.1244  0.736      Medium   1.200   2.722   
3   3G6D_hv_preda_af_136w.pdb  -25.7512  0.196   Incorrect   5.201   9.673   
4        3HI6_AF_REAL_78w.pdb  -67.5700  0.105   Incorrect  13.622  23.258   
5   3G6D_hv_predp_af_150w.pdb  -39.7811  0.221   Incorrect   4.733   9.291   
6      3EO1_hv_epi_af_32w.pdb  -55.3770  0.140   Incorrect   5.785  28.850   
7       3HI6_hv_predp_56w.pdb  -53.7504  0.376  Acceptable   3.437   7.247   
8          2VXT_real_159w.pdb -186.2408  0.665      Medium   1.476   3.686   
9          3MXW_real_158w.pdb -203.1726  0.686      Medium   1.765   3.956   
10     2VXT_hv_predp_164w.pdb  -83.0996  0.252  Acceptable   4.656   8.773   
11          3HI6_real_94w.pdb  -63.7305  0.425  Acceptable   3.0

  df_best_all = pd.concat([df_best_all, df_best], axis=0, ignore_index=True)
  df_acceptable_all = pd.concat([df_acceptable_all, df_acceptable], axis=0, ignore_index=True)


In [9]:
# Split the structure file name held in column 0 into a complex ID (first four
# characters) and a scenario tag (everything from the sixth character on),
# e.g. '3HI6_hv_epi_af_131w.pdb' -> Complex '3HI6', Scenario 'hv_epi_af_131w.pdb'.
# NOTE(review): Scenario keeps the trailing '_<N>w.pdb' part, so its values do
# not equal the keys of scen_csv as-is, and the sort below orders on the full
# suffix -- confirm this is intended.
structure_names = df_summary[0]
df_summary['Complex'] = structure_names.str.slice(stop=4)
df_summary['Scenario'] = structure_names.str.slice(start=5)

# Display labels for the scenarios and the complexes, in the order listed here.
scenarios = [
    'HV_Pred-P_AF',
    'HV_Pred-A_AF',
    'HV_Pred-P',
    'HV_Pred-A',
    'Real_AF',
    'HV_Epi_AF',
    'HV_Epi',
    'Real',
]
complex_list = ['3MXW', '2VXT', '3EO1', '3HI6', '3G6D']

# Scenario tag as it appears in the file names -> display label.
scen_csv = {
    'real': 'Real',
    'hv_epi': 'HV_Epi',
    'hv_epi_af': 'HV_Epi_AF',
    'AF_REAL': 'Real_AF',
    'hv_preda': 'HV_Pred-A',
    'hv_predp': 'HV_Pred-P',
    'hv_preda_af': 'HV_Pred-A_AF',
    'hv_predp_af': 'HV_Pred-P_AF',
}

# Docking categories
categories = ['Incorrect', 'Acceptable', 'Medium']

# Make Complex an ordered categorical so it sorts by complex_list position
# rather than alphabetically; IDs outside complex_list become NaN.
complex_dtype = pd.CategoricalDtype(categories=complex_list, ordered=True)
df_summary['Complex'] = df_summary['Complex'].astype(complex_dtype)
df_summary = df_summary.sort_values(by=['Scenario', 'Complex'], ascending=False)

df_summary.to_csv('ranks_metrics_ordered.csv', index=False)