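This notebook parses the text files produced by piping DockQ output to disk. It extracts the model identifiers and scores (Fnat, Fnonnat, iRMS, LRMS, DockQ) from each file, assigns a CAPRI quality class, and saves the results as CSV.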

In [20]:
import os
import pandas as pd

In [21]:
# Define one parser function for each kind of line in a DockQ output file

def autoinhibitory_pdb_full_depth(line):
    """Extract the UniProt and PDB identifiers from a DockQ "Model" line.

    Line looks like: "Model : ./autoinhibitory/native/{uniprot}_{pdb}.pdb"

    The file name is taken as the last path component, so the result does
    not depend on how deeply the file is nested.

    Args:
        line: The "Model" line of a DockQ output file.

    Returns:
        dict with the keys 'uniprot' and 'pdb'.

    Raises:
        IndexError: If the line has fewer than three whitespace-separated
            tokens, or the file name contains no underscore.
    """
    # Third token is the path: ['Model', ':', '<path>']
    fp = line.split()[2]
    # Last path component, regardless of directory depth
    fn = fp.rsplit('/', 1)[-1]
    ids = fn.split('_')

    # The extension is stripped from the second id: '1abc.pdb' -> '1abc'
    return {'uniprot': ids[0], 'pdb': ids[1].split('.')[0]}


def pdb_cluster_ids(line):
    """Extract the cluster and PDB identifiers from a DockQ "Model" line.

    Line looks like:
    "Model  : /share/scratch/bjechow/dockq/md_pdb_cluster/native/{cluster}_{pdb}.pdb"

    The file name is taken as the last path component rather than from a
    fixed position in the path, so both the path shown above and more deeply
    nested paths are handled.

    Args:
        line: The "Model" line of a DockQ output file.

    Returns:
        dict with the keys 'cluster' and 'pdb'.

    Raises:
        IndexError: If the line has fewer than three whitespace-separated
            tokens, or the file name contains no underscore.
    """
    # Third token is the path: ['Model', ':', '<path>']
    fp = line.split()[2]
    # Last path component, regardless of directory depth
    fn = fp.rsplit('/', 1)[-1]
    ids = fn.split('_')

    # The extension is stripped from the second id: '1abc.pdb' -> '1abc'
    return {'cluster': ids[0], 'pdb': ids[1].split('.')[0]}

def full_depth_cluster_ids(line):
    """Extract the UniProt and cluster identifiers from a DockQ "Model" line.

    Line looks like:
    "Model  : /share/scratch/bjechow/dockq/md_full_depth_cluster/native/{uniprot}_{cluster}.pdb"

    The file name is taken as the last path component, so the result does
    not depend on how deeply the file is nested.

    Args:
        line: The "Model" line of a DockQ output file.

    Returns:
        dict with the keys 'uniprot' and 'cluster'.

    Raises:
        IndexError: If the line has fewer than three whitespace-separated
            tokens, or the file name contains no underscore.
    """
    # Third token is the path: ['Model', ':', '<path>']
    fp = line.split()[2]
    # Last path component, regardless of directory depth
    fn = fp.rsplit('/', 1)[-1]
    ids = fn.split('_')

    # The extension is stripped from the second id: '003.pdb' -> '003'
    return {'uniprot': ids[0], 'cluster': ids[1].split('.')[0]}

def pdb_full_depth_ids(line):
    """Extract the UniProt and PDB identifiers from a DockQ "Model" line.

    Line looks like:
    "Model : /share/scratch/bjechow/dockq/md_pdb_full_depth/native/{uniprot}_{pdb}.pdb"

    The file name is taken as the last path component, so the result does
    not depend on how deeply the file is nested.

    Args:
        line: The "Model" line of a DockQ output file.

    Returns:
        dict with the keys 'uniprot' and 'pdb'.

    Raises:
        IndexError: If the line has fewer than three whitespace-separated
            tokens, or the file name contains no underscore.
    """
    # Third token is the path: ['Model', ':', '<path>']
    fp = line.split()[2]
    # Last path component, regardless of directory depth
    fn = fp.rsplit('/', 1)[-1]
    ids = fn.split('_')

    # The extension is stripped from the second id: '1abc.pdb' -> '1abc'
    return {'uniprot': ids[0], 'pdb': ids[1].split('.')[0]}

def get_fnat(line):
    """Parse the "Fnat" line of a DockQ output file.

    Line looks like: "Fnat 0.804 123 correct of 153 native contacts"

    Args:
        line: The "Fnat" line.

    Returns:
        dict with 'fnat' (float, token 1), 'fnat_correct' (int, token 2)
        and 'fnat_total' (int, token 5).
    """
    tokens = line.split()

    return {
        'fnat': float(tokens[1]),
        'fnat_correct': int(tokens[2]),
        'fnat_total': int(tokens[5]),
    }

def get_fnonnat(line):
    """Parse the "Fnonnat" line of a DockQ output file.

    Line looks like: "Fnonnat 0.134 19 non-native of 142 model contacts"

    Args:
        line: The "Fnonnat" line.

    Returns:
        dict with 'fnonnat' (float, token 1), 'fnonnat_nnative' (int,
        token 2) and 'fnonnat_model' (int, token 5).
    """
    tokens = line.split()

    return {
        'fnonnat': float(tokens[1]),
        'fnonnat_nnative': int(tokens[2]),
        'fnonnat_model': int(tokens[5]),
    }

def get_irms(line):
    """Parse the "iRMS" line of a DockQ output file.

    Line looks like: "iRMS 1.297"

    Args:
        line: The "iRMS" line.

    Returns:
        dict with the single key 'irms' holding the second token as a float.
    """
    value = line.split()[1]
    return {'irms': float(value)}

def get_lrms(line):
    """Parse the "LRMS" line of a DockQ output file.

    Line looks like: "LRMS 1.027"

    Args:
        line: The "LRMS" line.

    Returns:
        dict with the single key 'lrms' holding the second token as a float.
    """
    value = line.split()[1]
    return {'lrms': float(value)}

def get_dockq(line):
    """Parse the "DockQ" line of a DockQ output file.

    Line looks like: "DockQ 0.421"

    Args:
        line: The "DockQ" line.

    Returns:
        dict with the single key 'dockq' holding the second token as a float.
    """
    value = line.split()[1]
    return {'dockq': float(value)}

def file_info(file_path):
    """Parse one DockQ output text file into a flat dictionary.

    Each line is routed by its prefix to the matching parser and the
    parsers' results are merged; lines with no known prefix are ignored.

    Args:
        file_path: Path of the DockQ output text file.

    Returns:
        dict combining the identifiers from the "Model" line with the
        values from the "Fnat", "Fnonnat", "iRMS", "LRMS" and "DockQ" lines
        that are present in the file.
    """
    # Checked in order; the first marker found anywhere in the "Model" line
    # selects the identifier parser.
    model_parsers = (
        ('pdb_cluster', pdb_cluster_ids),
        ('full_depth_cluster', full_depth_cluster_ids),
        ('pdb_full_depth', pdb_full_depth_ids),
        ('autoinhibitory', autoinhibitory_pdb_full_depth),
    )
    # Line prefix -> score parser, checked in order.
    score_parsers = (
        ('Fnat', get_fnat),
        ('Fnonnat', get_fnonnat),
        ('iRMS', get_irms),
        ('LRMS', get_lrms),
        ('DockQ', get_dockq),
    )

    with open(file_path, 'r') as handle:
        contents = handle.readlines()

    parsed = {}
    for row in contents:
        if row.startswith('Model'):
            # Falls back to pdb_cluster_ids when no marker matches.
            parser = next(
                (func for marker, func in model_parsers if marker in row),
                pdb_cluster_ids,
            )
            parsed.update(parser(row))
            continue

        for prefix, parser in score_parsers:
            if row.startswith(prefix):
                parsed.update(parser(row))
                break

    return parsed

def capri_class(score):
    """Map a DockQ score to its quality class label.

    Args:
        score: DockQ score.

    Returns:
        'High' for score >= 0.80, 'Medium' for 0.49 <= score < 0.80,
        'Acceptable' for 0.23 <= score < 0.49, otherwise 'Incorrect'
        (which also covers NaN, since NaN fails every comparison).
    """
    # Thresholds are tested from highest to lowest, so each check only
    # needs its lower bound.
    if score >= 0.80:
        return 'High'
    if score >= 0.49:
        return 'Medium'
    if score >= 0.23:
        return 'Acceptable'
    return 'Incorrect'

def get_info(fp):
    """Collect the DockQ results of every ``.txt`` file in a directory.

    Args:
        fp: Directory containing DockQ output text files.

    Returns:
        pandas.DataFrame with one row per ``.txt`` file, in file-name order.
        The columns are whatever ``file_info`` extracted, plus a 'capri'
        column derived from 'dockq'. If nothing was parsed, the frame is
        empty but still has the 'dockq' and 'capri' columns.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and any CSV written from it) reproducible between runs.
    all_info = [
        file_info(os.path.join(fp, file))
        for file in sorted(os.listdir(fp))
        if file.endswith('.txt')
    ]

    df = pd.DataFrame(all_info)

    # With no .txt files, or when no file contained a DockQ line, the
    # 'dockq' column is absent and the lookup below would raise KeyError.
    if 'dockq' not in df.columns:
        df['dockq'] = float('nan')

    # Assign capri scores
    df['capri'] = df['dockq'].apply(capri_class)

    return df

In [22]:
# Directory that holds one sub-directory of DockQ text output per comparison.
results_path = './project_pipeline/data/output/dockq/'

# Sub-directory names, one per comparison.
fp_list = [
    'ai_full_depth_cluster',
    'ai_pdb_cluster',
    'ai_pdb_full_depth',
    'md_full_depth_cluster',
    'md_pdb_cluster',
    'md_pdb_full_depth',
]
ai_fd_cl, ai_pdb_cl, ai_pdb_fd, md_fd_cl, md_pdb_cl, md_pdb_fd = fp_list

# Only the ai_pdb_cluster comparison is processed here. To regenerate every
# CSV instead, run the two statements below once for each entry of fp_list.
df = get_info(os.path.join(results_path, ai_pdb_cl))
df.to_csv(f'./project_pipeline/data/{ai_pdb_cl}_dockq.csv', index=False)