# Imports and Setup

In [5]:
import os
from pprint import pprint
import json
import random
import numpy as np
from helpers.helper import get_cath

from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Seq import Seq
from Bio import SeqIO

import requests
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import shutil

from scipy.stats import ttest_ind

cath = get_cath()


In [6]:
# read the reverse mapping
with open('../data/sword2/SWORD2/misc/reverse_mappings_compact.json') as json_file:
    pdb_uniprot_mappings_reverse = json.load(json_file)

# Functions

## SWORD parsing

In [7]:
def get_sword2(code, version, verb=False):
    file = f"../data/sword2/SWORD2/results/{version}/{code}/{code}_A/sword.txt"
    with open(file, "r") as f:
        data = {}
        lines = f.readlines()
        option = 0
        for i, line in enumerate(lines):
            lines[i] = "".join([c for c in line if c not in ["\n",'']])
            if line != "\n":
                if not line.startswith(("PDB:", "#D", "A")):
                    res = lines[i].split("|")
                    boundaries = res[2]
                    domains = boundaries.strip().split(" ")
                    data[f"option{option}"] = {}
                    for j in range(len(domains)):
                        data[f"option{option}"][str(j+1)] = domains[j]
                    option += 1
    verb and pprint(data)
    return data

## Metrics

In [36]:

def dbd_score(y_pred, y_true, margin=20):
    scores = []
    for i in range(len(y_pred)):
        window = y_true[max(0, i-margin):min(len(y_true), i+margin+1)]
        indices_window = list(range(max(0, i-margin), min(len(y_true), i+margin+1)))
        if y_pred[i] == 1.0:
            if 1.0 in window:
                # if it's within the window, calculate the score
                pos = np.where(window == 1.0)[0][0]
                j = indices_window[pos]
                diff = abs(i - j)
                k = 0 if diff == 0 else 1
                score = ((margin - diff) + k) / margin
            else:
                # false positive
                score = 0
            scores.append(score)

    number_of_true_boundaries = np.sum(y_true)
    number_of_pred_boundaries = np.sum(y_pred)
    max_len = max(number_of_true_boundaries,number_of_pred_boundaries)
    if max_len == 0:
        return 1.0

    return np.sum(scores) / max_len


def observations(y_pred, y_true, margin):
    tp = 0
    tn = 0
    fp = 0
    fn = 0

    dbd = dbd_score(y_pred, y_true, margin)
    for i in range(len(y_pred)):
        window = y_true[max(0, i-margin):min(len(y_true), i+margin+1)]
        indices_window = list(range(max(0, i-margin), min(len(y_true), i+margin+1)))
        if y_pred[i] == 1.0:
            if 1.0 in window:
                pos = np.where(window == 1.0)[0][0]
                j = indices_window[pos]
                y_true[j] = 0.0
                tp += 1
            else:
                fp += 1


        elif y_pred[i] == 0.0:
            if  y_true[i] == 1.0:
                fn += 1
            else:
                tn += 1

    return (tp, tn, fp, fn)


def metrics(y_pred, y_true, margin=20):
    tp, tn, fp, fn = observations(y_pred, y_true, margin)

    accuracy = (tn + tp) / (tn + tp + fn + fp) if (tn + tp + fn + fp) else 0

    precision = tp / (tp + fp) if (tp + fp) else 0

    recall = tp / (tp + fn) if (tp + fn) else 0

    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0

    mcc_num = (tp * tn) - (fp * fn)
    mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    mcc = mcc_num / mcc_den if mcc_den else 0

    dbd = dbd_score(y_pred, y_true, margin)

    return (accuracy, precision, recall, f1, mcc, dbd)

## Other

## Boundaries

In [9]:
def boundaries2(len_seq, domain, discontinuity_delimiter):
	"""
		Defines a boundary as the beginning of a domain ONLY in multi-domain proteins
	"""
	first_start = np.inf
	bounds = np.zeros((len_seq), dtype=np.int8)
	for k, v in domain.items():
		boundary_positions = v.split(discontinuity_delimiter)
		for b in boundary_positions:
			start, end = [int(i) for i in b.split('-')]
			if start < first_start:
				first_start = start
			bounds[start-1] = 1
	bounds[first_start-1] = 0            
	return np.array(bounds, dtype=np.bool_)

In [16]:
def get_af_chain(code):
	file_path = f"../data/sword2/SWORD2/misc/af_pdbs/AF-{code}-F1-model_v3.pdb"
	chains = {record.id: record.seq for record in SeqIO.parse(file_path, 'pdb-seqres')}
	a_chain_uniprot_seq = chains['XXXX:A']
	return a_chain_uniprot_seq


def get_pdb_chain(code):
	pdb_file_path = f"../data/pdb/bulk/balanced/backup/data/{code}.pdb"
	pdb_chains = {record.id: record.seq for record in SeqIO.parse(pdb_file_path, 'pdb-seqres')}

	for key in pdb_chains.keys():
		if key[-1] == 'A':
			a_chain_pdb_seq = pdb_chains[key]
			return a_chain_pdb_seq

# Analysis

In [39]:
path = '../data/sword2/SWORD2/results/af'

uniprots = os.listdir(path)
# print(uniprots)
# print()

pdb_mcc = []
af_mcc = []

for i, id in enumerate(uniprots):

    pdb = pdb_uniprot_mappings_reverse[id]
    a_chain_uniprot_seq = get_af_chain(id)
    a_chain_pdb_seq = get_pdb_chain(pdb)
    chain_len = None
    if len(a_chain_pdb_seq) != len(a_chain_uniprot_seq):
        raise ValueError("Different sequence lengths is not expected")
    else:
        chain_len = len(a_chain_pdb_seq)
    
    baseline = cath[pdb]['A']
    af_sword_results = get_sword2(id, 'af', verb=False)
    try:
        pdb_sword_results = get_sword2(pdb, 'pdb', verb=False)
    except FileNotFoundError:
        print("File not found", id, pdb)
        continue


    margin = 20
    baseline_boundaries = boundaries2(len(a_chain_pdb_seq), baseline, ',').astype(int)
    pdb_mccs = []
    af_mccs = []
    pdb_dbds = []
    af_dbds = []

    for option, domain in pdb_sword_results.items():
        pdb_sword_boundaries = boundaries2(chain_len, domain, ';').astype(int)
        pdb_sword_metrics = metrics(pdb_sword_boundaries, baseline_boundaries, margin)
        pdb_sword_mcc = pdb_sword_metrics[-2]
        pdb_sword_dbd = pdb_sword_metrics[-1]
        pdb_mccs.append(pdb_sword_mcc)
        pdb_dbds.append(pdb_sword_dbd)

    for option, domain in af_sword_results.items():
        af_sword_boundaries = boundaries2(chain_len, domain, ';').astype(int)
        af_sword_metrics = metrics(af_sword_boundaries, baseline_boundaries, margin)
        af_sword_mcc = af_sword_metrics[-2]
        af_sword_dbd = af_sword_metrics[-1]
        af_mccs.append(af_sword_mcc)
        af_dbds.append(af_sword_dbd)

    best_pdb_i = np.argmax(pdb_mccs)
    best_af_i = np.argmax(af_mccs)

    pdb_mcc.append(pdb_mccs[best_pdb_i])
    af_mcc.append(af_mccs[best_af_i])

    if (i + 1) % 50 == 0:
        print(f"[{i + 1}/{len(uniprots)}]")
print(f"[{i + 1}/{len(uniprots)}]")


[50/443]
[100/443]
[150/443]
[200/443]
[250/443]
File not found Q9YDZ4 2hls
[300/443]
[350/443]
File not found P38505 1jfj
[400/443]
[443/443]


In [40]:
t_stat, p_val = ttest_ind(pdb_mcc, af_mcc)

print("Mean PDB MCC:", np.mean(pdb_mcc))
print("Mean AF MCC:", np.mean(af_mcc))

print(t_stat, float(p_val))

Mean PDB MCC: 0.6210332441611623
Mean AF MCC: 0.13217732495763904
27.3760386666944 7.451803918104624e-120


Mean PDB MCC: 0.7312201785001735

Mean AF MCC: 0.04587217486107879

56.20185031552613 2.0480532531702836e-293