# Download CASP data

This notebook downloads all raw protein data from the CASP website.

Each CASP edition is downloaded into `./CASP*` which is a symlink to `../data/CASP*`.

In [1]:
%matplotlib agg
import io
import os
import re
import time
import json
import pickle
import hashlib
import tarfile
import requests
import tempfile
import warnings
import functools
import contextlib
import subprocess
from pathlib import Path

import bs4
import docker
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation
import natsort as ns
import tqdm.notebook as tqdm

import Bio.PDB
import Bio.SeqIO
import Bio.Align.AlignInfo
import Bio.AlignIO
import Bio.Alphabet

from loguru import logger
from joblib import Parallel, delayed
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import display, Markdown, HTML, Video

from graphqa.data.aminoacids import *
from graphqa.data.decoys import ca_coord_and_orientation
import casp13_secret

@functools.lru_cache(maxsize=128)
def requests_get(url):
    return requests.get(url)

Documentation of CASP download area for CASP 11

In [2]:
response = requests_get('https://predictioncenter.org/download_area/README')
readme = response.text.splitlines()
i = readme.index('CASP11')
print(*readme[i: i+12], sep='\n')

CASP11
   SUMMARY_TABLES	    - text version of the result tables (per target and archived)
   extra_experiments	    - input data (selected contacts) for Tp, Tc, Ts and Tx contact-assisted categories) 
				and starting models for the TR refinement category
   predictions              - all predictions (DR, FN, QA, RR, 3D)
   predictions_trimmed_to_domains - all tertiary structure (TS) predictions trimmed according to 
			       the official domain definitions
   results_LGA_sda          - results (complete data and summaries) of sequence-dependent LGA analysis for all models
   results_LGA_sia          - results (complete data and summaries) of sequence-independent LGA analysis for all models
   server_predictions	    - 3D-structure predictions from servers
   targets                  - target sequences; preprocessed target structures
   templates		    - results of LGA sequence-independent comparison of the best 25 templates 


## Primary structure

Download all sequences from the CASP website and save them as `CASP*/sequences.fasta`.

Metadata about all sequences is saved in [`sequences.csv`](./sequences.csv).

In [3]:
regex_name = re.compile(r'(T|H)\d\d\d\d')
regex_subunit = re.compile(r'\bsubunit\s+(\d+)')

def is_qa_target(target):
    return bool(regex_name.fullmatch(target.name))

def rename(target):
    original_name = target.name
    match = regex_name.fullmatch(target.name)
    if match.groups()[0] == 'H':
        subunit = regex_subunit.search(target.description).groups()[0]
        casp_id = target.name.replace('H', 'T') + 's' + subunit
        target.description = (
            target.description.replace(target.name, casp_id, 1)
            + f' (original target name {original_name})'
        )
        target.name = casp_id
        target.id = casp_id
    return target, original_name

df_sequences = []
for casp_ed in [9, 10, 11, 12, 13]:
    dest = Path(f'CASP{casp_ed}/sequences.fasta')
    dest.parent.mkdir(exist_ok=True, parents=True)
    
    response = requests_get(f'https://predictioncenter.org/download_area/'
                            f'CASP{casp_ed}/sequences/casp{casp_ed}.seq.txt')
    sequences = Bio.SeqIO.parse(io.StringIO(response.text), format='fasta', 
                                alphabet=Bio.Alphabet.ProteinAlphabet())
    sequences = filter(is_qa_target, sequences)
    sequences = map(rename, sequences)
    
    with dest.open('w') as f:
        for seq, original_name in sequences:
            df_sequences.append({
                'casp_ed': casp_ed,
                'target_id': seq.id,
                'target_id_orig': original_name,
                'length': len(seq),
            })
            Bio.SeqIO.write(seq, f, format='fasta')

df_sequences = pd.DataFrame(df_sequences)
df_sequences.to_csv('sequences.csv', index=False)
df_sequences.groupby('casp_ed').size().rename_axis('Edition').to_frame('Targets')

Unnamed: 0_level_0,Targets
Edition,Unnamed: 1_level_1
9,129
10,115
11,105
12,90
13,90


In some cases the same protein (`target_id_orig`) corresponds to more targets (`target_id`):

In [4]:
df_sequences.query('target_id != target_id_orig')

Unnamed: 0,casp_ed,target_id,target_id_orig,length
439,13,T0953s1,H0953,72
440,13,T0953s2,H0953,249
441,13,T0957s1,H0957,163
442,13,T0957s2,H0957,164
443,13,T0968s1,H0968,126
444,13,T0968s2,H0968,116
445,13,T0974s1,H0974,72
446,13,T0974s2,H0974,95
447,13,T0980s1,H0980,111
448,13,T0980s2,H0980,52


## Tertiary structure

### Native structures

#### Downloaded from the CASP download area

These native structures are downloaded from the CASP download area.

Metadata about these structures is saved in [`natives_casp.csv`](./natives_casp.csv).

- all names start with `T` and end with an optional subunit `s`
- some of the published primary sequences don't have an official native structure (maybe canceled?)

In [5]:
parser = Bio.PDB.PDBParser(QUIET=True)
df_natives = []
# Match files like T0759s1.pdb but not T0759-D1.pdb
regex = re.compile(r'T\d\d\d\d(?:s\d)?.pdb')

native_urls = {
    9: 'https://predictioncenter.org/download_area/CASP9/targets/casp9.targ_unsplit.tgz',
    10: 'https://predictioncenter.org/download_area/CASP10/targets/casp10.targets_unsplitted.noT0695T0739.tgz',
    11: 'https://predictioncenter.org/download_area/CASP11/targets/casp11.targets_unsplitted.release11242014.tgz',
    12: 'https://predictioncenter.org/download_area/CASP12/targets/casp12.targets_T0.releaseDec022016.tgz',
    13: 'https://predictioncenter.org/download_area/CASP13/targets/casp13.targets.T.4public.tar.gz',
}

for casp_ed, url in native_urls.items():
    dest = Path(f'CASP{casp_ed}') / 'native'
    dest.mkdir(exist_ok=True, parents=True)
    ! curl -s {url} | tar xz --directory {dest.as_posix()}
    
    if casp_ed == 13:
        ! echo '--user {casp13_secret.user}:{casp13_secret.pwd}' | curl -s --config - {casp13_secret.url} | tar xz --directory {dest}

    for f in dest.iterdir():
        if not regex.fullmatch(f.name):
            f.unlink()
            continue
        target_id = f.with_suffix('').name
        structure = parser.get_structure(target_id, f)
        df_natives.append({
            'casp_ed': casp_ed,
            'target_id': target_id,
            'chains': len(list(structure.get_chains())),
            'residues': len(list(structure.get_residues())),
            'atoms': len(list(structure.get_atoms())),
        })
        
df_natives = pd.DataFrame(df_natives).sort_values(['casp_ed', 'target_id'])
df_natives.to_csv('natives_casp.csv', index=False)
df_natives.groupby('casp_ed').size().rename_axis('Edition').to_frame('Targets')

Unnamed: 0_level_0,Targets
Edition,Unnamed: 1_level_1
9,117
10,103
11,85
12,40
13,82


A full outer join between the sequences in `.fasta` format 
and the sequences found in the native's `.pdb` file
shows the some discrepancies.

In the following:
- **left:**  primary sequences published on the CASP website
- **right:** native structures published on the CASP website

In [6]:
df_merge = pd.merge(
    df_sequences, 
    df_natives.drop(columns=['chains', 'atoms']), 
    on=['casp_ed', 'target_id'], how='outer', indicator=True
)

# Some targets are present in the .fasta files but absent from the native .pdb files
display(
    df_merge['_merge']
    .value_counts()
    .to_frame('Outer join counts')
)
display(df_merge.query("_merge!='both'"))

# Some targets have different lenghts in the .fasta and .pdb files
display(
    (df_merge['length'] == df_merge['residues'])
    .value_counts()
    .to_frame('Same length?')
)
display(df_merge.query('length != residues and _merge=="both"'))

del df_merge

Unnamed: 0,Outer join counts
both,431
left_only,98
right_only,0


Unnamed: 0,casp_ed,target_id,target_id_orig,length,residues,_merge
4,9,T0519,T0519,180,,left_only
20,9,T0535,T0535,294,,left_only
31,9,T0546,T0546,134,,left_only
39,9,T0554,T0554,135,,left_only
41,9,T0556,T0556,73,,left_only
...,...,...,...,...,...,...
470,13,T0952,T0952,35,,left_only
473,13,T0956,T0956,178,,left_only
487,13,T0972,T0972,106,,left_only
518,13,T1007,T1007,149,,left_only


Unnamed: 0,Same length?
False,469
True,60


Unnamed: 0,casp_ed,target_id,target_id_orig,length,residues,_merge
0,9,T0515,T0515,365,348.0,both
1,9,T0516,T0516,229,227.0,both
3,9,T0518,T0518,288,256.0,both
5,9,T0520,T0520,189,173.0,both
6,9,T0521,T0521,179,168.0,both
...,...,...,...,...,...,...
522,13,T1011,T1011,534,444.0,both
524,13,T1013,T1013,537,290.0,both
525,13,T1014,T1014,276,268.0,both
526,13,T1016,T1016,203,202.0,both


#### Downloaded from the Protein Data Bank

These native structures are downloaded from the Protein Data Bank using 
[the mapping from CASP id to PDB code can](https://predictioncenter.org/casp11/targetlist.cgi?view_targets=all).

Metadata about these structures is saved in [`natives_pdb.csv`](./natives_pdb.csv).

- Some targets are listed using `H????` names and some others using `T????[s?]` names, we only download the latter
- Some structures downloaded from PDB actually have more than one model in the same `.pdb` file (but CASP only cares about one, right?)

In [7]:
# Match names like T0759s1
regex_name = re.compile(r'T\d\d\d\d(?:s\d)?')
parser = Bio.PDB.PDBParser(QUIET=True)
logger.disable('__main__:extract_targets')
df_natives_pdb = []

def extract_targets(soup):
    for target_row in soup.select('tr.datarow'):
        target_cols = target_row.select('td')
        
        try:
            casp_id = target_cols[1].select_one('a').text.strip()
            if not regex_name.match(casp_id):
                continue
        except Exception:
            msg = re.sub('\s+', ' ', target_cols[1].text).strip()
            logger.exception('Could not parse target id: ' + msg)
            continue
            
        try:
            length = int(target_cols[3].text.strip())
        except Exception:
            msg = re.sub('\s+', ' ', target_cols[3].text).strip()
            logger.exception('Could not parse length: ' + msg)
            continue
                
        try:
            txt = target_cols[-1].text.lower()
            if 'no structure' in txt or 'canceled' in txt:
                continue            
            pdb_code = target_cols[-1].select_one('a').text.strip()
        except Exception:
            msg = msg = re.sub('\s+', ' ', target_cols[-1].text).strip()
            logger.warning('Could not parse PDB code: ' + msg)
            continue
                        
        yield {
            'target_id': casp_id,
            'length': length,
            'pdb_id': pdb_code,
        }
        

def download_native(target_dict, dest):
    if not dest.is_file():
        response = requests.get(f'https://files.rcsb.org/download/{target_dict["pdb_id"]}.pdb')
        with dest.open('w') as f:
            f.write(response.text)

for casp_ed in [9,10,11,12,13]:
    with logger.contextualize(casp=casp_ed):
        dest = Path(f'CASP{casp_ed}') / 'native_pdb'
        dest.mkdir(exist_ok=True, parents=True)

        response = requests_get(f'https://predictioncenter.org/casp{casp_ed}/targetlist.cgi?view_targets=all')
        soup = bs4.BeautifulSoup(response.content)

        for target_dict in extract_targets(soup):
            dest_path = dest / f'{target_dict["target_id"]}.pdb'
            download_native(target_dict, dest_path)
            structure = parser.get_structure(target_dict["target_id"], dest_path)
            df_natives_pdb.append({
                'casp_ed': casp_ed,
                'chains': len(list(structure.get_chains())),
                'residues': len(list(structure.get_residues())),
                'atoms': len(list(structure.get_atoms())),
                **target_dict
            })
        
df_natives_pdb = pd.DataFrame(df_natives_pdb)
df_natives_pdb.to_csv('natives_pdb.csv', index=False)
df_natives_pdb.groupby('casp_ed').size().rename_axis('Edition').to_frame('Targets')



Unnamed: 0_level_0,Targets
Edition,Unnamed: 1_level_1
9,116
10,92
11,82
12,51
13,53


Difference between:
- **left:**  primary sequences published on the CASP website
- **right:** native structures downloaded from PDB

In [8]:
df_merge = pd.merge(
    df_sequences, 
    df_natives_pdb.drop(columns=['chains', 'atoms','length']),
    suffixes=['_fasta', '_casp'],
    on=['casp_ed', 'target_id'],
    how='outer', indicator=True
)

display(
    df_merge['_merge']
    .value_counts()
    .to_frame('Outer join counts')
)
display(df_merge.query("_merge!='both'"))

display(
    (df_merge['length'] == df_merge['residues'])
    .value_counts()
    .to_frame('Same length?')
)
display(df_merge.query('length != residues'))

del df_merge

Unnamed: 0,Outer join counts
both,399
left_only,130
right_only,0


Unnamed: 0,casp_ed,target_id,target_id_orig,length,residues,pdb_id,_merge
4,9,T0519,T0519,180,,,left_only
31,9,T0546,T0546,134,,,left_only
34,9,T0549,T0549,84,,,left_only
39,9,T0554,T0554,135,,,left_only
41,9,T0556,T0556,73,,,left_only
...,...,...,...,...,...,...,...
515,13,T1004,T1004,458,,,left_only
518,13,T1007,T1007,149,,,left_only
523,13,T1012,T1012,199,,,left_only
524,13,T1013,T1013,537,,,left_only


Unnamed: 0,Same length?
False,528
True,1


Unnamed: 0,casp_ed,target_id,target_id_orig,length,residues,pdb_id,_merge
0,9,T0515,T0515,365,757.0,3mt1,both
1,9,T0516,T0516,229,1715.0,3no6,both
2,9,T0517,T0517,159,1149.0,3pnx,both
3,9,T0518,T0518,288,464.0,3nmb,both
4,9,T0519,T0519,180,,,left_only
...,...,...,...,...,...,...,...
524,13,T1013,T1013,537,,,left_only
525,13,T1014,T1014,276,476.0,6qrj,both
526,13,T1016,T1016,203,882.0,6e4b,both
527,13,T1018,T1018,334,1069.0,6n91,both


### Server predictions

These are the tertiary structures as predicted from the servers participating in CASP
([submission file format](https://predictioncenter.org/casp13/index.cgi?page=format#TS)).

The submission happens in two stages, the same target might get different names in the two stages.

Each server can submit up to 5 models for each target, as indicated by the field `MODEL`.

In [9]:
@contextlib.contextmanager
def read_archive(response):
    with io.BytesIO(response.content) as fileobj:
        with tarfile.open(fileobj=fileobj, mode='r') as archive:
            yield archive

df_decoys = {}
for target in df_natives.itertuples():
    dest_dir = Path(f'CASP{target.casp_ed}/decoys/{target.target_id}')    
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Download all decoys for this target, compute md5sum and save them to disk
    response = requests_get(
        f'https://predictioncenter.org/download_area/'
        f'CASP{target.casp_ed}/server_predictions/'
        f'{target.target_id}.3D.srv.tar.gz'
    )
    if response.status_code != 200:
        logger.warning(f'{response.url} {response.status_code}')
        continue
    with read_archive(response) as archive:
        for member in archive.getmembers():
            if not member.isfile():
                continue
            if Path(member.name).is_absolute():
                logger.warning(f'Invalid path in tarfile: {response.url} {member.name}')
                continue

            decoy_id = Path(member.name).with_suffix('').name
            with archive.extractfile(member) as fileobj:
                content = fileobj.read()
            with dest_dir.joinpath(decoy_id).with_suffix('.pdb').open('wb') as out:
                out.write(content)
            df_decoys[hashlib.md5(content).digest()] = {
                'casp_ed': target.casp_ed,
                'target_id': target.target_id,
                'decoy_id': decoy_id,
                'stage1': False,
                'stage2': False,
            }

    if target.casp_ed == 9:
        continue

    # Download all stage1 and stage2 decoys for this target,
    # don't save to disk, just compute md5sum and update the dict
    for s in ['stage1', 'stage2']:
        response = requests_get(
            f'https://predictioncenter.org/download_area/'
            f'CASP{target.casp_ed}/server_predictions/'
            f'{target.target_id}.{s}.3D.srv.tar.gz'
        )
        if response.status_code != 200:
            logger.warning(f'{response.url} {response.status_code}')
            continue
        with read_archive(response) as archive:
            for member in archive.getmembers():
                if not member.isfile():
                    continue
                if Path(member.name).is_absolute():
                    logger.warning(f'Invalid path in tarfile: {response.url} {member.name}')
                    continue
                with archive.extractfile(member) as fileobj:
                    content = fileobj.read()
                try:
                    df_decoys[hashlib.md5(content).digest()][s] = True
                except KeyError:
                    logger.error(f'Could not find stage {s[-1]} decoy {member.name} '
                                 f'among all decoys of CASP{target.casp_ed}/{target.target_id}')
    
df_decoys = pd.DataFrame(df_decoys.values())
df_decoys.to_csv('decoys.csv', index=False)

2020-09-21 02:14:13.490 | ERROR    | __main__:<module>:68 - Could not find stage 1 decoy T0649/server20_TS1 among all decoys of CASP10/T0649
2020-09-21 02:14:53.406 | ERROR    | __main__:<module>:68 - Could not find stage 2 decoy T0653/Distill_TS1 among all decoys of CASP10/T0653
2020-09-21 02:14:53.408 | ERROR    | __main__:<module>:68 - Could not find stage 2 decoy T0653/Distill_TS2 among all decoys of CASP10/T0653
2020-09-21 02:14:53.410 | ERROR    | __main__:<module>:68 - Could not find stage 2 decoy T0653/Distill_TS3 among all decoys of CASP10/T0653
2020-09-21 02:14:53.413 | ERROR    | __main__:<module>:68 - Could not find stage 2 decoy T0653/Distill_TS4 among all decoys of CASP10/T0653
2020-09-21 02:14:53.415 | ERROR    | __main__:<module>:68 - Could not find stage 2 decoy T0653/Distill_TS5 among all decoys of CASP10/T0653
2020-09-21 02:14:53.417 | ERROR    | __main__:<module>:68 - Could not find stage 2 decoy T0653/Distill_roll_TS1 among all decoys of CASP10/T0653
2020-09-21 02:

In [10]:
! du -shc CASP*/decoys

df_decoys = pd.read_csv('decoys.csv')
display(
    df_decoys.groupby('casp_ed')
    .agg({'target_id': 'nunique', 'decoy_id': 'size', 'stage1': 'sum', 'stage2': 'sum'})
    .rename(columns={'target_id': 'Unique targets', 'decoy_id': 'Total decoys', 
                     'stage1': 'Decoys in stage 1', 'stage2': 'Decoys in stage 2'})
    .rename_axis('Edition')
    .astype(int)
)
display(
    df_decoys.groupby(['casp_ed', 'target_id'])
        .agg({'stage1': 'sum', 'stage2': 'sum', 'decoy_id': 'size'})
        .rename(columns={'decoy_id': 'Total', 'stage1': 'Stage 1', 'stage2': 'Stage 2'})
        .rename_axis(['Edition', 'Target'])
        .astype(int)
)

3.7G	CASP10/decoys
2.5G	CASP11/decoys
1.2G	CASP12/decoys
2.6G	CASP13/decoys
4.8G	CASP9/decoys
15G	total


Unnamed: 0_level_0,Unique targets,Total decoys,Decoys in stage 1,Decoys in stage 2
Edition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,117,33737,0,0
10,103,24526,2050,14551
11,85,16012,1680,12448
12,40,6689,800,5791
13,82,14621,1640,12139


Unnamed: 0_level_0,Unnamed: 1_level_0,Stage 1,Stage 2,Total
Edition,Target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,T0515,0,0,286
9,T0516,0,0,303
9,T0517,0,0,280
9,T0518,0,0,279
9,T0520,0,0,305
...,...,...,...,...
13,T1021s1,20,148,183
13,T1021s2,20,148,174
13,T1021s3,20,149,179
13,T1022s1,20,149,177


### Visualization

Focus on `CASP11/T0759` and its decoy `T0759/3D-Jigsaw-V5_1_TS1` as an example.

For `CASP11/T0759` we have the following decoys:

In [11]:
display(
    df_decoys.query('target_id == "T0759"')
    .agg({'stage1': 'sum', 'stage2': 'sum', 'decoy_id': 'size'})
    .rename({'decoy_id': 'Total', 'stage1': 'Stage 1', 'stage2': 'Stage 2'})
    .to_frame('T0759')
    .transpose()
)

Unnamed: 0,Stage 1,Stage 2,Total
T0759,20,150,179


Draw the coordinates of a residue's carbon alpha and direction `CA->CB`.
If the residue is `GLY`, the atom `CB` is "virtual".

<img src="https://vignette.wikia.nocookie.net/foldit/images/8/86/Backbone_overview.stickpolarh.png/revision/latest?cb=20180101214816" style="background-color:white;width:15%;"/>

Native structure:

In [12]:
parser = Bio.PDB.PDBParser(QUIET=True)
model = parser.get_structure('T0759', 'CASP11/native/T0759.pdb')[0]

fig = plt.figure(figsize=(12, 6), facecolor='white', tight_layout=True)
ax = fig.add_subplot(1, 1, 1, projection='3d')

for chain_idx, chain in enumerate(model):
    aa = [r.get_resname() for r in chain if Bio.PDB.is_aa(r)]
    ca, orient = zip(*(ca_coord_and_orientation(r) for r in chain if Bio.PDB.is_aa(r)))
    ca = np.stack(ca, axis=0)    
    ax.plot(*ca.T, color=plt.get_cmap('tab10')(chain_idx))
    for ca, aa, orient in zip(ca, aa, orient):
        ax.plot(*zip(ca.T, (ca+orient).T), linewidth=3, color=plt.get_cmap('tab20')(aa_3_mapping[aa]))

animate = lambda i: ax.view_init(30, i)
ani = matplotlib.animation.FuncAnimation(fig, animate, frames=10 + 20 * np.sin(np.linspace(0, np.pi, num=100)), interval=50)
ani.save('T0759.mp4')
display(Video('T0759.mp4', embed=False))
plt.close(fig)

Decoy structure:

In [13]:
regex_target = re.compile(r'TARGET\s+(T\d\d\d\d(:?s\d+)?)')
regex_model = re.compile(r'MODEL\s+(\d+)')

with open('CASP11/decoys/T0759/3D-Jigsaw-V5_1_TS1.pdb') as f:
    # Print the first lines of the pdb file
    print(*f.readlines()[:9], sep='')

    # The parser does not recognize TARGET and MODEL fields automatically
    f.seek(0)
    f.readline()
    target_id = regex_target.match(f.readline()).groups()[0]
    model_id = regex_model.match(f.readline()).groups()[0]

    # Parse the structure
    f.seek(0)
    structure = parser.get_structure(f'T0759/3D-Jigsaw-V5_1_TS1', f)


# Print parsed structure
for model in structure:
    print(f'Model {model.get_full_id()} ({len(model)} chains)')
    for chain in model:
        print(f'  Chain {chain.get_full_id()} ({len(chain)} residues)')
        chain = list(chain)
        for residue in chain[:5]:
            print(f'   {residue.get_id()[1]:>3} {residue.get_resname()}  {len(residue):>2} atoms')
        print('   ...')
        for residue in chain[-5:]:
            print(f'   {residue.get_id()[1]:>3} {residue.get_resname()}  {len(residue):>2} atoms')
            

# Show animation
fig = plt.figure(figsize=(12, 6), facecolor='white', tight_layout=True)
ax = fig.add_subplot(1, 1, 1, projection='3d')

model = structure[0]
for chain_idx, chain in enumerate(model):
    aa = [r.get_resname() for r in chain]
    ca, orient = zip(*(ca_coord_and_orientation(r) for r in chain))
    ca = np.stack(ca, axis=0)    
    ax.plot(*ca.T, c='gray')
    for ca, aa, orient in zip(ca, aa, orient):
        ax.plot(*zip(ca.T, (ca+orient).T), linewidth=3, color=plt.get_cmap('tab20')(aa_3_mapping[aa]))

animate = lambda i: ax.view_init(30, i)
ani = matplotlib.animation.FuncAnimation(fig, animate, frames=10 + 20 * np.sin(np.linspace(0, np.pi, num=100)), interval=50)
ani.save('T0759_3D-Jigsaw-V5_1_TS1.mp4')
display(Video('T0759_3D-Jigsaw-V5_1_TS1.mp4', embed=False))
plt.close(fig)

PFRMAT TS
TARGET T0759
MODEL  1
PARENT N/A
ATOM      1  N   HIS    10      -0.109  -0.003   0.008  1.00  0.00           N  
ATOM      2  CA  HIS    10       0.018   0.008   1.433  1.00  0.00           C  
ATOM      3  C   HIS    10       0.057   1.463   1.816  1.00  0.00           C  
ATOM      4  O   HIS    10       0.617   2.274   1.082  1.00  0.00           O  
ATOM      5  CB  HIS    10       1.267  -0.762   1.866  1.00  0.00           C  

Model ('T0759/3D-Jigsaw-V5_1_TS1', 0) (1 chains)
  Chain ('T0759/3D-Jigsaw-V5_1_TS1', 0, ' ') (100 residues)
    10 HIS  10 atoms
    11 MET   8 atoms
    12 VAL   7 atoms
    13 VAL   7 atoms
    14 ILE   8 atoms
   ...
   105 VAL   7 atoms
   106 SER   6 atoms
   107 GLY   4 atoms
   108 GLN   9 atoms
   109 LYS   9 atoms


## Official QA global scores (summary tables)

Summary tables contain official QA scores computed by CASP by comparison with the native structure for all tertiary structure predictions from all participants.
QA metrics are only computed for global scores.

A unique model `ACCESSION CODE` is composed from the number of the target, prediction format category, prediction group number, and model index. 
Example:
```
Accession code  T0444TS005_2  has the following components:
 T0044   target number
 TS      Tertiary Structure (PFRMAT TS)
 005     prediction group 5
 2       model index 2 
```

Summary tables are a bit different in each CASP edition.

For each edition, the table is saved as `CASP*/QA_official/table.pkl.xz`

In [14]:
# Match files like T0759s1.txt but not T0759-D1.txt
regex = re.compile(r'T\d\d\d\d(?:s\d)?.txt')

### CASP 9
Single table with all targets and decoys

In [15]:
url = 'https://predictioncenter.org/download_area/CASP9/refinement_result_tables_assessor.txt'
response = requests_get(url)

df = pd.read_csv(io.BytesIO(response.content), sep='\t')
df.rename(columns={c: c.strip() for c in df.columns}, inplace=True)
df.rename(columns={'GDT-HA': 'GDT_HA', 'GDC-SC': 'GDT_SC'}, inplace=True)
df['Exclude(missing too many atoms)'] = (
    df['Exclude(missing too many atoms)']
    .str.strip()
    .map({'True': True, 'False': False})
)
df['Target'] = df['Target'].map('T{:04d}'.format)
df['Model'] = df['Model'].astype(str)
df.set_index(['Target', 'Group', 'Model'], inplace=True)
df.sort_index(inplace=True)

Path('CASP9/QA_official').mkdir(exist_ok=True, parents=True)
df.to_pickle('CASP9/QA_official/table.pkl.xz')

print('Unique targets:', len(df.index.unique(level='Target')))
print('Unique groups:', len(df.index.unique(level='Group')))
df

Unique targets: 14
Unique groups: 34


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,GDT_HA,RMSD,GDT_SC,SphGr,MolProb,Exclude(missing too many atoms)
Target,Group,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
T0517,1,1,48.43,6.725,28.31,46.346,1.344,False
T0517,1,2,49.05,7.495,24.92,47.604,1.123,False
T0517,1,3,48.90,7.624,31.18,46.375,1.010,False
T0517,1,4,46.54,7.446,26.73,46.346,1.246,False
T0517,1,5,49.37,8.490,32.17,47.604,0.928,False
...,...,...,...,...,...,...,...,...
T0624,484,1,34.06,5.174,15.26,34.783,3.004,False
T0624,484,2,36.59,5.502,13.38,46.377,2.634,False
T0624,484,3,38.05,4.473,16.36,40.580,2.570,False
T0624,484,4,35.15,5.327,13.35,46.377,2.840,False


### CASP 10
Single .tar.gz file with separate .txt files inside

In [16]:
url = 'https://predictioncenter.org/download_area/CASP10/SUMMARY_TABLES/T0_all.tar.gz'
response = requests_get(url)
archive = tarfile.open(fileobj=io.BytesIO(response.content), mode='r')

dfs = {}
for member in archive.getmembers():
    if member.isfile() and regex.fullmatch(member.name):
        f = archive.extractfile(member)
        df = pd.read_csv(f, sep='\s+')               
        dfs[member.name] = df
df = pd.concat(dfs.values(), axis=0)

# Split accession codes
split = df['Model'].str.split('TS', expand=True)
df['Target'] = split[0]
split = split[1].str.split('_', expand=True, n=1)
df['Group'] = split[0].astype(int)
df['Model'] = split[1]
df.set_index(['Target', 'Group', 'Model'], inplace=True)
df.sort_index(inplace=True)

Path('CASP10/QA_official').mkdir(exist_ok=True, parents=True)
df.to_pickle('CASP10/QA_official/table.pkl.xz')

print('Unique targets:', len(df.index.unique(level='Target')))
print('Unique groups:', len(df.index.unique(level='Group')))
df

Unique targets: 96
Unique groups: 150


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,#,GR#,GDT_TS,NP_P,RANK,Z-M1-GDT,Z-M1s-GDT,Z-MA-GDT,Z-MAs-GDT,GDT_HA,...,Z-Score[D],Al.Res.,RMSD[D],ProSA_Z-Score,MolPrb_Score,LDDT,SphGr,CAD_AA,RPF,SphGr_A
Target,Group,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
T0644,24,1,255,024,75.36,97.16,65,0.51,,0.58,,55.32,...,18.2,134.0,2.0,-7.34,2.27,0.62,53.90,0.59,0.67,53.85
T0644,24,2,477,024,29.25,93.62,94,,,-1.38,,17.02,...,6.6,69.0,3.1,-2.38,2.50,0.33,12.06,0.41,0.31,12.06
T0644,24,3,282,024,73.76,97.87,53,,,0.51,,53.19,...,17.9,132.0,1.6,-7.29,2.58,0.61,53.19,0.59,0.66,53.14
T0644,26,1,131,026,80.85,100.00,31,0.75,,0.81,,63.12,...,18.3,136.0,2.0,-6.99,3.18,0.68,64.54,0.62,0.71,64.47
T0644,26,2,189,026,78.19,100.00,39,,,0.69,,62.06,...,19.9,138.0,2.5,-5.91,3.57,0.68,65.25,0.60,0.73,65.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T0758,498,1,205,498s,65.50,100.00,55,-0.36,-0.36,-0.37,-0.37,43.92,...,38.5,348.0,2.7,-9.55,3.46,0.62,63.93,0.61,0.67,63.93
T0758,498,2,171,498s,67.62,100.00,38,,,0.04,0.04,46.58,...,39.2,349.0,2.6,-9.84,3.19,0.62,65.85,0.62,0.69,65.85
T0758,498,3,15,498s,70.22,100.00,4,,,0.54,0.54,48.98,...,39.8,355.0,2.5,-9.90,3.56,0.64,69.67,0.62,0.71,69.67
T0758,498,4,203,498s,65.57,100.00,34,,,-0.36,-0.36,43.58,...,39.0,351.0,2.5,-10.13,3.34,0.61,65.30,0.60,0.68,65.30


### CASP 11
Single .tar.gz file with separate .txt files inside, 
but also has some additional targets that are not in the .tar.gz file

In [17]:
url = 'https://predictioncenter.org/download_area/CASP11/SUMMARY_TABLES/T0xxx_09.05.tar.gz'
response = requests_get(url)
archive = tarfile.open(fileobj=io.BytesIO(response.content), mode='r')

dfs = {}
for member in archive.getmembers():
    if member.isfile() and regex.fullmatch(member.name):
        f = archive.extractfile(member)
        df = pd.read_csv(f, sep='\s+')
        dfs[member.name] = df
        
# Additional results that in the .tar.gz file are missing or outdated
url = 'https://predictioncenter.org/download_area/CASP11/SUMMARY_TABLES/'
for f in ['T0774.txt', 'T0812.txt', 'T0837.txt', 
          'T0840.txt', 'T0841.txt', 'T0851.txt']:
    response = requests_get(url + f)
    df = pd.read_csv(io.BytesIO(response.content), sep='\s+')
    dfs[f] = df
    
df = pd.concat(dfs.values(), axis=0)

# Split accession codes
split = df['Model'].str.split('TS', expand=True)
df['Target'] = split[0]
split = split[1].str.split('_', expand=True, n=1)
df['Group'] = split[0].astype(int)
df['Model'] = split[1]
df.set_index(['Target', 'Group', 'Model'], inplace=True)
df.sort_index(inplace=True)

Path('CASP11/QA_official').mkdir(exist_ok=True, parents=True)
df.to_pickle('CASP11/QA_official/table.pkl.xz')

print('Unique targets:', len(df.index.unique(level='Target')))
print('Unique groups:', len(df.index.unique(level='Group')))
df

Unique targets: 93
Unique groups: 143


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,#,GR#,GDT_TS,NP_P,RANK,Z-M1-GDT,Z-M1s-GDT,Z-MA-GDT,Z-MAs-GDT,GDT_HA,...,CODM,DFM,Handed.,SOV,CE,QCS,CONTS,TMscore,Dali(raw),FlexE
Target,Group,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
T0759,6,1,499,006,18.75,100.0,499,-2.29,,-2.26,,11.46,...,0.47,1.52,0.49,37.3,2.30,35.55,45.89,0.33,,36.48
T0759,6,2,475,006,21.35,100.0,473,,,-1.90,,12.50,...,0.42,1.91,0.40,23.2,2.30,31.00,44.13,0.25,,52.01
T0759,6,3,482,006,20.83,100.0,480,,,-1.97,,12.24,...,0.37,1.70,0.41,35.1,2.30,32.40,43.06,0.26,,65.66
T0759,6,4,503,006,18.23,100.0,502,,,-2.33,,12.50,...,0.43,1.73,0.49,36.2,2.58,32.87,46.09,0.24,,43.48
T0759,6,5,507,006,17.71,100.0,506,,,-2.40,,11.46,...,0.26,1.71,0.54,36.9,1.64,28.65,42.86,0.27,,48.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T0858,499,1,51,499s,75.22,100.0,51,0.31,0.31,0.39,0.39,54.16,...,0.95,0.20,0.94,80.1,7.84,90.28,81.19,0.92,6868.4,5.28
T0858,499,2,63,499s,74.72,100.0,63,,,0.35,0.35,53.50,...,0.95,0.20,0.94,77.7,7.84,90.04,81.42,0.92,6911.2,4.79
T0858,499,3,84,499s,73.39,100.0,84,,,0.25,0.25,52.33,...,0.94,0.23,0.93,69.0,7.84,89.30,79.37,0.91,6700.3,6.36
T0858,499,4,103,499s,72.61,100.0,103,,,0.19,0.19,51.61,...,0.95,0.19,0.93,73.9,7.74,89.44,80.58,0.91,6714.6,5.02


### CASP 12
No .tar.gz file, only a list of .txt files

In [18]:
base_url = 'https://predictioncenter.org/download_area/CASP12/SUMMARY_TABLES/'
request = requests_get(base_url)
soup = bs4.BeautifulSoup(request.content)

dfs = {}
for a in soup.select('table tr a'):
    href = a.attrs['href']
    if regex.fullmatch(href):
        response = requests_get(base_url + href)
        df = pd.read_csv(io.BytesIO(response.content), sep='\s+')                
        dfs[href] = df
df = pd.concat(dfs.values(), axis=0)

# Split accession codes
split = df['Model'].str.split('TS', expand=True)
df['Target'] = split[0]
split = split[1].str.split('_', expand=True, n=1)
df['Group'] = split[0].astype(int)
df['Model'] = split[1]
df.set_index(['Target', 'Group', 'Model'], inplace=True)
df.sort_index(inplace=True)

Path('CASP12/QA_official').mkdir(exist_ok=True, parents=True)
df.to_pickle('CASP12/QA_official/table.pkl.xz')

print('Unique targets:', len(df.index.unique(level='Target')))
print('Unique groups:', len(df.index.unique(level='Group')))
df

Unique targets: 77
Unique groups: 128


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,#,GR#,GDT_TS,NP_P,RANK,Z-M1-GDT,Z-M1s-GDT,Z-MA-GDT,Z-MAs-GDT,GDT_HA,...,DFM,Handed.,SOV,CE,QCS,CONTS,TMscore,Dali(raw),FlexE,QSE
Target,Group,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
T0859,1,1,4,001,27.66,100.00,2,1.85,,2.18,,19.25,...,1.54,0.51,51.40,4.25,28.90,38.83,0.58,401.6,257.42,
T0859,4,1,109,004,22.79,100.00,107,0.63,,0.80,,16.15,...,1.47,0.51,42.10,3.70,33.08,38.28,0.32,129.5,187.42,62.36
T0859,4,2,148,004,21.90,100.00,144,,,0.55,,15.93,...,1.63,0.51,34.00,3.29,31.68,39.16,0.34,120.9,150.16,70.40
T0859,4,3,34,004,25.22,100.00,32,,,1.49,,15.71,...,1.77,0.54,37.10,3.29,25.14,38.00,0.37,188.1,139.78,74.50
T0859,4,4,147,004,21.90,100.00,144,,,0.55,,15.71,...,1.78,0.48,39.10,3.50,26.82,36.88,0.32,162.4,165.13,74.22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T0948,498,1,8,498,75.50,100.00,6,1.01,,1.07,,59.23,...,0.33,0.90,85.70,6.11,89.75,81.57,0.82,1188.2,7.02,52.85
T0948,498,2,220,498,64.43,100.00,220,,,0.62,,46.98,...,0.47,0.82,79.60,5.73,83.88,74.51,0.76,1052.4,16.44,56.69
T0948,498,3,103,498,71.48,100.00,102,,,0.91,,52.85,...,0.32,0.87,84.80,5.99,88.80,79.78,0.82,1205.4,9.30,57.08
T0948,498,4,200,498,66.78,97.99,200,,,0.72,,50.00,...,0.48,0.81,78.29,5.86,85.37,74.40,0.76,1073.6,21.06,34.25


### CASP 13
Single .tar.gz file with separate .txt files inside, 
but also has some additional targets that are not in the .tar.gz file

In [19]:
url = 'https://predictioncenter.org/download_area/CASP13/results/tables/casp13.res_tables.T.tar.gz'
response = requests_get(url)
archive = tarfile.open(fileobj=io.BytesIO(response.content), mode='r')

dfs = {}
for member in archive.getmembers():
    if member.isfile() and regex.fullmatch(member.name):
        f = archive.extractfile(member)
        df = pd.read_csv(f, sep='\s+')                
        dfs[member.name] = df
df = pd.concat(dfs.values(), axis=0)

# Split accession codes
split = df['Model'].str.split('TS', expand=True)
df['Target'] = split[0]
split = split[1].str.split('_', expand=True, n=1)
df['Group'] = split[0].astype(int)
df['Model'] = split[1]
df.set_index(['Target', 'Group', 'Model'], inplace=True)
df.sort_index(inplace=True)

Path('CASP13/QA_official').mkdir(exist_ok=True, parents=True)
df.to_pickle('CASP13/QA_official/table.pkl.xz')

print('Unique targets:', len(df.index.unique(level='Target')))
print('Unique groups:', len(df.index.unique(level='Group')))
df

Unique targets: 20
Unique groups: 98


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,#,GR#,GDT_TS,NP_P,RANK,Z-M1-GDT,Z-M1s-GDT,Z-MA-GDT,Z-MAs-GDT,GDT_HA,...,CONTS,TMscore,Dali(raw),FlexE,QSE,CAD_SS,MolPrb_clash,MolPrb_rotout,MolPrb_ramout,MolPrb_ramfv
Target,Group,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
T0950,4,1,144,004s,10.89,100.00,143,-0.74,-0.74,-0.71,-0.71,7.61,...,48.55,0.16,208.7,170.08,29.05,0.08,0.00,2.50,0.28,94.59
T0950,4,2,46,004s,17.62,100.00,46,,,0.35,0.35,9.57,...,54.46,0.29,583.2,266.73,29.29,0.10,0.00,1.88,0.28,95.16
T0950,4,3,175,004s,8.63,100.00,175,,,-1.07,-1.07,5.92,...,49.27,0.12,223.3,271.86,24.11,0.09,0.17,2.50,1.42,95.44
T0950,4,4,142,004s,10.96,56.14,142,,,-0.70,-0.70,7.82,...,30.26,0.16,193.7,77.98,34.65,0.05,0.00,4.73,1.58,95.26
T0950,4,5,143,004s,10.89,100.00,143,,,-0.71,-0.71,7.61,...,48.55,0.16,267.0,170.08,29.05,0.08,0.00,2.50,0.28,94.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T1022s1,498,1,74,498s,35.87,100.00,73,0.84,1.58,1.11,1.89,22.54,...,67.50,0.48,673.6,8.07,60.68,0.31,16.14,16.92,9.25,77.53
T1022s1,498,2,60,498s,37.11,100.00,60,,,1.22,2.04,22.53,...,68.75,0.51,677.0,8.19,63.65,0.31,28.04,25.64,8.81,80.18
T1022s1,498,3,101,498s,33.86,100.00,98,,,0.92,1.66,22.31,...,68.13,0.45,684.3,9.02,60.23,0.32,16.71,21.03,8.81,79.74
T1022s1,498,4,116,498s,32.85,100.00,115,,,0.83,1.54,20.85,...,66.63,0.44,655.5,9.00,62.49,0.30,17.56,17.95,9.69,78.41


### All CASPs together

In [20]:
df = pd.concat([
    pd.read_pickle(p)
    for p in Path().glob('CASP*/QA_official/table.pkl.xz')
], keys=[9,10,11,12,13], names=['Edition'])

print('Unique targets:', len(df.index.unique(level='Target')))
print('Unique groups:', len(df.index.unique(level='Group')))

with pd.option_context('display.max_rows', 15):
    cols_to_keep = ['GDT_TS', 'GDT_HA', 'LDDT', 'CAD_AA', 'TMscore']
    display(df[cols_to_keep])
    display(
        df.groupby('Edition')
            .apply(lambda df: df.index.get_level_values('Target').nunique())
            .to_frame('Targets')
    )
del df

Unique targets: 300
Unique groups: 363


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,GDT_TS,GDT_HA,LDDT,CAD_AA,TMscore
Edition,Target,Group,Model,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,T0517,1,1,,48.43,,,
9,T0517,1,2,,49.05,,,
9,T0517,1,3,,48.90,,,
9,T0517,1,4,,46.54,,,
9,T0517,1,5,,49.37,,,
...,...,...,...,...,...,...,...,...
13,T0858,499,1,75.22,54.16,0.64,0.61,0.92
13,T0858,499,2,74.72,53.50,0.65,0.61,0.92
13,T0858,499,3,73.39,52.33,0.64,0.61,0.91
13,T0858,499,4,72.61,51.61,0.64,0.62,0.91


Unnamed: 0_level_0,Targets
Edition,Unnamed: 1_level_1
9,14
10,96
11,20
12,77
13,93


## Other group's QA predictions

These are the QA predictions submitted by other participants to the QA track in CASP 
([file format reference](https://predictioncenter.org/casp13/index.cgi?page=format#QA)).

Start with `PFRMAT QA`

Use `MODEL 1` for predictions submitted in the first stage </br>
(i.e., estimating quality of the selected server models released 5 days after the initial target release)

Use `MODEL 2` for predictions submitted on the second, larger set of TS models </br>
(i.e., estimating quality of models released 7 days after the initial target release).

Timeline example.
- May 1, 9am PDT - target T0644 is released for prediction in non-QA categories.
- May 4, noon - the deadline for submitting tertiary structure predictions by servers.
- May 6, noon - the first set of server TS predictions (up to 20 models selected primarily to test single-model methods) is sent to the registered QA servers and posted on the casp14 archive page (https://predictioncenter.org/download_area/CASP14/server_predictions/). QA predictions (marked as MODEL 1) for this subset are accepted for two days.
- May 8, noon - deadline for "stage 1" QA predictions. The second set of server TS predictions (150 models selected to test both, single-model and clustering methods) is sent to the registered QA servers and posted on the casp14 archive page. QA predictions (marked as MODEL 2) for this second subset of models are accepted for two more days.
- May 10, noon - deadline for "stage 2" QA predictions. All server TS predictions are posted on the casp14 archive page. No further QA predictions (from servers or manual groups) are accepted for this target.

Data are inserted between `MODEL` and `END` records of the submission file. </br>
You may submit your quality assessment prediction in one of the two different modes:
- `QMODE 1` :   global model quality score (MQS - one number per model)
- `QMODE 2` :   MQS and error estimates on per-residue basis.

In both modes, the first column in each line contains model identifier (file name of the accepted 3D prediction). </br>
The second column contains the accuracy score for a model as a whole (MQS). The accuracy score is a real number between 0.0 and 1.0 (1.0 being a perfect model). </br>
If you don't provide error estimates on per residue basis, your data table will consist of these two columns only (Example A).

If you do additionally provide residue error estimates (QMODE 2), 
each consecutive column should contain error estimate in Angstroms for all the consecutive residues in the target 
(i.e., column 3 corresponds to residue 1 in the target, column 4 - to residue 2 and so on). </br>
This way data constitute a table (Number_of_models_for_the_target) BY (Number_of_residues_in_the_target + 1). </br>
Do not skip columns if you are not predicting error estimates for some residues - instead put "X" in the corresponding column (Example B).</br>
Please specify in the REMARKS what you consider to be an error estimate for a residue (CA location error, geometrical center error, etc.).

Note 1. Please, be advised that a QA record line may be very long and that some editors/mailing programs may force line wrap potentially causing unexpected parsing errors. </br>
To avoid this problem we recommend that you split long lines into shorter sublines (50-100 columns of data) by yourself. </br>
Our parser will consider consecutive sublines (starting with the line containing evaluated model name and ending with the line containing the next model name or tag END) a part of the same logical line.

Note 2. Please, be advised that model quality predictions in CASP are evaluated by comparing submitted estimates of 
global reliability and per-residue accuracy of structural models with the values obtained from CASP model evaluation packages (LGA, LDDT, CAD-score and others). </br>
Since the evaluation score that is used across the categories in CASP is GDT_TS, predictors should strive to predict this score in QMODE1 (QA1). </br>
Predicted per-residue distances in QMODE2 should ideally reproduce those extracted from the LGA optimal model-target superpositions.

Examples:
- (A) Global Model Quality Score
    ```
    PFRMAT QA
    TARGET T0999
    AUTHOR 1234-5678-9000
    METHOD Description of methods used
    MODEL 1
    QMODE 1
    3D-JIGSAW_TS1 0.8 
    FORTE1_AL1.pdb 0.7 
    END
    ```
- (B) Residue-based Quality Assessment (fragment of the table). 
  Note, that this case includes case (A) and there is no need to submit QMODE 1 predictions additionally to QMODE 2.
    ```
    PFRMAT QA
    TARGET T0999
    AUTHOR 1234-5678-9000
    REMARK Error estimate is CA-CA distance in Angstroms
    METHOD Description of methods used
    MODEL 1
    QMODE 2
    3D-JIGSAW_TS1 0.8 10.0 6.5 5.0 2.0 1.0  
    5.0 4.3 4.6
    FORTE1_AL1.pdb 0.7 8.0 5.5 4.5 X X 
    4.5 4.2 5.0 
    END
    ```

### QA group names for CASP 11, 12, 13

Each group participating in the QA track is assigned an id like `QA014`, 
but we also need the name of the group, which can be easily reconnected to the QA method used by the group.

In [2]:
def extract_groups(soup):
    for tr in soup.select('tr'):
        if tr.attrs != {'class': [], 'onmouseover': 'row_over(this)', 'onmouseout': 'row_out(this)'}:
            continue
        tds = tr.select('td')
        if 'QA' not in tds[4].text:
            continue
        group_name = tds[0].text
        group_id = 'QA' + tds[1].text
        yield group_id, group_name

for casp_ed in [11,12,13]:
    response = requests_get(f'https://predictioncenter.org/casp{casp_ed}/docs.cgi?view=groupsbyname')
    soup = bs4.BeautifulSoup(response.content)
    
    df_groups = pd.DataFrame(
        list(extract_groups(soup)), 
        columns=['qa_group_id', 'qa_group_name']
    ).sort_values('qa_group_id').reset_index(drop=True)
    
    dest = Path(f'CASP{casp_ed}/QA_groups.csv')
    dest.parent.mkdir(exist_ok=True, parents=True)
    df_groups.to_csv(dest, header=True, index=False)

display(df_groups.style.set_caption('CASP 13').hide_index())

qa_group_id,qa_group_name
QA014,Bhattacharya-ClustQ
QA022,Pcons
QA023,MULTICOM-NOVEL
QA027,FaeNNz
QA030,VoroMQA-B
QA044,ProQ2
QA058,MULTICOM_CLUSTER
QA065,Jagodzinski-Cao-QA
QA067,LamoureuxLab
QA083,Pcomb


### Download all QA predictions

Each CASP has a slightly different file structure for QA predictions.

In [22]:
dest = 'CASP9/QA_predictions'
base_url = 'https://predictioncenter.org/download_area/CASP9/predictions/'
qa_urls = [
    'QA_T0515-T0539.tar.gz',
    'QA_T0540-T0569.tar.gz',
    'QA_T0570-T0599.tar.gz',
    'QA_T0600-T0629.tar.gz',
    'QA_T0630-T0643.tar.gz',
]

if not Path(dest).is_dir():
    Path(dest).mkdir(parents=True)
    for qa_url in qa_urls:
        ! curl -s {base_url}{qa_url} | tar xz --directory {dest}
    for p in Path(dest).glob('T????QA???_?'):
        Path(dest).joinpath(p.name[:5]).mkdir(exist_ok=True)
        p.rename(Path(dest) / p.name[:5] / p.name)

! ls {dest} | wc -l
! du -sh {dest}

129
1.1G	CASP9/QA_predictions


In [23]:
dest = 'CASP10/QA_predictions'
base_url = 'https://predictioncenter.org/download_area/CASP10/predictions/'
qa_urls = [
    'QA_T0644-T0669.tar.gz',
    'QA_T0670-T0699.tar.gz',
    'QA_T0700-T0729.tar.gz',
    'QA_T0730-T0758.tar.gz',
]

if not Path(dest).is_dir():
    Path(dest).mkdir(parents=True)
    for qa_url in qa_urls:
        ! curl -s {base_url}{qa_url} | tar xz --strip 1 --directory {dest}

! ls {dest} | wc -l
! du -sh {dest}

113
620M	CASP10/QA_predictions


In [24]:
dest = 'CASP11/QA_predictions'
base_url = 'https://predictioncenter.org/download_area/CASP11/predictions/'
qa_urls = [
    'QA_T0759-799.tar.gz',
    'QA_T0800-829.tar.gz',
    'QA_T0830-858.tar.gz',
]

if not Path(dest).is_dir():
    Path(dest).mkdir(parents=True)
    for qa_url in qa_urls:
        ! curl -s {base_url}{qa_url} | tar xz --strip 1 --directory {dest}
        
! ls {dest} | wc -l
! du -sh {dest}

98
660M	CASP11/QA_predictions


In [25]:
dest = 'CASP12/QA_predictions'
base_url = 'https://predictioncenter.org/download_area/CASP12/predictions/'
qa_urls = [
    'CASP12_QA_T08x.tgz',
    'CASP12_QA_T09x.tgz',
]

if not Path(dest).is_dir():
    Path(dest).mkdir(parents=True)
    for qa_url in qa_urls:
        ! curl -s {base_url}{qa_url} | tar xz --directory {dest}
        
! ls {dest} | wc -l
! du -sh {dest}

70
526M	CASP12/QA_predictions


In [26]:
# Map CASP 12 naming to decoy_id naming:
# T0949TS145_1 -> QUARK_TS1
# TODO

In [27]:
dest = 'CASP13/QA_predictions'
base_url = 'https://predictioncenter.org/download_area/CASP13/predictions/QA/'
qa_urls = [
    # Stage 1
    'QA1.all.tar.gz',
    # Stage 2
    'QA2.T095_.tar.gz',
    'QA2.T096_.tar.gz',
    'QA2.T097_.tar.gz',
    'QA2.T098_.tar.gz',
    'QA2.T099_.tar.gz',
    'QA2.T100_.tar.gz',
    'QA2.T101_.tar.gz',
    'QA2.T102_.tar.gz',
]

if not Path(dest).is_dir():
    Path(dest).mkdir(parents=True)
    for qa_url in qa_urls:
        ! curl -s {base_url}{qa_url} | tar xz --directory {dest}

! ls {dest} | wc -l
! du -sh {dest}

86
795M	CASP13/QA_predictions


In [28]:
# Map CASP 13 naming to decoy_id naming:
# T0949TS145_1 -> QUARK_TS1
mapping = {}

base_url = "https://www.predictioncenter.org/download_area/CASP13/predictions/TS_as_submitted/"
urls = [
   "T0949.TS_as_accepted.tar.gz",
   "T0950.TS_as_accepted.tar.gz",
   "T0951.TS_as_accepted.tar.gz",
   "T0953s1.TS_as_accepted.tar.gz",
   "T0953s2.TS_as_accepted.tar.gz",
   "T0954.TS_as_accepted.tar.gz",
   "T0955.TS_as_accepted.tar.gz",
   "T0956.TS_as_accepted.tar.gz",
   "T0957s1.TS_as_accepted.tar.gz",
   "T0957s2.TS_as_accepted.tar.gz",
   "T0958.TS_as_accepted.tar.gz",
   "T0959.TS_as_accepted.tar.gz",
   "T0960.TS_as_accepted.tar.gz",
   "T0961.TS_as_accepted.tar.gz",
   "T0962.TS_as_accepted.tar.gz",
   "T0963.TS_as_accepted.tar.gz",
   "T0964.TS_as_accepted.tar.gz",
   "T0965.TS_as_accepted.tar.gz",
   "T0966.TS_as_accepted.tar.gz",
   "T0967.TS_as_accepted.tar.gz",
   "T0968s1.TS_as_accepted.tar.gz",
   "T0968s2.TS_as_accepted.tar.gz",
   "T0969.TS_as_accepted.tar.gz",
   "T0970.TS_as_accepted.tar.gz",
   "T0971.TS_as_accepted.tar.gz",
   "T0972.TS_as_accepted.tar.gz",
   "T0973.TS_as_accepted.tar.gz",
   "T0974s1.TS_as_accepted.tar.gz",
   "T0974s2.TS_as_accepted.tar.gz",
   "T0975.TS_as_accepted.tar.gz",
   "T0976.TS_as_accepted.tar.gz",
   "T0977.TS_as_accepted.tar.gz",
   "T0978.TS_as_accepted.tar.gz",
   "T0979.TS_as_accepted.tar.gz",
   "T0980s1.TS_as_accepted.tar.gz",
   "T0980s2.TS_as_accepted.tar.gz",
   "T0981.TS_as_accepted.tar.gz",
   "T0982.TS_as_accepted.tar.gz",
   "T0983.TS_as_accepted.tar.gz",
   "T0984.TS_as_accepted.tar.gz",
   "T0985.TS_as_accepted.tar.gz",
   "T0986s1.TS_as_accepted.tar.gz",
   "T0986s2.TS_as_accepted.tar.gz",
   "T0987.TS_as_accepted.tar.gz",
   "T0988.TS_as_accepted.tar.gz",
   "T0989.TS_as_accepted.tar.gz",
   "T0990.TS_as_accepted.tar.gz",
   "T0991.TS_as_accepted.tar.gz",
   "T0992.TS_as_accepted.tar.gz",
   "T0993s1.TS_as_accepted.tar.gz",
   "T0993s2.TS_as_accepted.tar.gz",
   "T0994.TS_as_accepted.tar.gz",
   "T0995.TS_as_accepted.tar.gz",
   "T0996.TS_as_accepted.tar.gz",
   "T0997.TS_as_accepted.tar.gz",
   "T0998.TS_as_accepted.tar.gz",
   "T0999.TS_as_accepted.tar.gz",
   "T1000.TS_as_accepted.tar.gz",
   "T1001.TS_as_accepted.tar.gz",
   "T1002.TS_as_accepted.tar.gz",
   "T1003.TS_as_accepted.tar.gz",
   "T1004.TS_as_accepted.tar.gz",
   "T1005.TS_as_accepted.tar.gz",
   "T1006.TS_as_accepted.tar.gz",
   "T1007.TS_as_accepted.tar.gz",
   "T1008.TS_as_accepted.tar.gz",
   "T1009.TS_as_accepted.tar.gz",
   "T1010.TS_as_accepted.tar.gz",
   "T1011.TS_as_accepted.tar.gz",
   "T1012.TS_as_accepted.tar.gz",
   "T1013.TS_as_accepted.tar.gz",
   "T1014.TS_as_accepted.tar.gz",
   "T1015s1.TS_as_accepted.tar.gz",
   "T1015s2.TS_as_accepted.tar.gz",
   "T1016.TS_as_accepted.tar.gz",
   "T1017s1.TS_as_accepted.tar.gz",
   "T1017s2.TS_as_accepted.tar.gz",
   "T1018.TS_as_accepted.tar.gz",
   "T1019s1.TS_as_accepted.tar.gz",
   "T1019s2.TS_as_accepted.tar.gz",
   "T1020.TS_as_accepted.tar.gz",
   "T1021s1.TS_as_accepted.tar.gz",
   "T1021s2.TS_as_accepted.tar.gz",
   "T1021s3.TS_as_accepted.tar.gz",
   "T1022s1.TS_as_accepted.tar.gz",
   "T1022s2.TS_as_accepted.tar.gz",
   "T1023s1.TS_as_accepted.tar.gz",
   "T1023s2.TS_as_accepted.tar.gz",
   "T1023s3.TS_as_accepted.tar.gz",
]

for url in urls:
    target_dir = Path('/tmp/casp13').joinpath(url[:-7])
    if not target_dir.is_dir():
        target_dir.mkdir(parents=True)
        ! curl -s {base_url}{url} | tar xz --strip 2 --directory {target_dir.as_posix()}
    
    for p in target_dir.iterdir():
        try:
            with p.open() as f:
                for l in f:
                    if l.startswith('TARGET'):
                        target_id = l.split()[1]
                    if l.startswith('AUTHOR'):
                        decoy_id = l.split()[1]
                    if l.startswith('MODEL'):
                        model = l.split()[1]
                        break
                decoy_id = f'{decoy_id}_TS{model}'
                mapping[p.name] = (target_id, decoy_id)
                del target_id, decoy_id, model
        except UnicodeDecodeError as e:
            print(p, e)
        except IndexError as e:
            print(p, repr(l), e)
        except ValueError as e:
            print(p, repr(l), e)
            
! rm -r '/tmp/casp13'
with open('CASP13/decoy_name_mapping.pkl', 'wb') as f:
    pickle.dump(mapping, f)

/tmp/casp13/T0953s1.TS_as_accepted/T0953s1TS401_1 'utf-8' codec can't decode byte 0x96 in position 1581: invalid start byte
/tmp/casp13/T0953s2.TS_as_accepted/T0953s2TS214_1 'utf-8' codec can't decode byte 0xa0 in position 59: invalid start byte
/tmp/casp13/T0953s2.TS_as_accepted/T0953s2TS214_2 'utf-8' codec can't decode byte 0xa0 in position 59: invalid start byte
/tmp/casp13/T0988.TS_as_accepted/T0988TS452_1 'AUTHOR \n' list index out of range


### Parsing QA submissions

These are some QA submissions for target `T0759`:

In [29]:
! ls CASP11/QA_predictions/T0759 | head -n3

T0759QA008_1
T0759QA008_2
T0759QA020_1


These are the predictions made by QA group `QA008` for all decoys of target `T0759` submitted in stage `1`:

In [30]:
! head -n 10 CASP11/QA_predictions/T0759/T0759QA008_1

PFRMAT QA
TARGET T0759
MODEL 1
QMODE 2
server04_TS1 0.391 6.073 5.287 5.008 3.850 3.101 3.084 2.224 1.875 7.066 2.143 2.003 2.066 2.202 2.197 2.393 2.215 
2.140 2.488 2.329 2.582 2.470 2.333 2.127 2.227 2.140 2.062 2.075 2.129 2.002 2.656 2.265 
2.256 2.227 2.110 2.494 2.395 2.373 2.187 2.232 7.886 7.705 2.420 2.486 3.108 2.026 1.996 
2.062 2.024 1.949 2.024 2.419 2.061 2.773 2.942 2.611 2.524 6.676 2.471 2.170 2.205 2.426 
2.262 2.142 2.321 2.822 2.204 3.025 2.476 2.272 5.215 2.273 2.535 8.080 2.579 2.167 2.629 
2.555 2.656 2.065 3.211 2.526 2.918 2.433 2.512 2.593 2.882 2.293 2.728 2.710 2.429 2.337 


These are the predictions made by QA group `QA008` for all decoys of target `T0759` submitted in stage `2`:

In [31]:
! head -n 10 CASP11/QA_predictions/T0759/T0759QA008_2

PFRMAT QA
TARGET T0759
MODEL 2
QMODE 2
MUFOLD-Server_TS2 0.660 7.167 6.428 5.492 4.609 3.464 3.637 2.478 2.194 7.383 2.368 3.012 2.580 2.309 2.271 2.672 2.409 
7.838 2.410 2.324 2.495 2.432 2.362 2.159 2.413 2.208 2.154 2.220 2.319 2.091 2.778 2.279 
2.181 2.026 1.903 2.280 2.099 2.137 2.010 1.996 2.179 7.554 2.302 2.447 3.196 2.105 1.993 
2.075 1.996 1.853 1.940 2.319 1.946 2.513 2.723 2.376 2.313 2.704 2.304 2.084 2.138 2.442 
2.213 2.101 2.276 2.702 2.082 2.401 2.273 2.135 2.154 2.047 2.300 7.870 2.440 2.027 2.420 
2.393 2.718 2.046 3.100 2.502 2.908 2.638 2.529 8.209 3.034 2.380 3.049 2.802 2.502 2.548 


In [3]:
def parse_float_score(score):
    if score == 'X':
        return float('NaN')
    return float(score)

def parse_filename(path):
    target, rest = path.name.split('QA')
    qa_group, stage = rest.split('_')
    return target, f'QA{qa_group}', int(stage)

@logger.catch(reraise=True)
def parse_qa_submission(path):   
    with open(path) as f:
        pformat = f.readline().split()
        if pformat != ['PFRMAT', 'QA']:
            raise ValueError(pformat)
        target = f.readline().split()[1]
        stage = int(f.readline().split()[1])
        qmode = int(f.readline().split()[1])

        if qmode == 1:
            split = f.readline().split()
            while split[0] != 'END':
                decoy, global_score = split
                yield decoy, parse_float_score(global_score), None
                split = f.readline().split()
        elif qmode == 2:
            decoy = None
            global_score = None
            local_scores = None

            line = f.readline()
            while line != '' and line.strip() != 'END':
                split = line.split()
                decoy = split[0]
                global_score = parse_float_score(split[1])
                local_scores = [parse_float_score(s) for s in split[2:]]

                try:
                    while line != '':
                        line = f.readline()
                        split = line.split()
                        local_scores.extend(parse_float_score(s) for s in split)
                except ValueError:
                    pass
                yield decoy, global_score, local_scores
        else:
            raise ValueError(qmode)

In [None]:
for casp_ed in [11,12,13]: # [9,10,11,12,13]
    df_global = []
    df_local = {}
    qa_group_names = pd.read_csv(f'CASP{casp_ed}/QA_groups.csv').set_index('qa_group_id')['qa_group_name'].to_dict()
    
    if casp_ed == 13:
        with open('CASP13/decoy_name_mapping.pkl', 'rb') as f:
            casp_13_decoy_mapping = pickle.load(f)
    
    parsed_decoys = 0
    targets = list(Path(f'CASP{casp_ed}/QA_predictions').glob('T*'))
    casp_bar = tqdm.tqdm(targets, desc=f'CASP{casp_ed}', unit='targets')
    for target_path in casp_bar:
        for path in target_path.glob('T*QA*'):
            target_id, qa_group_id, stage = parse_filename(path)
            qa_group_id = qa_group_names[qa_group_id]
            
            for decoy_id, global_score, local_scores in parse_qa_submission(path):
                if casp_ed == 13:
                    _, decoy_id = casp_13_decoy_mapping[decoy_id]
                parsed_decoys += 1
                df_global.append((qa_group_id, target_id, decoy_id, stage, global_score))
                if local_scores is not None:
                    local_scores = pd.Series(local_scores, name='pred').rename_axis('residue_idx')
                    df_local[(qa_group_id, target_id, decoy_id, stage)] = local_scores
        
        casp_bar.set_postfix({'parsed decoys': parsed_decoys})
    casp_bar.close()
    
    # Global scores
    df_global = pd.DataFrame(df_global, columns=['qa_group_id', 'target_id', 'decoy_id', 'stage', 'pred'])
    df_global.sort_values(['qa_group_id', 'target_id', 'decoy_id', 'stage', 'pred'], inplace=True)
    (
        df_global
        .set_index(['qa_group_id', 'target_id', 'decoy_id', 'stage'])
        .to_pickle(f'CASP{casp_ed}/QA_predictions/global.pkl.xz', compression='xz')
    )
    
    print('Raw dataframe')
    display(df_global.set_index(['qa_group_id', 'target_id', 'decoy_id', 'stage']))

    print('Number of decoys of each target scored by each group in each stage')
    display(
        df_global.groupby(['qa_group_id', 'target_id', 'stage'])
        .size()
        .unstack('stage', fill_value=0)
    )

    print('Number of targets considered by each group in each CASP')
    display(
        df_global
        .groupby(['qa_group_id', 'stage'])
        .agg({'target_id': 'nunique'})
        .unstack('stage', fill_value=0)
    )
    del df_global
    
    # Local scores
    df_local = pd.concat(
        df_local.values(), 
        keys=df_local.keys(), 
        names=['qa_group_id', 'target_id', 'decoy_id', 'stage']
    ).sort_index().to_frame()
    df_local.to_pickle(f'CASP{casp_ed}/QA_predictions/local.pkl.xz', compression='xz')
    
    print('Raw dataframe')
    display(df_local)
    
    print('By stage')
    display(
        df_local
        .groupby(['qa_group_id', 'target_id', 'decoy_id', 'stage'])
        .first()
        .groupby(['qa_group_id', 'target_id', 'stage'])
        .size()
        .unstack('stage', fill_value=0)
    )
    del df_local

HBox(children=(FloatProgress(value=0.0, description='CASP11', max=98.0, style=ProgressStyle(description_width=…


Raw dataframe


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pred
qa_group_id,target_id,decoy_id,stage,Unnamed: 4_level_1
BITS,T0759,3D-Jigsaw-V5_1_TS1,2,0.4740
BITS,T0759,3D-Jigsaw-V5_1_TS2,2,0.4765
BITS,T0759,3D-Jigsaw-V5_1_TS3,2,0.5718
BITS,T0759,3D-Jigsaw-V5_1_TS4,2,0.6246
BITS,T0759,3D-Jigsaw-V5_1_TS5,2,0.4703
...,...,...,...,...
raghavagps-qaspro,T0858,slbio_TS1,2,0.7600
raghavagps-qaspro,T0858,slbio_TS2,2,0.8200
raghavagps-qaspro,T0858,slbio_TS3,2,0.8300
raghavagps-qaspro,T0858,slbio_TS4,2,0.8400


Number of decoys of each target scored by each group in each stage


Unnamed: 0_level_0,stage,1,2
qa_group_id,target_id,Unnamed: 2_level_1,Unnamed: 3_level_1
BITS,T0759,20,150
BITS,T0760,20,150
BITS,T0761,20,150
BITS,T0762,20,150
BITS,T0763,20,150
...,...,...,...
raghavagps-qaspro,T0854,20,150
raghavagps-qaspro,T0855,20,150
raghavagps-qaspro,T0856,20,150
raghavagps-qaspro,T0857,20,150


Number of targets considered by each group in each CASP


Unnamed: 0_level_0,target_id,target_id
stage,1,2
qa_group_id,Unnamed: 1_level_2,Unnamed: 2_level_2
BITS,98,97
ConsMQAPsingle,89,82
DAVIS-QAconsensus,98,97
DAVIS-QAconsensusALL,98,97
DandekarLab,36,36
FUSION,98,97
LNCCUnB,72,77
MQAPmulti,89,82
MQAPsingle,89,82
MQAPsingleA,89,81


In [None]:
! du -hsc CASP*/QA_predictions/*.pkl.xz

## Official QA local scores

Offical local scores (per-residue) are only avaliable for CASP 13.

### CASP 13

In [110]:
base_url = 'https://predictioncenter.org/download_area/CASP13/results/sda/'
response = requests_get(base_url)
soup = bs4.BeautifulSoup(response.content)

links = [
    a.attrs['href']
    for a in soup.select('tr td:nth-child(2) a')
    if re.search(r'T\d{4}(?:s\d)?.*\.tgz', a.attrs['href'])
]
with tempfile.NamedTemporaryFile(mode='w') as f:
    f.write('\n'.join([base_url + l for l in links]))
    ! wget --input-file {f.name} --directory-prefix='CASP13/QA_official'

for l in links:
    ! tar xf "CASP13/QA_official/{l}" --directory 'CASP13/QA_official/' && rm "CASP13/QA_official/{l}"

! du -sh CASP13/QA_official/

11G	CASP13/QA_official/


In [117]:
with open('CASP13/decoy_name_mapping.pkl', 'rb') as f:
    decoy_name_mapping = pickle.load(f)

def parse_lga(lga_path):
    residue_dist_mapping = {}
    with open(lga_path) as f:
        for l in filter(lambda l: l.startswith('LGA '), f):
            residue_idx = int(l.split()[2]) - 1
            distance = float(l.split()[5])
            residue_dist_mapping[residue_idx] = distance
    return residue_dist_mapping

In [182]:
target_links_mapping = {}
for l in links:
    target_id = l.split('.')[0].split('-')[0]
    target_links_mapping.setdefault(target_id, []).append(l)

for target_id in target_links_mapping:
    if f'{target_id}.tgz' in target_links_mapping[target_id]:
        # T0950.tgz is avalable, ignore T0950-D?.tgz files
        target_links_mapping[target_id] = f'{target_id}.tgz'
        
distances_true = {
    # target_id -> {
    #     decoy_id -> {
    #        residue_idx -> distance
    #     }
    # }
}
        
for target_id in target_links_mapping:
    target_series = {}
    if isinstance(target_links_mapping[target_id], str):
        # There is a single T0950.tgz file, load all ground-truth distances from it
        for lga_file in Path(f'CASP13/QA_official/{target_links_mapping[target_id]}').with_suffix('').glob('*.lga'):
            try:
                _, decoy_id = decoy_name_mapping[lga_file.with_suffix('').name]
            except KeyError:
                logger.warning(f'{lga_file.with_suffix("").name} not found in decoy_name_mapping')
                continue
            lga_dict = parse_lga(lga_file)
            if len(lga_dict) == 0:
                logger.warning(f'Check {lga_file} ({target_id} {decoy_id})')
                continue
            decoy_series = pd.Series(lga_dict).rename_axis('residue_idx').rename('true')
            target_series[decoy_id] = decoy_series
    else:
        # The target has been split into domains, e.g. T0984-D1.tgz T0984-D2.tgz, 
        # must merge individual files
        for domain_folder in target_links_mapping[target_id]:
            for lga_file in Path(f'CASP13/QA_official/{domain_folder}').with_suffix('').glob('*.lga'):
                try:
                    _, decoy_id = decoy_name_mapping[lga_file.with_suffix('').name.split('-')[0]]
                except KeyError:
                    logger.warning(f'{lga_file.with_suffix("").name} not found in decoy_name_mapping')
                    continue
                lga_dict = parse_lga(lga_file)
                if len(lga_dict) == 0:
                    logger.warning(f'Check {lga_file} ({target_id} {decoy_id})')
                    continue
                target_series.setdefault(decoy_id, {}).update(lga_dict)
        for decoy_id in target_series:
            target_series[decoy_id] = pd.Series(target_series[decoy_id]).rename_axis('residue_idx').rename('true')
    
    # Concat all decoys of a target into a single series
    distances_true[target_id] = pd.concat(
        [v for v in target_series.values() if isinstance(v, pd.Series)], 
        keys=[k for k in target_series if isinstance(target_series[k], pd.Series)], 
        names=['decoy_id']
    )

# Concat all targets of CASP13 into a single series    
distances_true = pd.concat(distances_true.values(), keys=distances_true.keys(), names=['target_id'])
                
distances_true.to_pickle('CASP13/QA_official/distances_true.pkl')
! du -h 'CASP13/QA_official/distances_true.pkl'



144M	CASP13/QA_official/distances_true.pkl


In [174]:
# Example dataframe
distances_true.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,true
target_id,decoy_id,residue_idx,Unnamed: 3_level_1
T0949,5117-9799-8107_TS5,42,2.274
T0949,5117-9799-8107_TS5,43,2.047
T0949,5117-9799-8107_TS5,44,2.225
T0949,5117-9799-8107_TS5,45,1.599
T0949,5117-9799-8107_TS5,46,0.661
...,...,...,...
T1022s2,3166-0463-8476_TS1,522,122.095
T1022s2,3166-0463-8476_TS1,523,125.857
T1022s2,3166-0463-8476_TS1,524,124.972
T1022s2,3166-0463-8476_TS1,525,130.301
