In [7]:
import os
import requests
from Bio import PDB
from Bio.SCOP import Scop, Dom
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Single-letter SCOP class codes that are sampled, mapped to display labels.
_SCOP_CLASSES = {
    'a': 'All-alpha',
    'b': 'All-beta',
    'c': 'Alpha/beta',
    'd': 'Alpha+beta',
    'e': 'Multi-domain',
    'f': 'Membrane',
    'g': 'Small proteins'
}

_SCOP_CLASSIFICATION_URL = "https://scop.berkeley.edu/downloads/parse/dir.des.scope.2.08-stable.txt"
_SCOP_OUTPUT_DIR = 'data/SCOP/raw'
_SCOP_DOMAIN_COLUMNS = ['scop_id', 'class', 'name']


def _parse_scop_domains(text, scop_classes):
    """Return the domain ('px') records found in a tab-separated listing.

    Lines starting with '#' and lines with fewer than four tab-separated
    fields are skipped. A record is kept when its second field is 'px' and
    the first character of its third field is a key of *scop_classes*.

    NOTE(review): the first field is stored as 'scop_id' and the fourth as
    'name'. In the published SCOPe dir.des layout those columns look like the
    numeric sunid and the sid (e.g. "d1abca_") respectively - confirm which
    identifier the ASTRAL endpoint actually expects.
    """
    domains = []
    for line in text.split('\n'):
        if line.startswith('#'):
            continue
        parts = line.strip().split('\t')
        if len(parts) < 4:
            continue
        scop_id, level, sccs, name = parts[:4]
        # Slicing (not indexing) keeps an empty class field from raising.
        class_id = sccs[:1]
        if level == 'px' and class_id in scop_classes:
            domains.append({
                'scop_id': scop_id,
                'class': class_id,
                'name': name
            })
    return domains


def _sample_per_class(df, scop_classes, samples_per_class, random_state):
    """Draw up to *samples_per_class* random rows of *df* for each class."""
    samples = []
    for class_id in scop_classes:
        class_domains = df[df['class'] == class_id]
        if len(class_domains) > 0:
            samples.append(class_domains.sample(
                n=min(samples_per_class, len(class_domains)),
                random_state=random_state,
            ))
    if not samples:
        # pd.concat rejects an empty list; keep the columns so callers can
        # still index the result by 'class'.
        return df.iloc[0:0]
    # One concat at the end instead of re-copying the frame on every class.
    return pd.concat(samples)


def _download_domain_file(session, pdb_id, output_dir, timeout):
    """Fetch one domain file and write it to *output_dir*.

    Failures are reported on stdout and never raised, so one bad domain does
    not abort the remaining downloads. Returns True when a file was written.
    """
    url = f"https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id={pdb_id}&output=text"
    # Announce the attempt before making it, so a failing URL is visible too.
    print(f"Trying URL: {url}")
    try:
        response = session.get(url, verify=False, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error downloading {pdb_id}: {str(e)}")
        return False

    print(f"Status code: {response.status_code}")
    if response.status_code != 200:
        print(f"Failed to download {pdb_id} - Status code: {response.status_code}")
        return False

    content = response.text
    # The server can answer 200 with an HTML page instead of the PDB text.
    if content.startswith('<!DOCTYPE html>'):
        print(f"Received HTML instead of PDB for {pdb_id}")
        print("First 100 characters of response:", content[:100])
        return False

    try:
        with open(os.path.join(output_dir, f'{pdb_id}.pdb'), 'w', encoding='utf-8') as f:
            f.write(content)
    except OSError as e:
        print(f"Error downloading {pdb_id}: {str(e)}")
        return False

    print(f"Successfully downloaded {pdb_id}")
    return True


def download_scop_sample(samples_per_class=500, timeout=30, random_state=None):
    """Download a class-balanced random sample of SCOP domain structures.

    Fetches the SCOPe 2.08 classification listing, keeps the domain ('px')
    entries whose class is one of a-g, samples up to *samples_per_class* of
    them per class, downloads one file per sampled domain into
    ``data/SCOP/raw`` and writes the sample table to
    ``data/SCOP/raw/class_info.csv``.

    Args:
        samples_per_class: Maximum number of domains drawn from each class.
        timeout: Seconds to wait on each HTTP request before giving up.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the selection reproducible; ``None`` keeps it random.

    Returns:
        A DataFrame with columns 'scop_id', 'class' and 'name' describing the
        sampled domains (empty if the listing contained none), or ``None``
        when the classification file request did not return status 200.

    Raises:
        requests.RequestException: If the classification file request itself
            fails (connection error, timeout). Per-domain download failures
            are only printed.
    """
    os.makedirs(_SCOP_OUTPUT_DIR, exist_ok=True)

    # One session reuses the connection across the many per-domain requests.
    with requests.Session() as session:
        # NOTE(review): verify=False disables TLS certificate checking, as the
        # original code did. Drop it if the server's certificate validates.
        response = session.get(_SCOP_CLASSIFICATION_URL, verify=False, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to download classification file. Status code: {response.status_code}")
            return None

        domains = _parse_scop_domains(response.text, _SCOP_CLASSES)

        # Explicit columns keep the frame indexable even when nothing parsed.
        df = pd.DataFrame(domains, columns=_SCOP_DOMAIN_COLUMNS)
        print(f"Found {len(df)} domains")

        balanced_sample = _sample_per_class(
            df, _SCOP_CLASSES, samples_per_class, random_state
        )

        for pdb_id in balanced_sample['scop_id']:
            _download_domain_file(session, pdb_id, _SCOP_OUTPUT_DIR, timeout)

    # Save classification info
    balanced_sample.to_csv(os.path.join(_SCOP_OUTPUT_DIR, 'class_info.csv'), index=False)
    return balanced_sample

# Script entry point: fetch the sample, then report how many domains were
# selected for each class.
if __name__ == "__main__":
    sample_data = download_scop_sample()
    # None means the classification file could not be fetched; stay silent.
    if sample_data is not None:
        print(
            "\nDownload complete. Summary:",
            sample_data['class'].value_counts(),
            sep="\n",
        )

Found 302609 domains
Trying URL: https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id=406327&output=text
Status code: 200
Successfully downloaded 406327
Trying URL: https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id=76731&output=text
Status code: 200
Successfully downloaded 76731
Trying URL: https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id=130401&output=text
Status code: 200
Successfully downloaded 130401
Trying URL: https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id=418150&output=text
Status code: 200
Successfully downloaded 418150
Trying URL: https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id=270562&output=text
Status code: 200
Successfully downloaded 270562
Trying URL: https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id=90357&output=text
Status code: 200
Successfully downloaded 90357
Trying URL: https://scop.berkeley.edu/astral/pdbstyle/?ver=2.08&id=16556&output=text
Status code: 200
Successfully downloaded 16556
Trying URL: https://scop.berkeley.edu/astral/pd