In [7]:
import csv
import os
import requests

def count_pdb_ids(csv_file):
    """Count the number of PDB IDs in the CSV file."""
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        pdb_ids = [row[0].strip() for row in reader if row]  # Read all rows, including the header
    print(f"Total number of PDB IDs (including header): {len(pdb_ids)}")
    return pdb_ids

def download_pdb_files(csv_file, output_dir):
    """Download PDB files listed in a CSV file, including processing the header."""
    os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist

    pdb_ids = count_pdb_ids(csv_file)  # Count and get PDB IDs

    base_url = "https://files.rcsb.org/download/"

    for pdb_id in pdb_ids:
        output_path = os.path.join(output_dir, f"{pdb_id}.pdb")
        if os.path.exists(output_path):
            print(f"Skipped: {pdb_id}.pdb (already downloaded)")
            continue  # Skip downloading if the file already exists
        try:
            pdb_url = f"{base_url}{pdb_id}.pdb"
            response = requests.get(pdb_url, stream=True)
            if response.status_code == 200:
                with open(output_path, 'wb') as pdb_file:
                    pdb_file.write(response.content)
                print(f"Downloaded: {pdb_id}.pdb")
            else:
                print(f"Failed to download: {pdb_id}.pdb (HTTP {response.status_code})")
        except Exception as e:
            print(f"Error downloading {pdb_id}.pdb: {e}")

def check_downloads(csv_file, output_dir):
    """Check if all PDB IDs from the CSV file have been downloaded."""
    pdb_ids = count_pdb_ids(csv_file)  # Count and get PDB IDs

    missing_files = []
    for pdb_id in pdb_ids:
        file_path = os.path.join(output_dir, f"{pdb_id}.pdb")
        if not os.path.exists(file_path):
            missing_files.append(pdb_id)

    if missing_files:
        print("The following PDB files are missing:")
        for pdb_id in missing_files:
            print(f"- {pdb_id}.pdb")
    else:
        print("All PDB files have been successfully downloaded.")

if __name__ == "__main__":
    csv_file_path = "pdb_ids.csv"  # Replace with your CSV file path
    output_directory = "pdb_files"  # Replace with your desired output directory

    # Step 1: Download PDB files
    download_pdb_files(csv_file_path, output_directory)

    # Step 2: Check if all files have been downloaded
    check_downloads(csv_file_path, output_directory)


Total number of PDB IDs (including header): 233
Skipped: 1a15.pdb (already downloaded)
Skipped: 1b2t.pdb (already downloaded)
Skipped: 1b3a.pdb (already downloaded)
Skipped: 1b50.pdb (already downloaded)
Skipped: 1b53.pdb (already downloaded)
Skipped: 1bo0.pdb (already downloaded)
Skipped: 1cm9.pdb (already downloaded)
Skipped: 1dok.pdb (already downloaded)
Skipped: 1dol.pdb (already downloaded)
Skipped: 1dom.pdb (already downloaded)
Skipped: 1don.pdb (already downloaded)
Skipped: 1eig.pdb (already downloaded)
Skipped: 1eih.pdb (already downloaded)
Skipped: 1el0.pdb (already downloaded)
Skipped: 1eot.pdb (already downloaded)
Skipped: 1eqt.pdb (already downloaded)
Skipped: 1esr.pdb (already downloaded)
Skipped: 1f2l.pdb (already downloaded)
Skipped: 1f9p.pdb (already downloaded)
Skipped: 1f9q.pdb (already downloaded)
Skipped: 1f9r.pdb (already downloaded)
Skipped: 1f9s.pdb (already downloaded)
Skipped: 1g2s.pdb (already downloaded)
Skipped: 1g2t.pdb (already downloaded)
Skipped: 1g91.pd

All PDB files have been successfully downloaded.


In [8]:
import os

def write_lines(output_file, lines):
    """Write lines to a file."""
    output_file.writelines(lines)

def extract_and_save_models(input_pdb_filename, output_directory):
    """
    Extract models from a PDB file and save them as separate files.
    If no MODEL/ENDMDL found, save the file as a single model.
    """
    base_filename = os.path.splitext(os.path.basename(input_pdb_filename))[0]
    lines_before_model = []  # Store lines before the first MODEL
    lines_after_models = []  # Store lines after the last ENDMDL
    current_model_lines = []  # Store lines of the current model
    model_count = 0
    is_in_model = False

    with open(input_pdb_filename, 'r') as pdb_file:
        for line in pdb_file:
            if line.startswith('MODEL'):
                if current_model_lines:
                    save_model(output_directory, base_filename, model_count, lines_before_model, current_model_lines, lines_after_models)
                    current_model_lines = []
                model_count += 1
                is_in_model = True
            elif line.startswith('ENDMDL'):
                if is_in_model:
                    save_model(output_directory, base_filename, model_count, lines_before_model, current_model_lines, lines_after_models)
                    current_model_lines = []
                    is_in_model = False
            elif is_in_model:
                current_model_lines.append(line)
            else:
                # Collect lines outside of any MODEL/ENDMDL blocks
                if model_count == 0:
                    lines_before_model.append(line)
                else:
                    lines_after_models.append(line)

    # Handle the case where there's no MODEL/ENDMDL (single model file)
    if model_count == 0:
        save_single_model(input_pdb_filename, output_directory, base_filename)

def save_model(output_directory, base_filename, model_count, lines_before_model, current_model_lines, lines_after_models):
    """
    Save a single model as a PDB file, appending lines before and after the MODEL.
    """
    output_filename = os.path.join(output_directory, f"{base_filename}_model{model_count}.pdb")
    with open(output_filename, 'w') as output_file:
        if model_count == 1:  # Only write pre-MODEL lines for the first model
            write_lines(output_file, lines_before_model)
        write_lines(output_file, current_model_lines)
        write_lines(output_file, lines_after_models)
    print(f"Saved: {output_filename}")

def save_single_model(input_pdb_filename, output_directory, base_filename):
    """
    Save a PDB file as a single model if no MODEL/ENDMDL blocks are present.
    """
    output_filename = os.path.join(output_directory, f"{base_filename}_model1.pdb")
    with open(output_filename, 'w') as single_model_file:
        with open(input_pdb_filename, 'r') as input_pdb_file:
            single_model_file.writelines(input_pdb_file.readlines())
    print(f"Saved single model: {output_filename}")

if __name__ == '__main__':
    # Input and output directories
    pdb_directory = 'pdb_files'
    output_directory = 'output_models'
    os.makedirs(output_directory, exist_ok=True)

    # Process each PDB file in the input directory
    for filename in os.listdir(pdb_directory):
        if filename.endswith('.pdb'):
            input_pdb_file = os.path.join(pdb_directory, filename)
            print(f"Processing {input_pdb_file}...")
            extract_and_save_models(input_pdb_file, output_directory)


Processing pdb_files\1a15.pdb...
Saved single model: output_models\1a15_model1.pdb
Processing pdb_files\1b2t.pdb...
Saved: output_models\1b2t_model1.pdb
Saved: output_models\1b2t_model2.pdb
Saved: output_models\1b2t_model3.pdb
Saved: output_models\1b2t_model4.pdb
Saved: output_models\1b2t_model5.pdb
Saved: output_models\1b2t_model6.pdb
Saved: output_models\1b2t_model7.pdb
Saved: output_models\1b2t_model8.pdb
Saved: output_models\1b2t_model9.pdb
Saved: output_models\1b2t_model10.pdb
Saved: output_models\1b2t_model11.pdb
Saved: output_models\1b2t_model12.pdb
Saved: output_models\1b2t_model13.pdb
Saved: output_models\1b2t_model14.pdb
Saved: output_models\1b2t_model15.pdb
Saved: output_models\1b2t_model16.pdb
Saved: output_models\1b2t_model17.pdb
Saved: output_models\1b2t_model18.pdb
Saved: output_models\1b2t_model19.pdb
Saved: output_models\1b2t_model20.pdb
Processing pdb_files\1b3a.pdb...
Saved single model: output_models\1b3a_model1.pdb
Processing pdb_files\1b50.pdb...
Saved: output_mod

In [10]:
import os

def extract_chemokine_chains(input_directory, output_directory):
    """
    Extract chemokine chains from all PDB files in the input directory.
    Saves the extracted chains as new PDB files in the output directory.
    """
    os.makedirs(output_directory, exist_ok=True)

    for pdb_file in os.listdir(input_directory):
        if pdb_file.endswith(".pdb"):
            input_path = os.path.join(input_directory, pdb_file)
            extract_chemokine_chain_from_pdb(input_path, output_directory)

def extract_chemokine_chain_from_pdb(input_pdb, output_directory):
    """
    Extract chemokine chains from a single PDB file, excluding solvent-only chains.
    """
    chemokine_keywords = ["CHEMOKINE", "CXCL", "CCL"]  # Keywords to identify chemokines
    solvent_residues = {"HOH", "WAT", "SOL"}  # Common solvent residue names
    output_base = os.path.splitext(os.path.basename(input_pdb))[0]

    with open(input_pdb, 'r') as pdb_file:
        lines = pdb_file.readlines()

    chemokine_chains = []
    current_chain = []
    current_chain_id = None
    is_chemokine = False
    contains_non_solvent_atoms = False

    for line in lines:
        if line.startswith("TITLE") or line.startswith("HEADER"):
            if any(keyword in line.upper() for keyword in chemokine_keywords):
                is_chemokine = True

        if line.startswith("ATOM") or line.startswith("HETATM"):
            chain_id = line[21]  # Chain identifier in column 22
            residue_name = line[17:20].strip()  # Residue name in columns 18-20
            if current_chain_id != chain_id:
                if current_chain and contains_non_solvent_atoms:
                    chemokine_chains.append((current_chain_id, current_chain))
                current_chain = []
                contains_non_solvent_atoms = False
                current_chain_id = chain_id

            current_chain.append(line)
            if residue_name not in solvent_residues:
                contains_non_solvent_atoms = True

    # Add the last chain if it contains non-solvent atoms
    if current_chain and contains_non_solvent_atoms and is_chemokine:
        chemokine_chains.append((current_chain_id, current_chain))

    # Save each chain as a separate PDB file
    for chain_id, chain_lines in chemokine_chains:
        output_path = os.path.join(output_directory, f"{output_base}_chain_{chain_id}.pdb")
        with open(output_path, 'w') as output_file:
            output_file.writelines(chain_lines)
        print(f"Extracted chain {chain_id} from {input_pdb} to {output_path}")

if __name__ == "__main__":
    # Define input and output directories
    input_pdb_directory = "pdb_files"  # Replace with the directory containing PDB files
    output_chemokine_directory = "chemokine_chains"  # Directory for extracted chains

    # Extract chemokine chains
    extract_chemokine_chains(input_pdb_directory, output_chemokine_directory)


Extracted chain A from pdb_files\1a15.pdb to chemokine_chains\1a15_chain_A.pdb
Extracted chain B from pdb_files\1a15.pdb to chemokine_chains\1a15_chain_B.pdb
Extracted chain A from pdb_files\1a15.pdb to chemokine_chains\1a15_chain_A.pdb
Extracted chain A from pdb_files\1b2t.pdb to chemokine_chains\1b2t_chain_A.pdb
Extracted chain A from pdb_files\1b3a.pdb to chemokine_chains\1b3a_chain_A.pdb
Extracted chain B from pdb_files\1b3a.pdb to chemokine_chains\1b3a_chain_B.pdb
Extracted chain A from pdb_files\1b3a.pdb to chemokine_chains\1b3a_chain_A.pdb
Extracted chain B from pdb_files\1b3a.pdb to chemokine_chains\1b3a_chain_B.pdb
Extracted chain A from pdb_files\1b50.pdb to chemokine_chains\1b50_chain_A.pdb
Extracted chain B from pdb_files\1b50.pdb to chemokine_chains\1b50_chain_B.pdb
Extracted chain A from pdb_files\1b50.pdb to chemokine_chains\1b50_chain_A.pdb
Extracted chain B from pdb_files\1b50.pdb to chemokine_chains\1b50_chain_B.pdb
Extracted chain A from pdb_files\1b50.pdb to chemoki