In [14]:
import os
import json
import numpy as np
import pandas as pd
import argparse
from math import degrees, acos

def parse_pdb(filepath):
    """Read the backbone atoms (N, CA, C) from a PDB-format text file.

    Every line starting with ``ATOM`` or ``HETATM`` is inspected. The atom
    name is taken from characters 12-16 of the line and the x/y/z
    coordinates from characters 30-38, 38-46 and 46-54.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list of ``(atom_name, coordinates)`` tuples in file order, where
        ``coordinates`` is a NumPy array of three floats.

    Raises:
        ValueError: If a coordinate field of a selected line is not a number.
    """
    record_prefixes = ("ATOM", "HETATM")
    backbone_names = ("N", "CA", "C")
    backbone = []

    with open(filepath, 'r') as handle:
        for record in handle:
            if not record.startswith(record_prefixes):
                continue
            # NOTE(review): the name is stripped, so a HETATM calcium ion
            # ("CA") presumably looks identical to a C-alpha here - confirm
            # whether HETATM records should really be included.
            name = record[12:16].strip()
            if name not in backbone_names:
                continue
            position = np.array([
                float(record[30:38]),
                float(record[38:46]),
                float(record[46:54]),
            ])
            backbone.append((name, position))

    return backbone


# Clash score calculation
def calculate_clash_score(atoms, threshold=2.0):
    """Count atom pairs closer than ``threshold`` and scale per 1000 atoms.

    Args:
        atoms: Sequence of ``(atom_name, coordinates)`` tuples; only the
            coordinates (second element) are used.
        threshold: Distance below which two atoms count as a clash, in the
            same units as the coordinates.

    Returns:
        ``clashes / number_of_atoms * 1000`` as a float, or ``0.0`` when
        ``atoms`` is empty (no atoms means no clashes, and avoids a division
        by zero).

    NOTE(review): every pair of atoms is compared, including neighbours in
    the list. With the default threshold of 2.0, covalently bonded backbone
    atoms (typically ~1.3-1.5 apart in Angstrom coordinates) are presumably
    counted as clashes too - confirm whether bonded pairs should be excluded.
    """
    num_atoms = len(atoms)
    if num_atoms == 0:
        return 0.0

    # One (n, 3) array lets each atom be compared with all later atoms in a
    # single vectorised step while keeping memory use linear in n.
    coordinates = np.asarray([atom[1] for atom in atoms], dtype=float)

    clashes = 0
    for i in range(num_atoms - 1):
        distances = np.linalg.norm(coordinates[i + 1:] - coordinates[i], axis=1)
        clashes += int(np.count_nonzero(distances < threshold))

    return clashes / num_atoms * 1000  # Clash score per 1000 atoms


# Ramachandran outlier calculation
def calculate_phi_psi_angles(atoms):
    """Compute backbone (phi, psi) dihedral angles for interior residues.

    Atoms are grouped into residues in input order: a residue is closed as
    soon as three distinct atom names have been collected. Angles are then
    computed for every residue that has both a predecessor and a successor,
    using the standard backbone definitions:

        phi(i) = dihedral(C[i-1], N[i],  CA[i], C[i])
        psi(i) = dihedral(N[i],   CA[i], C[i],  N[i+1])

    Args:
        atoms: Sequence of ``(atom_name, coordinates)`` tuples whose names
            are "N", "CA" and "C".

    Returns:
        A list of ``(phi, psi)`` tuples, one per interior residue; empty when
        fewer than three complete residues are present.

    NOTE(review): grouping relies purely on atom order, so a residue with a
    missing backbone atom, or a chain break, will presumably shift or mix
    the grouping - confirm the input is a complete, single-chain backbone.
    """
    # Group atoms by residue based on N, CA, and C
    residues = []
    current_residue = {}
    for atom_name, coords in atoms:
        current_residue[atom_name] = coords
        if len(current_residue) == 3:  # When we have N, CA, and C
            residues.append(current_residue)
            current_residue = {}

    # The first and last residues lack one of the two neighbours the angle
    # definitions need, so only interior residues are evaluated.
    phi_psi_angles = []
    for i in range(1, len(residues) - 1):
        previous = residues[i - 1]
        current = residues[i]
        following = residues[i + 1]

        phi = calculate_dihedral(previous["C"], current["N"], current["CA"], current["C"])
        psi = calculate_dihedral(current["N"], current["CA"], current["C"], following["N"])
        phi_psi_angles.append((phi, psi))

    return phi_psi_angles


def calculate_dihedral(p1, p2, p3, p4):
    """Return the signed dihedral angle, in degrees, defined by four points.

    The angle is measured between the plane through (p1, p2, p3) and the
    plane through (p2, p3, p4), about the p2 -> p3 axis. It is computed with
    ``arctan2`` so that the sign is preserved and rounding can never push an
    argument outside a valid domain (unlike ``acos`` of a normalised dot
    product).

    Args:
        p1, p2, p3, p4: 3-D points (array-likes of three numbers).

    Returns:
        The angle as a float in [-180, 180]. ``nan`` is returned when the
        angle is undefined because three consecutive points are collinear or
        coincide.
    """
    p1 = np.asarray(p1, dtype=float)
    p2 = np.asarray(p2, dtype=float)
    p3 = np.asarray(p3, dtype=float)
    p4 = np.asarray(p4, dtype=float)

    b1 = p2 - p1
    b2 = p3 - p2
    b3 = p4 - p3

    # Normals of the two planes; their lengths cancel out inside arctan2, so
    # no explicit normalisation (and no division by zero) is needed.
    n1 = np.cross(b1, b2)
    n2 = np.cross(b2, b3)

    x = np.dot(n1, n2)
    y = np.linalg.norm(b2) * np.dot(b1, n2)

    if x == 0 and y == 0:
        # Degenerate geometry: at least one plane is not defined.
        return float("nan")

    return float(np.degrees(np.arctan2(y, x)))

def calculate_ramachandran_outliers(phi_psi_angles):
    """Return the percentage of (phi, psi) pairs flagged as outliers.

    A pair is flagged when either angle falls outside [-180, 180]; a ``nan``
    angle fails the comparison and is therefore flagged as well.

    Args:
        phi_psi_angles: Sequence of ``(phi, psi)`` tuples in degrees.

    Returns:
        Outliers as a percentage (0-100) of all pairs, or ``0.0`` when the
        sequence is empty (avoids a division by zero for structures too
        short to have any interior residue).

    NOTE(review): every finite dihedral angle lies inside [-180, 180], so
    this test flags nothing but ``nan`` values. The allowed ranges look like
    a placeholder - replace them with real Ramachandran region criteria.
    """
    total = len(phi_psi_angles)
    if total == 0:
        return 0.0

    outliers = sum(
        1
        for phi, psi in phi_psi_angles
        if not (-180 <= phi <= 180 and -180 <= psi <= 180)  # Adjust ranges as needed
    )
    return (outliers / total) * 100

# PAE summary calculation
def summarize_pae(json_file):
    """Return the mean of the PAE matrix stored in a JSON file.

    Two layouts are accepted:

    * a JSON object with a ``"pae"`` key, and
    * a JSON object with a ``"predicted_aligned_error"`` key, optionally
      wrapped in a single-element list (presumably the layout used by
      AlphaFold DB downloads - confirm against the files in use).

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The arithmetic mean of all PAE values (NumPy float).

    Raises:
        KeyError: If the file holds no recognisable PAE matrix. This includes
            a top-level value that is not an object, so callers only need to
            handle one "no PAE here" exception.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(json_file, 'r') as file:
        data = json.load(file)

    # Unwrap the single-element-list layout.
    if isinstance(data, list) and len(data) == 1 and isinstance(data[0], dict):
        data = data[0]
    if not isinstance(data, dict):
        raise KeyError("pae")

    for key in ("pae", "predicted_aligned_error"):
        if key in data:
            pae_values = np.array(data[key])
            break
    else:
        raise KeyError("pae")

    pae_summary = np.mean(pae_values)  # Change to np.median if preferred
    return pae_summary


In [15]:
def process_files(input_folder):
    """Walk a folder tree and collect quality metrics for PDB and JSON files.

    For every ``.pdb`` file a clash score and a Ramachandran outlier
    percentage are computed; for every ``.json`` file a PAE summary is
    computed. Each file yields its own row, so PDB rows have no PAE value
    and JSON rows have no structure metrics.

    Files that cannot be processed are reported on stdout and skipped, so a
    single unreadable or malformed file does not abort the whole run.
    Directories and files are visited in sorted order so that the row order
    is reproducible.

    Args:
        input_folder: Root folder to search recursively.

    Returns:
        A pandas DataFrame with the columns ``Filename``, ``Clash Score``,
        ``Ramachandran Outliers (%)`` and ``PAE Summary``. The columns are
        present even when no file was processed.
    """
    columns = ["Filename", "Clash Score", "Ramachandran Outliers (%)", "PAE Summary"]
    results = []

    for root, dirs, files in os.walk(input_folder):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for file in sorted(files):
            path = os.path.join(root, file)

            if file.endswith(".pdb"):
                try:
                    atoms = parse_pdb(path)
                    clash_score = calculate_clash_score(atoms)
                    phi_psi_angles = calculate_phi_psi_angles(atoms)
                    ramachandran_outliers = calculate_ramachandran_outliers(phi_psi_angles)
                except (OSError, ValueError, ZeroDivisionError) as exc:
                    # Unreadable file, malformed coordinates, or a structure
                    # too small to yield any metric.
                    print(f"Could not calculate structure metrics for {file}: {exc}")
                    continue

                results.append({
                    "Filename": file,
                    "Clash Score": clash_score,
                    "Ramachandran Outliers (%)": ramachandran_outliers,
                    "PAE Summary": None  # Placeholder for PAE value
                })

            elif file.endswith(".json"):
                # JSON files without usable PAE data (missing key, unexpected
                # layout, invalid JSON, unreadable file) are skipped.
                try:
                    pae_summary = summarize_pae(path)
                except (KeyError, TypeError, ValueError, OSError):
                    print(f"Could not calculate PAE for {file}")
                    continue

                results.append({
                    "Filename": file,
                    "Clash Score": None,
                    "Ramachandran Outliers (%)": None,
                    "PAE Summary": pae_summary
                })

    # Convert results to DataFrame
    df = pd.DataFrame(results, columns=columns)
    return df


In [16]:
# Set the path to your folder containing PDB and JSON files
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs elsewhere after pointing it at a local copy of the data.
input_folder = "/Users/adrianahernandezgonzalez/LabNotebook/11-24/states/partialAlphaCaV12HS8HLPlocalrun_b3702_32_64_10/pdb"
# Process files and create DataFrame
df = process_files(input_folder)

# Save DataFrame to JSON file
output_json_file = "model_quality_metrics.json"
df.to_json(output_json_file, orient="records", indent=4)
print(f"Results saved to {output_json_file}")

# Save DataFrame to CSV file
output_csv_file = "model_quality_metrics.csv"
df.to_csv(output_csv_file, index=False)
print(f"Results saved to {output_csv_file}")

# Display the DataFrame. Jupyter renders only the last expression of a cell,
# so the preview has to come after the save steps to be shown at all.
df.head()


Results saved to model_quality_metrics.json
Results saved to model_quality_metrics.csv


In [None]:
##########