In [None]:
"""Template for Jupyter notebooks created within this directory. Adds the project root (two levels above the working directory) to sys.path and enables autoreload."""

In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import os
import sys

# Walk two levels up from the kernel's working directory to reach the root directory.
cwd = os.getcwd()
parent_dir = os.path.split(cwd)[0]
root_dir = os.path.split(parent_dir)[0]

# Register the root directory on the import path, but only if it is not listed yet,
# so re-running this cell does not add duplicate entries.
if sys.path.count(root_dir) == 0:
    sys.path.append(root_dir)

print(f"Root directory: {root_dir} is added to sys.path")


Root directory: /Users/aag/Documents/proteinfolding is added to sys.path


In [3]:
## test imports

from proteinfolding import * ## should run without errors

##TODO: proper testing ##

In [4]:
from proteinfolding.data_processing import generate_exact_energies_data

/Users/aag/Documents/proteinfolding/notebooks/data_processing
┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2025 [Rosetta PyRosetta4.Release.python310.m1 2025.06+release.029c6a159b896477003a14f78f472d4cd2cead46 2025-02-04T15:14:13] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.Release.p

In [5]:
import numpy as np
import pandas as pd

# Columns that identify one problem instance in both tables.
key_cols = ["num_res", "num_rot"]

# Build both paths from root_dir so the cell does not depend on one user's home directory.
results_path = os.path.join(root_dir, "notebooks/local_testing/complexity_SA_break_gs.csv")
exact_path = os.path.join(root_dir, "data/processed/exact/exact_ground_state_and_energy.csv")

print("Reading from:", results_path)
df_results = pd.read_csv(results_path)
df_exact = pd.read_csv(exact_path)
print(df_results.shape)

# Reduce the reference table to one row per (num_res, num_rot). Without this, a repeated key
# in df_exact would make the left merge return more rows than df_results, and the boolean mask
# computed on the merged frame could no longer be applied to df_results row by row.
exact_lookup = df_exact[key_cols + ["gs_energy"]].drop_duplicates(subset=key_cols, keep="first")

# Merge on num_res and num_rot to get the reference ground state energy.
# validate="many_to_one" makes pandas raise if the right-hand keys are not unique.
df_merged = pd.merge(df_results, exact_lookup, on=key_cols, how="left", validate="many_to_one")
assert len(df_merged) == len(df_results), "merge must keep exactly one row per result row"

# Absolute tolerance for the comparison. np.isclose additionally applies its default
# relative tolerance (rtol=1e-5) on top of atol.
tol = 1e-6

# Identify mismatched rows. A row without a reference energy (NaN after the left merge)
# never compares as close, so it is reported and dropped as a mismatch as well.
mask_mismatch = ~np.isclose(
    df_merged["ground_state"].to_numpy(),
    df_merged["gs_energy"].to_numpy(),
    atol=tol,
)
df_mismatches = df_merged[mask_mismatch]

# Report mismatches
print("❌ Mismatches found:")
print(df_mismatches[["num_res", "num_rot", "ground_state", "gs_energy"]])

# Drop mismatches; the mask lines up with df_results because the merge is one row per result row.
df_cleaned = df_results[~mask_mismatch]

Reading from: /Users/aag/Documents/proteinfolding/notebooks/local_testing/complexity_SA_break_gs.csv
(121, 5)
❌ Mismatches found:
Empty DataFrame
Columns: [num_res, num_rot, ground_state, gs_energy]
Index: []


In [6]:
# Write the rows kept in df_cleaned to a CSV, without the index column.
# NOTE(review): this relative path resolves against the current working directory, whereas the
# input of the same name was read from notebooks/local_testing/ under root_dir — confirm that
# writing here (rather than back to the input location) is intended.
cleaned_csv_name = "complexity_SA_break_gs.csv"
df_cleaned.to_csv(cleaned_csv_name, index=False)

In [None]:
import time

# Problem sizes passed to generate_exact_energies_data
# (presumably residue counts and rotamer counts — confirm against the function's docs).
num_res_array = [6]
num_rot_array = [4]

# perf_counter() is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
df = generate_exact_energies_data(num_res_array, num_rot_array)
end_time = time.perf_counter()

print(f"Initial runtime: {end_time - start_time:.4f} seconds")
print(df.head())

In [None]:
import os
import shutil
from proteinfolding.paths import PYROSETTA_ENERGY_DATA_DIR, PYROSETTA_ENERGY_DATA_DIR_MUTABLE, PYROSETTA_ENERGY_DATA_ALL

def copy_missing_energy_files(source_dir=None, destination_dir=None):
    """
    Copy every file that is missing from the destination directory.

    Only entries directly inside ``source_dir`` are considered. An entry whose name
    already exists in ``destination_dir`` is left untouched, and entries that are not
    regular files (e.g. sub-directories) are skipped instead of aborting the whole run.

    Args:
        source_dir: Directory to copy from. Defaults to PYROSETTA_ENERGY_DATA_DIR.
        destination_dir: Directory to copy into; created if it does not exist.
            Defaults to PYROSETTA_ENERGY_DATA_DIR_MUTABLE.

    Returns:
        None. Progress is reported with one printed line per entry.
    """
    # Resolve the defaults at call time so the function can also be used on other folders.
    if source_dir is None:
        source_dir = PYROSETTA_ENERGY_DATA_DIR
    if destination_dir is None:
        destination_dir = PYROSETTA_ENERGY_DATA_DIR_MUTABLE

    # Ensure the destination directory exists
    os.makedirs(destination_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so the printed log is reproducible.
    for file_name in sorted(os.listdir(source_dir)):
        source_path = os.path.join(source_dir, file_name)
        destination_path = os.path.join(destination_dir, file_name)

        if os.path.exists(destination_path):
            print(f"⚠️ Already exists, skipping: {file_name}")
        elif not os.path.isfile(source_path):
            # shutil.copy only handles regular files; a sub-directory would raise here.
            print(f"⚠️ Not a regular file, skipping: {file_name}")
        else:
            shutil.copy(source_path, destination_path)
            print(f"✅ Copied: {file_name} -> {destination_dir}")

    print("✅ All missing files copied successfully.")

copy_missing_energy_files()


In [None]:
from proteinfolding.paths import EXACT_DATA_DIR, EXACT_DATA_ENERGY_BITSTRING_FILE, EXACT_DATA_ENERGY_BITSTRING_FILE_ALL
import pandas as pd

# `df` must already hold the table to append, produced by an earlier cell
# (presumably the generate_exact_energies_data run — verify before executing this cell).

# Create the output directory if needed; exist_ok=True avoids the check-then-create race.
os.makedirs(EXACT_DATA_DIR, exist_ok=True)

# Append to the gzip-compressed CSV and write the header row only when the file
# does not exist yet, so repeated runs do not interleave header lines with data.
exact_all_path = str(EXACT_DATA_ENERGY_BITSTRING_FILE_ALL)
write_header = not os.path.exists(exact_all_path)
df.to_csv(exact_all_path, index=False, compression="gzip", mode="a", header=write_header)


In [None]:
import pandas as pd
from proteinfolding.paths import EXACT_DATA_DIR,EXACT_DATA_ENERGY_BITSTRING_FILE_ALL, EXACT_DATA_ENERGY_BITSTRING_FILE


# Load the gzip-compressed CSV into `df` and print it.
# NOTE(review): this reads EXACT_DATA_ENERGY_BITSTRING_FILE, not the *_ALL file that other
# cells write to, and it rebinds `df` — confirm both are intended.
exact_energy_path = str(EXACT_DATA_ENERGY_BITSTRING_FILE)
df = pd.read_csv(exact_energy_path, compression="gzip")
print(df)

In [None]:
from proteinfolding.data_processing import find_min_energy_and_bitstring_from_exact_energy_dataframe

num_res = 6
num_rot = 4

# Rows of `df` that belong to this (num_res, num_rot) pair.
df_filtered = df[(df['num_res'] == num_res) & (df['num_rot'] == num_rot)]

if df_filtered.empty:
    # Nothing to look up: report it and actually skip the lookup below,
    # as the message promises.
    print(f"No exact energy found for num_rot={num_rot}, num_res={num_res}. Skipping.")
else:
    # Keep only the row with the smallest value in the 'energies' column for inspection.
    df_filtered = df_filtered.sort_values(by='energies').head(1)
    min_energy, min_energy_bitstring = find_min_energy_and_bitstring_from_exact_energy_dataframe(df, num_res, num_rot)
    print(min_energy)


In [None]:
# Value of 'energies' in the first row that matches num_res == 2 and num_rot == 3.
# NOTE(review): the variable name says "2x2" but the filter selects num_rot == 3 —
# confirm which of the two is intended.
is_2res_3rot = (df['num_res'] == 2) & (df['num_rot'] == 3)
ground_energy_2x2 = df.loc[is_2res_3rot, 'energies'].values[0]
print(ground_energy_2x2)

In [None]:
# Select every row whose (num_res, num_rot) pair occurs more than once;
# keep=False marks all copies, including the first occurrence.
pair_is_repeated = df.duplicated(subset=['num_res', 'num_rot'], keep=False)
duplicates = df[pair_is_repeated]
print(duplicates)

In [None]:
# Keep the first row of each (num_res, num_rot) pair and discard later repeats.
# NOTE(review): this rebinds `df_exact`, which an earlier cell uses for a different CSV —
# confirm the name reuse is intended.
is_later_repeat = df.duplicated(subset=['num_res', 'num_rot'], keep='first')
df_exact = df[~is_later_repeat]
print(df_exact)

In [None]:
# Replace the gzip-compressed CSV with the current contents of df_exact:
# mode="w" overwrites the existing file, and the header row is always written.
exact_all_path = str(EXACT_DATA_ENERGY_BITSTRING_FILE_ALL)
df_exact.to_csv(exact_all_path, index=False, compression="gzip", mode="w", header=True)