# In [None]:  (Jupyter cell marker left over from the notebook export)
# ============================================================================
# MATTERSIM RELAXED STRUCTURE CONSOLIDATION SCRIPT
# ============================================================================
# This script consolidates MatterSim-relaxed crystal structures from multiple
# directories into a single master directory with sequential numbering.
#
# Project: MS508 Group Project
# Chemical System: Ba-Ti-O (Barium-Titanium-Oxygen)
# Purpose: Merge relaxed (geometry-optimized) structures from different runs
#
# WORKFLOW CONTEXT:
# 1. MatterGen generates initial crystal structures
# 2. MatterSim performs geometry relaxation (energy minimization)
# 3. This script consolidates the relaxed structures for analysis
#
# NOTE: This script was run LOCALLY on Windows (not in Colab)
#       File paths are specific to local machine and will need to be
#       updated for other users.
# ============================================================================

from pathlib import Path

# ----------------------------------------------------------------------------
# SECTION 1: Define Source and Destination Directories
# ----------------------------------------------------------------------------
# Source directories contain MatterSim-relaxed structures from different runs
# These represent geometry-optimized (lower energy) versions of the original
# MatterGen-generated structures
#
# NOTE: Update these paths to match your local file structure

# Input directories: one folder of MatterSim-relaxed CIFs per relaxation run
# (first set, then second set).
data1_relaxed, data2_relaxed = map(
    Path,
    (
        r"C:\Users\colet\Desktop\MatterGen Master File\First_set\relaxed_structures",
        r"C:\Users\colet\Desktop\MatterGen Master File\Second_Set\Relaxed_Structures_Second_Set",
    ),
)

# Output directory that receives every consolidated relaxed structure.
# It is created up front (including missing parents); an already existing
# directory is reused as-is.
dest = Path(r"C:\Users\colet\Desktop\MatterGen Master File\Master_File_Combined_Relaxed_Geometries")
dest.mkdir(exist_ok=True, parents=True)

# ----------------------------------------------------------------------------
# SECTION 2: Define Consolidation Function
# ----------------------------------------------------------------------------
def copy_and_renumber(src_dir, start_idx, dest_dir=None):
    """
    Copy relaxed CIF files from a source directory and renumber them sequentially.

    Every ``*.cif`` file directly inside ``src_dir`` is read, its data block
    header is rewritten to ``data_image<idx>``, and the result is written to
    the destination directory as ``struct_<idx + 1>.cif`` (zero-padded to
    four digits). Files are processed in sorted path order, which is
    lexicographic (e.g. ``a_10.cif`` sorts before ``a_2.cif``).

    Parameters:
    -----------
    src_dir : str or pathlib.Path
        Directory searched (non-recursively) for ``*.cif`` files.
    start_idx : int
        Index given to the first file. The ``data_image`` header uses this
        0-based value while the output filename uses ``idx + 1``.
    dest_dir : str or pathlib.Path, optional
        Directory the renumbered files are written to. When omitted, the
        module-level ``dest`` directory is used. The directory is created
        if it does not exist yet.

    Returns:
    --------
    int
        The next unused index, i.e. ``start_idx`` plus the number of CIF
        files written. Pass it as ``start_idx`` for the next source directory.
    """
    src_dir = Path(src_dir)
    out_dir = Path(dest if dest_dir is None else dest_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    idx = start_idx

    # Process all relaxed CIF files in sorted order
    for cif_path in sorted(src_dir.glob("*.cif")):
        # "utf-8-sig" drops a leading byte-order mark (otherwise the BOM would
        # hide the data_ header); errors="ignore" skips undecodable bytes.
        text = cif_path.read_text(encoding="utf-8-sig", errors="ignore")
        lines = text.splitlines()
        header = f"data_image{idx}"

        # Locate the existing data_ header. CIF writers may emit comment or
        # blank lines before it, and the keyword is case-insensitive, so look
        # at the first substantive line instead of only at line 0.
        header_pos = None
        for pos, line in enumerate(lines):
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            if stripped.lower().startswith("data_"):
                header_pos = pos
            break

        if header_pos is None:
            # No header before the first content line: add one at the top
            lines.insert(0, header)
        else:
            # Replace the existing header so the file keeps a single data block
            lines[header_pos] = header

        # Reconstruct file content
        new_text = "\n".join(lines) + "\n"

        # Write to destination with zero-padded filename
        # Format: struct_0001.cif, struct_0002.cif, etc.
        out_path = out_dir / f"struct_{idx + 1:04d}.cif"
        out_path.write_text(new_text, encoding="utf-8")

        idx += 1

    return idx

# ----------------------------------------------------------------------------
# SECTION 3: Process All Relaxed Structure Directories
# ----------------------------------------------------------------------------
# Horizontal rules reused throughout the console report.
heavy_rule = "=" * 70
light_rule = "-" * 70

print(heavy_rule)
print("MATTERSIM RELAXED STRUCTURE CONSOLIDATION")
print(heavy_rule)
print(f"Destination: {dest}")
print(light_rule)

# Running index shared by all source directories. Each call returns the
# first unused index, which becomes the starting index of the next call.
next_idx = 0

# Run 1: first set of relaxed structures
print(f"Processing first set of relaxed structures... Starting at index {next_idx}")
next_idx = copy_and_renumber(data1_relaxed, next_idx)

# Run 2: second set of relaxed structures, numbered after the first set
print(f"Processing second set of relaxed structures... Starting at index {next_idx}")
next_idx = copy_and_renumber(data2_relaxed, next_idx)

# ----------------------------------------------------------------------------
# SECTION 4: Report Results
# ----------------------------------------------------------------------------
# The counter started at 0, so its final value equals the number of files
# written across both runs.
print(light_rule)
print(f"✅ Combined relaxed CIFs written to: {dest}")
print(f"✅ Total relaxed structures consolidated: {next_idx}")
print(heavy_rule)

# ============================================================================
# NOTES:
# ============================================================================
#
# SCRIPT PURPOSE:
# This script consolidates MatterSim-relaxed crystal structures from multiple
# directories. Unlike the original structure consolidation script (which
# merged raw MatterGen outputs), this script specifically handles structures
# that have undergone geometry relaxation.