In [None]:
# ============================================================================
# MATTERGEN CIF FILE CONSOLIDATION SCRIPT
# ============================================================================
# This script consolidates multiple sets of CIF (Crystallographic Information
# File) structures generated by different group members into a single master
# directory with sequential numbering.
#
# Project: MS508 Group Project
# Chemical System: Ba-Ti-O (Barium-Titanium-Oxygen)
# Purpose: Merge individual generation runs into one unified dataset
#
# NOTE: This script was run LOCALLY on Windows (not in Colab)
#       File paths are specific to local machine and will need to be
#       updated for other users.
# ============================================================================

from pathlib import Path
import re
import csv
import os
import shutil

# ----------------------------------------------------------------------------
# SECTION 1: Define Source Directories
# ----------------------------------------------------------------------------
# Each data directory corresponds to a separate MatterGen generation run
# These were likely generated by different group members or separate runs
#
# NOTE: Update these paths to match your local file structure

# Source directories, one per MatterGen export folder. Raw strings (r"...")
# keep the Windows backslashes literal instead of treating them as escapes.
# The variable numbering (data1..data8) is the order in which the folders are
# merged below; it does not match the numeric suffix in the folder names
# (data1 -> "cif-2", ..., data7 -> "cif-8").
data1 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-2-20251119T152131Z-1-001\generated_crystals_cif-2")
data2 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-3-20251119T152133Z-1-001\generated_crystals_cif-3")
data3 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-4-20251119T152135Z-1-001\generated_crystals_cif-4")
data4 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-5-20251119T152138Z-1-001\generated_crystals_cif-5")
data5 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-6-20251119T152141Z-1-001\generated_crystals_cif-6")
data6 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-7-20251119T152144Z-1-001\generated_crystals_cif-7")
data7 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-8-20251119T152154Z-1-001\generated_crystals_cif-8")
# data8 is the folder without a numeric suffix; it is merged last.
data8 = Path(r"C:\Users\colet\Desktop\MatterGen Master File\generated_crystals_cif-20251119T152129Z-1-001\generated_crystals_cif")

# Define destination directory for consolidated files
dest = Path(r"C:\Users\colet\Desktop\MatterGen Master File\Master_File_Second_Set")
# Side effect at run time: creates the destination (and any missing parents).
# exist_ok=True makes re-running the script safe; files already present in
# the directory are left in place by this call.
dest.mkdir(parents=True, exist_ok=True)

# ----------------------------------------------------------------------------
# SECTION 2: Define Consolidation Function
# ----------------------------------------------------------------------------
def copy_and_renumber(src_dir, start_idx, dest_dir=None):
    """
    Copy CIF files from a source directory and renumber them sequentially.

    For every ``*.cif`` file in ``src_dir`` (processed in sorted filename
    order) this function:
    1. Reads the file as UTF-8 text
    2. Rewrites (or inserts) the ``data_`` block header as ``data_image{idx}``
    3. Writes the result to the destination as ``struct_XXXX.cif``

    Numbering note: the header index is 0-based while the filename index is
    1-based, i.e. the file ``struct_0001.cif`` contains ``data_image0``.

    Parameters:
    -----------
    src_dir : Path or str
        Source directory containing CIF files to process
    start_idx : int
        Starting index for renumbering (continues from previous batch)
    dest_dir : Path or str, optional
        Directory to write the renumbered files to. When omitted, the
        module-level ``dest`` directory is used, so existing two-argument
        calls behave as before.

    Returns:
    --------
    int
        Next available index after processing all files in this directory.
        Equals ``start_idx`` when the directory is missing or holds no CIFs.

    CIF File Format Notes:
    ----------------------
    - A CIF data block is introduced by a ``data_<name>`` line; the keyword is
      case-insensitive and may be preceded by blank lines or ``#`` comments
    - This script names every block ``data_image{idx}`` with a unique integer
    """
    src_dir = Path(src_dir)
    # Resolve the output directory lazily so the global ``dest`` is only
    # needed when the caller does not pass an explicit destination.
    out_dir = Path(dest) if dest_dir is None else Path(dest_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # A wrong path would otherwise be indistinguishable from an empty batch:
    # glob() on a missing directory simply yields nothing.
    if not src_dir.is_dir():
        print(f"  WARNING: source directory not found, skipping: {src_dir}")
        return start_idx

    # Sorted order makes the numbering reproducible between runs.
    cif_paths = sorted(src_dir.glob("*.cif"))
    if not cif_paths:
        print(f"  WARNING: no .cif files found in: {src_dir}")
        return start_idx

    idx = start_idx
    for cif_path in cif_paths:
        # errors="ignore" drops any bytes that are not valid UTF-8 instead of
        # raising, so a stray character cannot abort the whole batch.
        text = cif_path.read_text(encoding="utf-8", errors="ignore")
        lines = text.splitlines()

        header = f"data_image{idx}"
        pos = _first_content_line(lines)
        if pos is not None and lines[pos].strip().lower().startswith("data_"):
            # Replace the existing header in place, keeping any leading
            # comments/blank lines, so the file still has exactly one block.
            lines[pos] = header
        else:
            # No data_ header found before the first content line: add one
            # at the very top of the file.
            lines.insert(0, header)

        new_text = "\n".join(lines) + "\n"

        # Zero-padded, 1-based filename: struct_0001.cif, struct_0002.cif, ...
        out_path = out_dir / f"struct_{idx + 1:04d}.cif"
        out_path.write_text(new_text, encoding="utf-8")

        idx += 1

    return idx


def _first_content_line(lines):
    """Return the index of the first line that is neither blank nor a '#' comment, or None."""
    for pos, line in enumerate(lines):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            return pos
    return None

# ----------------------------------------------------------------------------
# SECTION 3: Process All Directories Sequentially
# ----------------------------------------------------------------------------
print("=" * 70)
print("MATTERGEN CIF CONSOLIDATION")
print("=" * 70)
print(f"Destination: {dest}")
print("-" * 70)

# Source directories in merge order. The order fixes the final numbering:
# each batch continues from the index at which the previous batch stopped.
sources = [
    ("data1", data1),
    ("data2", data2),
    ("data3", data3),
    ("data4", data4),
    ("data5", data5),
    ("data6", data6),
    ("data7", data7),
    ("data8", data8),
]

# Initialize counter
next_idx = 0

# Process each source directory in order
for label, src_dir in sources:
    print(f"Processing {label}... Starting at index {next_idx}")
    next_idx = copy_and_renumber(src_dir, next_idx)

# ----------------------------------------------------------------------------
# SECTION 4: Report Results
# ----------------------------------------------------------------------------
print("-" * 70)
if next_idx > 0:
    print(f"✅ Combined CIFs written to: {dest}")
    print(f"✅ Total structures consolidated: {next_idx}")
else:
    # Nothing was copied: almost always a wrong source path, so do not
    # report success.
    print(f"⚠️ No CIF files were found in any source directory; nothing written to: {dest}")
    print("⚠️ Total structures consolidated: 0 - check the source paths in Section 1")
print("=" * 70)

# ============================================================================
# NOTES:
# ============================================================================
#
# SCRIPT PURPOSE:
# This script merges CIF files from multiple MatterGen generation runs into
# a single directory with sequential numbering. This is necessary because:
# - Group members generated structures independently
# - Each run produced files with potentially overlapping numbering
# - Analysis tools require unique identifiers for each structure
#
# CIF FILE FORMAT:
# - CIF (Crystallographic Information File) is the standard format for
#   representing crystal structures
# - Each file contains:
#   * Lattice parameters (a, b, c, alpha, beta, gamma)
#   * Atomic positions and occupancies
#   * Space group information
#   * Chemical composition
# - The data_image{N} header provides a unique identifier for each structure
#
# CONSOLIDATION PROCESS:
# 1. Read each CIF file from source directories
# 2. Update the data_image header to ensure unique sequential numbering
#    (headers are 0-based: data_image0, data_image1, ...)
# 3. Save to master directory with standardized filenames (struct_0001.cif, etc.)
#    Filenames are 1-based, so struct_0001.cif contains data_image0
# 4. Maintain original structural data while only updating identifiers
#
# FILE NAMING CONVENTION:
# - struct_XXXX.cif where XXXX is zero-padded to 4 digits
# - Zero-padding ensures proper sorting in file browsers and analysis tools
# - Example: struct_0001.cif, struct_0002.cif, ..., struct_0800.cif
#
# DOWNSTREAM ANALYSIS:
# The consolidated files can be analyzed using:
# - VESTA (visualization and structure analysis)
# - pymatgen (Python-based materials analysis)
# - Materials Project database (structure comparison)
# - ASE (Atomic Simulation Environment)
#
# LOCAL EXECUTION NOTE:
# This script was executed on a local Windows machine, NOT in Google Colab.
# File paths are specific to the local filesystem and would need to be
# updated for other users or environments.
# ============================================================================

MATTERGEN CIF CONSOLIDATION
Destination: C:\Users\colet\Desktop\MatterGen Master File\Master_File_Second_Set
----------------------------------------------------------------------
Processing data1... Starting at index 0
Processing data2... Starting at index 0
Processing data3... Starting at index 0
Processing data4... Starting at index 0
Processing data5... Starting at index 0
Processing data6... Starting at index 0
Processing data7... Starting at index 0
Processing data8... Starting at index 0
----------------------------------------------------------------------
✅ Combined CIFs written to: C:\Users\colet\Desktop\MatterGen Master File\Master_File_Second_Set
✅ Total structures consolidated: 0
