In [None]:
import pandas as pd
import os
from tqdm import tqdm

# Configuration
CSV_FOLDER = "path/to/your/csv/files"  # placeholder: directory containing the input CSVs
OUTPUT_FILE = "merged_antibiotic_resistance_data.csv"
# Key columns shared between tables; slices of this list are used as merge keys below.
LINKED_FEATURES = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

# Define all files with their merge keys and optimal dtypes.
#
# Each entry has:
#   name        - short label, also used as the column suffix on name clashes
#   path        - file name relative to CSV_FOLDER
#   merge_on    - key columns for the left-merge onto the base table (None = base table)
#   dtype       - column -> dtype mapping for pandas.read_csv
#   parse_dates - timestamp columns. These are kept OUT of `dtype` on purpose:
#                 pandas.read_csv raises TypeError for a datetime64 entry in `dtype`
#                 and asks for the column to be parsed as a date instead.
#
# NOTE(review): the int32 entries assume the coded IDs are numeric and fit in
# 32 bits -- confirm against the actual data.
FILES_CONFIG = [
    {
        "name": "cultures_cohort",
        "path": "microbiology_cultures_cohort.csv",
        "merge_on": None,  # Base table
        "dtype": {
            "anon_id": "int32",
            "pat_enc_csn_id_coded": "int32",
            "order_proc_id_coded": "int32",
            "ordering_mode": "category",
            "culture_description": "category",
            "was_positive": "boolean",
            "organism": "category",
            "antibiotic": "category",
            "susceptibility": "category"
        },
        "parse_dates": ["order_time_jittered_utc"]
    },
    {
        "name": "ward_info",
        "path": "microbiology_cultures_ward_info.csv",
        "merge_on": LINKED_FEATURES[:3],
        "dtype": {
            "anon_id": "int32",
            "pat_enc_csn_id_coded": "int32",
            "order_proc_id_coded": "int32",
            "hosp_ward_IP": "int8",
            "hosp_ward_OP": "int8",
            "hosp_ward_ER": "int8",
            "hosp_ward_ICU": "int8"
        },
        # Same timestamp column as the base table, so it is declared the same way
        # (it was previously declared as int32, which disagreed with the base table).
        "parse_dates": ["order_time_jittered_utc"]
    },
    {
        "name": "prior_med",
        "path": "microbiology_cultures_prior_med.csv",
        "merge_on": ['anon_id', 'order_time_jittered_utc'],
        "dtype": {
            "anon_id": "int32",
            "medication_name": "category",
            "medication_time_to_culturetime": "float32",
            "medication_category": "category"
        },
        # This table is merged on the timestamp, so it must be read the same
        # way as in the base table for the key dtypes to match.
        "parse_dates": ["order_time_jittered_utc"]
    },
    {
        "name": "microbial_resistance",
        "path": "microbiology_cultures_microbial_resistance.csv",
        "merge_on": ['anon_id', 'organism', 'antibiotic'],
        "dtype": {
            "anon_id": "int32",
            "organism": "category",
            "antibiotic": "category",
            "resistant_time_to_culturetime": "float32"
        },
        "parse_dates": []
    },
    {
        "name": "demographics",
        "path": "microbiology_cultures_demographics.csv",
        "merge_on": ['anon_id'],
        "dtype": {
            "anon_id": "int32",
            "age": "category",
            "gender": "int8"
        },
        "parse_dates": []
    }
    # Add configurations for remaining files following same pattern
]

def optimize_dataframe(df):
    """Shrink a DataFrame's memory footprint in place and return it.

    Columns of dtype ``object`` become ``category``; ``int64`` and
    ``float64`` columns are downcast to the smallest integer / float dtype
    that ``pd.to_numeric`` selects. Columns of any other dtype are left
    untouched. The same DataFrame object that was passed in is returned.
    """
    # Wide numeric dtype -> `downcast` argument understood by pd.to_numeric.
    downcast_for = {'int64': 'integer', 'float64': 'float'}

    for name in df.columns:
        series = df[name]
        current = series.dtype

        if current == 'object':
            df[name] = series.astype('category')
            continue

        for wide, target in downcast_for.items():
            if current == wide:
                df[name] = pd.to_numeric(series, downcast=target)
                break

    return df

def load_in_chunks(file_path, dtype, chunksize=100000):
    """Load a large CSV chunk by chunk and return one optimized DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        dtype: Value forwarded to ``pd.read_csv(dtype=...)``. When it is a
            dict, entries whose dtype starts with ``"datetime64"`` are
            removed from the mapping and passed as ``parse_dates`` instead,
            because ``read_csv`` raises TypeError for datetime dtypes.
        chunksize: Number of rows read per chunk.

    Returns:
        The concatenation of all chunks (index reset), passed through
        ``optimize_dataframe``.
    """
    read_dtype = dtype
    date_columns = []
    if isinstance(dtype, dict):
        date_columns = [
            column for column, kind in dtype.items()
            if str(kind).startswith("datetime64")
        ]
        read_dtype = {
            column: kind for column, kind in dtype.items()
            if column not in date_columns
        }

    reader = pd.read_csv(
        file_path,
        dtype=read_dtype,
        parse_dates=date_columns or False,
        chunksize=chunksize,
    )

    chunks = []
    # The context manager closes the underlying file handle even if a chunk
    # fails to parse halfway through the file.
    with reader:
        for chunk in tqdm(reader, desc=f"Loading {os.path.basename(file_path)}"):
            chunks.append(optimize_dataframe(chunk))

    combined = pd.concat(chunks, ignore_index=True)
    # Each chunk gets its own categories and its own downcast width, so the
    # concat can fall back to object / wider dtypes; optimize once more on the
    # combined frame so the result is actually compact.
    return optimize_dataframe(combined)

# Load, merge and save pipeline.
LARGE_FILE_MB = 500             # files above this size are read in chunks
MAX_ROWS_SINGLE_FILE = 1000000  # above this row count the output is split
OUTPUT_PARTS = 10               # number of output files when splitting


def _split_datetime_dtypes(dtype):
    """Return (dtype without datetime entries, list of datetime columns)."""
    if not isinstance(dtype, dict):
        return dtype, []
    plain = {}
    date_columns = []
    for column, kind in dtype.items():
        if str(kind).startswith("datetime64"):
            date_columns.append(column)
        else:
            plain[column] = kind
    return plain, date_columns


def _load_table(file_path, dtype, parse_dates=None, force_chunks=False):
    """Read one CSV (chunked when large or forced), parse dates, optimize."""
    # pd.read_csv raises TypeError for datetime64 entries in `dtype`, so those
    # columns are read as text first and converted afterwards.
    plain_dtype, date_columns = _split_datetime_dtypes(dtype)
    for column in parse_dates or []:
        if column not in date_columns:
            date_columns.append(column)

    use_chunks = force_chunks or (
        os.path.getsize(file_path) / (1024**2) > LARGE_FILE_MB
    )
    if use_chunks:
        table = load_in_chunks(file_path, plain_dtype)
    else:
        table = pd.read_csv(file_path, dtype=plain_dtype)

    for column in date_columns:
        if column in table.columns:
            table[column] = pd.to_datetime(table[column])
    return optimize_dataframe(table)


def _align_datetime_keys(left, right, keys):
    """Convert right-hand key columns to datetime where the left side is datetime."""
    is_datetime = pd.api.types.is_datetime64_any_dtype
    for key in keys:
        if key in left.columns and key in right.columns:
            # pandas refuses to merge a datetime key with an object/category key.
            if is_datetime(left[key]) and not is_datetime(right[key]):
                right[key] = pd.to_datetime(right[key])


# Load base table with full dtype specification
print("Loading base cultures table...")
base_df = _load_table(
    os.path.join(CSV_FOLDER, FILES_CONFIG[0]["path"]),
    FILES_CONFIG[0]["dtype"],
    parse_dates=FILES_CONFIG[0].get("parse_dates"),
    force_chunks=True,
)

# Sequential merging. A table that cannot be loaded or merged is skipped
# (best effort), but it is recorded so the gap is reported at the end.
failed_tables = []
for config in tqdm(FILES_CONFIG[1:], desc="Merging additional files"):
    file_path = os.path.join(CSV_FOLDER, config["path"])
    print(f"\nProcessing {config['name']}...")
    try:
        df_to_merge = _load_table(
            file_path,
            config.get("dtype"),
            parse_dates=config.get("parse_dates"),
        )
        _align_datetime_keys(base_df, df_to_merge, config["merge_on"])

        # No cardinality check by default: the left table can hold repeated
        # keys, so a fixed "one_to_many" check would reject the merge.
        # A config entry may opt in with its own "validate" value.
        base_df = pd.merge(
            base_df,
            df_to_merge,
            on=config["merge_on"],
            how="left",
            suffixes=('', f'_{config["name"]}'),
            validate=config.get("validate"),
        )

        # Clean up memory
        del df_to_merge

    except Exception as e:
        print(f"Error processing {config['path']}: {str(e)}")
        failed_tables.append(config["name"])

if failed_tables:
    print(
        f"\nWARNING: {len(failed_tables)} table(s) were NOT merged: "
        f"{', '.join(failed_tables)}"
    )

# Final memory optimization
print("\nPerforming final memory optimization...")
base_df = optimize_dataframe(base_df)

# Save the merged data
print("\nSaving merged dataset...")
if len(base_df) > MAX_ROWS_SINGLE_FILE:  # Chunk output if too large
    # Put the part index before the extension ("name_0.csv", not "name.csv_0.csv").
    stem, extension = os.path.splitext(OUTPUT_FILE)
    rows_per_part = -(-len(base_df) // OUTPUT_PARTS)  # ceiling division
    for i in range(OUTPUT_PARTS):
        part = base_df.iloc[i * rows_per_part:(i + 1) * rows_per_part]
        part.to_csv(f"{stem}_{i}{extension}", index=False)
else:
    base_df.to_csv(OUTPUT_FILE, index=False)

print(f"\nMerge complete. Final shape: {base_df.shape}")
print("Sample columns:", list(base_df.columns)[:10])