In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pathlib import Path
import os


        

# Lookup tables used to translate names found in the raw data.
# Each dict maps an original value (key) to its replacement (value).

# State names
state_mapping = {
    "Haryana": "Washington",
    "Karnataka": "California",
    "Madhya Pradesh": "Michigan",
    "Maharashtra": "Massachusetts",
    "Punjab": "New Jersey",
    "Rajasthan": "Florida",
    "Tamil Nadu": "Texas",
    "Uttar Pradesh": "New York",
    "West Bengal": "Illinois",
}

# City names
city_mapping = {
    "Faridabad": "Seattle",
    "Mysore": "San Francisco",
    "Bangalore": "Los Angeles",
    "Mangalore": "San Diego",
    "Gwalior": "Detroit",
    "Pune": "Boston",
    "Amritsar": "Jersey City",
    "Chandigarh": "Newark",
    "Jaipur": "Miami",
    "Madurai": "Houston",
    "Varanasi": "New York City",
    "Lucknow": "Buffalo",
    "Darjeeling": "Chicago",
}

# Country names
country_mapping = {"India": "United States"}

# Hospital names
hospital_mapping = {
    "AIIMS": "Mayo Clinic",
    "Kokilaben Hospital": "Cleveland Clinic",
    "Global Hospitals": "Johns Hopkins Hospital",
    "Narayana Health": "Massachusetts General Hospital",
    "Columbia Asia": "Mount Sinai Hospital",
    "Care Hospitals": "St. Jude Children's Research Hospital",
    "Manipal Hospital": "UCLA Medical Center",
    "Tata Memorial Hospital": "Memorial Sloan Kettering Cancer Center",
    "BLK Super Speciality Hospital": "Cedars-Sinai Medical Center",
    "Wockhardt Hospitals": "New York-Presbyterian Hospital",
    "Fortis Hospital": "Washington University in St. Louis Medical Center",
}

# Translate the location columns using the module-level lookup tables
def apply_mappings(df):
    """Replace values in the 'state', 'city' and 'country' columns.

    Each column is passed through ``Series.replace`` with its matching
    lookup table (``state_mapping``, ``city_mapping``, ``country_mapping``)
    and written back onto ``df``. The same DataFrame object is returned.
    """
    column_lookups = (
        ('state', state_mapping),
        ('city', city_mapping),
        ('country', country_mapping),
    )
    for column, lookup in column_lookups:
        df[column] = df[column].replace(lookup)
    return df

def apply_hospital_mapping(df):
    """Translate the 'affiliated_hospital' column via ``hospital_mapping``.

    The column is overwritten on ``df`` and the same DataFrame is returned.
    """
    original_names = df['affiliated_hospital']
    translated = original_names.map(hospital_mapping)
    # map() leaves NaN where a name has no entry in hospital_mapping;
    # fill those gaps with the original value so unknown names are kept.
    df['affiliated_hospital'] = translated.fillna(original_names)
    return df

# Function to convert treatment costs to USD
def convert_cost(df, exchange_rate=85):
    """Divide 'treatment_cost' by ``exchange_rate`` and round to 2 decimals.

    The column is overwritten on ``df`` and the same DataFrame is returned.
    ``exchange_rate`` defaults to 85 (presumably units of the source
    currency per USD -- confirm against the data source).
    """
    df['treatment_cost'] = (df['treatment_cost'] / exchange_rate).round(2)
    return df

# Function to load and process the file
def load_and_process_file(file_path):
    """Read one CSV file and run it through every transformation step.

    The file is loaded with ``pd.read_csv`` and then passed, in order,
    through ``apply_mappings``, ``apply_hospital_mapping`` and
    ``convert_cost`` (with its default exchange rate). The resulting
    DataFrame is returned.
    """
    return (
        pd.read_csv(file_path)
        .pipe(apply_mappings)
        .pipe(apply_hospital_mapping)
        .pipe(convert_cost)
    )
    
# Function to save the processed DataFrame to a CSV file


# Function to mark a file as processed
def mark_file_processed(file_path, metadata_file="processed_files.txt"):
    """Append the name of a processed file to the metadata log.

    Args:
        file_path: Path of the file that was processed, given as a ``Path``,
            a ``str`` or any other ``os.PathLike``. Only its final
            component (the file name) is recorded.
        metadata_file: Text file to append to, one file name per line. A
            relative path (such as the default) is resolved by ``open``
            against the current working directory.
    """
    # Path() accepts str / PathLike as well as Path objects, so a plain
    # string path no longer fails on the missing ``.name`` attribute.
    processed_name = Path(file_path).name
    with open(metadata_file, "a") as f:
        f.write(f"{processed_name}\n")

# Main ETL Function (can be triggered from the script)
def run_etl():
    """Process every not-yet-processed raw CSV and append it to the dataset.

    Directory layout (relative to the parent of this script's directory):
      * ``Healthcare_ETL_Project/raw_data/*.csv``  -- input files
      * ``Healthcare_ETL_Project/processed/Healthcare_Dataset.csv`` -- output
      * ``processed_files.txt`` -- names of files already processed

    Each new file is transformed with ``load_and_process_file``, appended to
    the output CSV (the header is written only when the output file is
    created), and then recorded in ``processed_files.txt`` so later runs
    skip it.
    """
    # __file__ is not defined in interactive sessions (e.g. a notebook cell);
    # fall back to the current working directory as the script directory.
    try:
        script_dir = Path(__file__).resolve().parent
    except NameError:
        script_dir = Path.cwd()

    base_dir = script_dir.parent
    raw_data_dir = base_dir / "Healthcare_ETL_Project" / "raw_data"
    processed_dir = base_dir / "Healthcare_ETL_Project" / "processed"
    processed_file = processed_dir / "Healthcare_Dataset.csv"

    # Load the names of already processed files from the metadata file (if any)
    processed_metadata_file = base_dir / "processed_files.txt"
    processed_files = set()
    if processed_metadata_file.exists():
        with open(processed_metadata_file, "r") as f:
            processed_files = set(f.read().splitlines())

    # glob() yields files in arbitrary order; sort so rows are appended in a
    # deterministic order from run to run.
    for file_path in sorted(raw_data_dir.glob("*.csv")):
        if file_path.name in processed_files:
            print(f"Skipping already processed file: {file_path.name}")
            continue

        print(f"Processing file: {file_path.name}")

        # Load and process the file
        df = load_and_process_file(file_path)

        # Ensure processed directory exists
        processed_dir.mkdir(parents=True, exist_ok=True)

        # Append mode creates the file when it is missing; the header is
        # written only in that case so it appears exactly once.
        write_header = not processed_file.exists()
        df.to_csv(processed_file, mode='a', header=write_header, index=False)

        # Record the file in the SAME metadata file that was read above.
        # Relying on mark_file_processed's CWD-relative default would write
        # to a different file whenever the working directory is not
        # base_dir, and the file would be re-processed on the next run.
        mark_file_processed(file_path, metadata_file=processed_metadata_file)

        print(f"File {file_path.name} processed and saved.")

if __name__ == "__main__":
    # Run the full ETL process only when this file is executed directly,
    # not when it is imported as a module.
    run_etl()
    print("ETL process completed.")
        




In [None]:
# # Run the ETL function
# #run_etl()

# if __name__ == "__main__":
#     # Hardcoded path to a sample raw CSV file
#     test_file_path = Path("/Users/avinashmacbookair/Documents/TREND Health Partners/Healthcare_ETL_Project/raw_data/healthcare_treatments_1.csv")
#     PROCESSED_DIR = Path("/Users/avinashmacbookair/Documents/TREND Health Partners/Healthcare_ETL_Project/processed")
#     PROCESSED_FILE = PROCESSED_DIR / "Healthcare_Dataset.csv"
#     # Load and process the file
#     df = load_and_process_file(test_file_path)

#     # Apply all transformations manually for testing
#     # df = apply_mappings(df)
#     # df = apply_hospital_mapping(df)
#     # df = convert_cost(df)
#     # If the processed CSV already exists, append data to it, otherwise create a new file
#     if PROCESSED_FILE.exists():
#         df.to_csv(PROCESSED_FILE, mode='a', header=False, index=False)
#     else:
#         df.to_csv(PROCESSED_FILE, index=False)
#         display(df.head())


#     # Show the resulting DataFrame (for test/debug)
#     #display(df.head())



Unnamed: 0,treatment_id,treatment_start_date,treatment_completion_date,treatment_outcome_status,treatment_outcome_date,treatment_duration,treatment_cost,treatment_type,provider_id,provider_name,speciality_id_x,speciality_name,affiliated_hospital,location_id,country,state,city,patient_id,patient_name,gender,age,disease_id,speciality_id_y,disease_name,disease_type,severity,transmission_mode,mortality_rate,added_at,modified_at
0,1,2024-01-01 00:00:00.000000000,2024-01-08 00:00:00.000000000,unsuccessful,2024-01-12 00:00:00.000000000,7,46489.71,pharmacological,1,Nandini Srivastava,8,Radiology,Mayo Clinic,1,United States,California,San Francisco,3,Kian Menon,Male,71,38,8,Pneumonia,Infectious,Moderate,Airborne,0.1,2024-01-12 00:00:00.000000000,2024-01-12 00:00:00.000000000
1,2,2024-01-01 00:00:45.051492930,2024-01-05 00:00:45.051492930,partially successful,2024-01-07 00:00:45.051492930,4,29150.13,surgical,1,Nandini Srivastava,8,Radiology,Mayo Clinic,1,United States,California,San Francisco,9,Kian Joshi,Male,50,36,8,Bone Fractures,Acute,Moderate,Indirect contact,0.01,2024-01-07 00:00:45.051492930,2024-01-07 00:00:45.051492930
2,3,2024-01-01 00:01:30.102985861,2024-01-04 00:01:30.102985861,deceased,2024-01-10 00:01:30.102985861,3,34554.65,pharmacological,1,Nandini Srivastava,8,Radiology,Mayo Clinic,1,United States,California,San Francisco,30,Namrata Kulkarni,Female,54,37,8,Tumors,Non-infectious,Severe,Indirect contact,0.2,2024-01-10 00:01:30.102985861,2024-01-10 00:01:30.102985861
3,4,2024-01-01 00:02:15.154478792,2024-01-08 00:02:15.154478792,worsened,2024-01-15 00:02:15.154478792,7,24743.59,surgical,1,Nandini Srivastava,8,Radiology,Mayo Clinic,1,United States,California,San Francisco,31,Rashmi Chopra,Female,78,40,8,Kidney Stones,Non-infectious,Moderate,Indirect contact,0.01,2024-01-15 00:02:15.154478792,2024-01-15 00:02:15.154478792
4,5,2024-01-01 00:03:00.205971722,2024-01-05 00:03:00.205971722,stable,2024-01-12 00:03:00.205971722,4,8095.13,preventive,1,Nandini Srivastava,8,Radiology,Mayo Clinic,1,United States,California,San Francisco,33,Avani Tripathi,Female,32,37,8,Tumors,Non-infectious,Severe,Indirect contact,0.2,2024-01-12 00:03:00.205971722,2024-01-12 00:03:00.205971722
