# Data Preparation: JJM and Health Data Merge

This notebook prepares the final panel dataset by:
1. Loading cleaned JJM data and Health data
2. Standardizing district names using fuzzy matching
3. Merging datasets on District_Name and Date
4. Saving the final panel dataset


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import difflib
import re
from typing import Dict, List, Tuple

# Resolve the project root once: the parent directory when the notebook runs
# from inside `notebooks/`, otherwise the current working directory.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()

# Put the project root at the front of sys.path so `config` can be imported.
# The guard keeps re-running this cell from stacking duplicate entries.
if sys.path[:1] != [str(PROJECT_ROOT)]:
    sys.path.insert(0, str(PROJECT_ROOT))

try:
    from config import FILE_PATHS
    print("✓ Config imported successfully")
except ImportError:
    # Fallback paths if config not available. They are anchored at the project
    # root (not the cwd) so they point at the same data/ directory whether the
    # notebook is launched from the repository root or from notebooks/.
    FILE_PATHS = {
        "data": {
            "raw": str(PROJECT_ROOT / "data" / "raw"),
            "processed": str(PROJECT_ROOT / "data" / "processed")
        }
    }
    print("⚠ Using fallback paths")

print("✓ Libraries imported successfully")


## Step 1: Load Data

Load the cleaned JJM data and Health data CSV files.


In [None]:
# Read the cleaned JJM extract from the processed-data directory.
# If the CSV is absent, a three-row sample frame is built instead so the
# remaining cells can still be executed for demonstration.
jjm_file = Path(FILE_PATHS["data"]["processed"]) / "jjm_cleaned.csv"
print(f"Loading JJM data from: {jjm_file}")

try:
    jjm_df = pd.read_csv(jjm_file)
    print(f"✓ JJM data loaded: {len(jjm_df)} rows, {len(jjm_df.columns)} columns")
    print(f"  Columns: {jjm_df.columns.tolist()}")
    print("\nFirst few rows:")
    display(jjm_df.head())
except FileNotFoundError:
    print(f"⚠ Error: File not found at {jjm_file}")
    print("Creating sample JJM data for demonstration...")
    # Column -> values layout; key order fixes the column order of the frame
    jjm_sample_columns = {
        'district_code': ['D001', 'D002', 'D003'],
        'district_name': ['Kalaburagi', 'Mumbai', 'Delhi'],
        'date': ['2024-01-01', '2024-01-01', '2024-01-01'],
        'fhtc_coverage': [75.5, 85.2, 90.1],
    }
    jjm_df = pd.DataFrame(jjm_sample_columns)
    print(f"✓ Sample JJM data created: {jjm_df.shape}")
    display(jjm_df.head())


In [None]:
# Read the Health extract from the raw-data directory.
# If the CSV is absent, a three-row sample frame is built instead so the
# remaining cells can still be executed for demonstration.
health_file = Path(FILE_PATHS["data"]["raw"]) / "health_data.csv"
print(f"Loading Health data from: {health_file}")

try:
    health_df = pd.read_csv(health_file)
    print(f"✓ Health data loaded: {len(health_df)} rows, {len(health_df.columns)} columns")
    print(f"  Columns: {health_df.columns.tolist()}")
    print("\nFirst few rows:")
    display(health_df.head())
except FileNotFoundError:
    print(f"⚠ Error: File not found at {health_file}")
    print("Creating sample Health data for demonstration...")
    # Column -> values layout; key order fixes the column order of the frame
    health_sample_columns = {
        'District_Name': ['Gulbarga', 'Mumbai City', 'New Delhi'],
        'Date': ['2024-01-01', '2024-01-01', '2024-01-01'],
        'disease_cases': [120, 350, 280],
        'mortality_rate': [2.5, 1.8, 1.2],
    }
    health_df = pd.DataFrame(health_sample_columns)
    print(f"✓ Sample Health data created: {health_df.shape}")
    display(health_df.head())


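## Steps 2–3: Fuzzy Matching Functions and District Name Standardization

Define helper functions that standardize district names using character-similarity (fuzzy) matching, then apply them to both datasets. This handles spelling and suffix/prefix variations such as:
- 'Mumbai' vs 'Mumbai City'
- 'Delhi' vs 'New Delhi'

**Note:** Officially renamed districts such as 'Kalaburagi' vs 'Gulbarga' share almost no contiguous characters, so similarity-based matching will not link them. Such pairs need an explicit alias mapping applied before or after the fuzzy step.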


In [None]:
def normalize_string(text: str) -> str:
    """
    Normalize a string for comparison.

    The value is lowercased, every character other than a-z, 0-9 and
    whitespace is deleted, and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace trimmed.

    Args:
        text: Input string. Missing values (None / NaN) are accepted.

    Returns:
        Normalized string; "" when the input is missing.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # Strip punctuation BEFORE collapsing whitespace: deleting a symbol that
    # sits between two spaces ("mumbai - city") would otherwise leave a double
    # space, and a leading symbol ("- delhi") would leave a leading space.
    without_symbols = re.sub(r'[^a-z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_symbols).strip()


def find_best_match(
    target: str,
    candidates: List[str],
    threshold: float = 0.6,
    use_normalization: bool = True
) -> Tuple[str, float]:
    """
    Pick the candidate most similar to ``target``.

    Similarity is the ``difflib.SequenceMatcher`` ratio between the prepared
    forms of the two strings. When several candidates tie for the highest
    score, the one appearing first in ``candidates`` is kept.

    Args:
        target: The string to match
        candidates: List of candidate strings
        threshold: Minimum similarity ratio (0-1) to consider a match
        use_normalization: If True, compare ``normalize_string`` output;
            otherwise compare plain lowercased strings

    Returns:
        Tuple of (best_match, similarity_score). ``target`` itself is returned
        in place of a candidate when the best score is below ``threshold``,
        when no candidate scores above zero, or (with score 0.0) when
        ``candidates`` is empty or ``target`` is missing.
    """
    if not candidates or pd.isna(target):
        return (target, 0.0)

    def _prepare(value) -> str:
        # Single place that decides how strings are canonicalized for scoring
        return normalize_string(value) if use_normalization else str(value).lower()

    needle = _prepare(target)

    winner, top_score = target, 0.0
    for option in candidates:
        ratio = difflib.SequenceMatcher(None, needle, _prepare(option)).ratio()
        # Strict comparison: an equal score never displaces an earlier winner
        if ratio > top_score:
            winner, top_score = option, ratio

    # Below the threshold the caller gets the original value back, with the score
    return (target if top_score < threshold else winner, top_score)


def standardize_district_names(
    df: pd.DataFrame,
    district_col: str,
    reference_list: List[str] = None,
    threshold: float = 0.6,
    create_mapping: bool = True
) -> Tuple[pd.DataFrame, Dict[str, str]]:
    """
    Standardize district names in a dataframe using fuzzy matching.

    Each distinct non-null value of ``district_col`` is compared against
    ``reference_list`` with ``find_best_match``. Values whose best score
    reaches ``threshold`` are replaced by the matched reference name; all
    other values are left untouched. The input frame is not modified.

    Note that several distinct source names may map to the same reference
    name, which can create duplicate (district, date) keys downstream.

    Args:
        df: Input dataframe
        district_col: Name of the district column
        reference_list: List of standardized district names to match against.
                       If None, uses the non-null unique values from the
                       column itself.
        threshold: Minimum similarity ratio for matching
        create_mapping: Whether to return the mapping dictionary

    Returns:
        Tuple of (dataframe with standardized names, mapping dictionary) when
        ``create_mapping`` is True; otherwise only the dataframe.
    """
    df = df.copy()

    # Distinct labels actually present; missing values are never remapped
    unique_districts = df[district_col].dropna().unique().tolist()

    # Without an explicit reference, the column's own labels act as the
    # reference. NaN is excluded so it can never be selected as a "match".
    if reference_list is None:
        reference_list = list(unique_districts)

    # Key the mapping by the value exactly as it appears in the column.
    # Series.map looks up raw values, so stringified keys would silently miss
    # any non-string label (e.g. numeric codes read from CSV).
    mapping = {}
    for district in unique_districts:
        best_match, score = find_best_match(district, reference_list, threshold=threshold)
        mapping[district] = best_match if score >= threshold else district

    # Apply mapping; rows with no mapping entry (NaN) keep their original value
    df[district_col] = df[district_col].map(mapping).fillna(df[district_col])

    if create_mapping:
        return df, mapping
    else:
        return df


# Emit a confirmation so running this definitions-only cell gives visible feedback
print("✓ Fuzzy matching functions defined")


In [None]:
# Identify district name columns
# Well-known district-name column names, in priority order
district_col_variations = ['district_name', 'District_Name', 'district', 'District',
                          'dist_name', 'Dist_Name', 'name', 'Name']

# Suffixes that mark a column as an identifier rather than a name
_DISTRICT_ID_SUFFIXES = ('code', '_id', '_no', '_num')


def _detect_district_column(columns, preferred=None):
    """
    Return the column most likely to hold district names, or None.

    Resolution order:
      1. An exact match against ``preferred`` (priority order of that list).
      2. The first column containing 'district' that does not look like an
         identifier (e.g. 'district_code', 'district_id').
      3. The first column containing 'district' at all, as a last resort.

    Scanning columns left to right and accepting any name containing
    'district' would select 'district_code' ahead of 'district_name' whenever
    the code column comes first, which breaks name matching and the merge.
    """
    columns = list(columns)
    preferred = district_col_variations if preferred is None else preferred

    for name in preferred:
        if name in columns:
            return name

    keyword_cols = [col for col in columns if 'district' in str(col).lower()]
    for col in keyword_cols:
        if not str(col).lower().endswith(_DISTRICT_ID_SUFFIXES):
            return col

    return keyword_cols[0] if keyword_cols else None


jjm_district_col = _detect_district_column(jjm_df.columns)
health_district_col = _detect_district_column(health_df.columns)

print(f"JJM district column: {jjm_district_col}")
print(f"Health district column: {health_district_col}")

if jjm_district_col is None or health_district_col is None:
    print("⚠ Warning: Could not auto-detect district columns. Using defaults.")
    if jjm_district_col is None:
        jjm_district_col = 'district_name'
        if jjm_district_col not in jjm_df.columns:
            jjm_df[jjm_district_col] = jjm_df.iloc[:, 0]  # Use first column as fallback

    if health_district_col is None:
        health_district_col = 'District_Name'
        if health_district_col not in health_df.columns:
            health_df[health_district_col] = health_df.iloc[:, 0]  # Use first column as fallback


In [None]:
# Distinct, non-null district labels present in each source, in order of
# first appearance
jjm_districts = jjm_df[jjm_district_col].dropna().unique().tolist()
health_districts = health_df[health_district_col].dropna().unique().tolist()

# Report how many distinct labels each source contributes
for source_label, source_names in (("JJM", jjm_districts), ("Health", health_districts)):
    print(f"Unique districts in {source_label} data: {len(source_names)}")

# Preview only the first 10 labels of each list to keep the output short
print(f"\nJJM districts: {jjm_districts[:10]}")
print(f"Health districts: {health_districts[:10]}")


In [None]:
# Combined list of every distinct label seen in either dataset (reported for
# information only; JJM labels are the reference used for matching below)
reference_districts = list(set(jjm_districts + health_districts))
print(f"Total unique districts across both datasets: {len(reference_districts)}")


def _print_mapping_examples(mapping, limit=5):
    """Print up to ``limit`` entries of ``mapping`` whose name actually changed."""
    # Filter first, then truncate: slicing the first few items before
    # filtering would usually show nothing, because most names map to themselves.
    changed = [(orig, std) for orig, std in mapping.items() if orig != std][:limit]
    for orig, std in changed:
        print(f"  {orig} -> {std}")
    if not changed:
        print("  (no names were changed)")


# Standardize district names in both datasets
# Use JJM as the reference standard
print("\nStandardizing JJM district names...")
jjm_df_std, jjm_mapping = standardize_district_names(
    jjm_df,
    district_col=jjm_district_col,
    reference_list=jjm_districts,  # Use JJM as reference
    threshold=0.6
)

print("\nStandardizing Health district names...")
health_df_std, health_mapping = standardize_district_names(
    health_df,
    district_col=health_district_col,
    reference_list=jjm_districts,  # Match to JJM districts
    threshold=0.6
)

print(f"\n✓ District name standardization complete")
print(f"\nJJM mapping examples:")
_print_mapping_examples(jjm_mapping)

print(f"\nHealth mapping examples:")
_print_mapping_examples(health_mapping)


In [None]:
# Spot-check the result: first 10 distinct labels left in each frame
jjm_std_preview = jjm_df_std[jjm_district_col].unique()[:10]
health_std_preview = health_df_std[health_district_col].unique()[:10]

print("Standardized JJM districts:", jjm_std_preview, sep="\n")
print("\nStandardized Health districts:", health_std_preview, sep="\n")


## Step 4: Prepare Date Columns

Ensure date columns are in the same format for merging.


In [None]:
# Known names for the period/date column
date_col_variations = ['date', 'Date', 'period', 'Period', 'month', 'Month',
                      'year', 'Year', 'reporting_date', 'Reporting_Date']


def _first_date_like_column(columns):
    """Return the first column that is a known date name or contains 'date'; None if there is none."""
    return next(
        (col for col in columns if col in date_col_variations or 'date' in col.lower()),
        None,
    )


# Locate the date column of each standardized frame (left-to-right scan)
jjm_date_col = _first_date_like_column(jjm_df_std.columns)
health_date_col = _first_date_like_column(health_df_std.columns)

print(f"JJM date column: {jjm_date_col}")
print(f"Health date column: {health_date_col}")

# Parse to datetime so both sides share one dtype for the merge.
# errors='coerce' turns unparseable entries into NaT instead of raising.
# NOTE(review): a purely numeric 'year'/'month' column would be read by
# pd.to_datetime as an epoch offset rather than a calendar value — confirm the
# detected column holds date strings.
if jjm_date_col:
    jjm_df_std[jjm_date_col] = pd.to_datetime(jjm_df_std[jjm_date_col], errors='coerce')
    print("✓ JJM date column converted to datetime")

if health_date_col:
    health_df_std[health_date_col] = pd.to_datetime(health_df_std[health_date_col], errors='coerce')
    print("✓ Health date column converted to datetime")


## Step 5: Merge Datasets

Merge the standardized datasets on District_Name and Date.


In [None]:
# Canonical key names shared by both frames so the merge can join on them
merge_district_col = 'District_Name'
merge_date_col = 'Date'

# One entry per potential rename: (dataset, key kind, current name, target
# name, eligible). District keys are always eligible; date keys only when a
# date column was detected.
frames_by_label = {"JJM": jjm_df_std, "Health": health_df_std}
rename_plan = [
    ("JJM", "district", jjm_district_col, merge_district_col, True),
    ("Health", "district", health_district_col, merge_district_col, True),
    ("JJM", "date", jjm_date_col, merge_date_col, bool(jjm_date_col)),
    ("Health", "date", health_date_col, merge_date_col, bool(health_date_col)),
]

for label, key_kind, current_name, target_name, eligible in rename_plan:
    # Skip keys that already carry the canonical name
    if eligible and current_name != target_name:
        frames_by_label[label] = frames_by_label[label].rename(columns={current_name: target_name})
        print(f"✓ Renamed {label} {key_kind} column to '{target_name}'")

jjm_df_std = frames_by_label["JJM"]
health_df_std = frames_by_label["Health"]


In [None]:
# Both key columns must be populated for a row to take part in the merge
merge_key_cols = [merge_district_col, merge_date_col]

# Report how many rows per dataset lack each key
print("Missing values in merge keys:")
for dataset_label, dataset_frame in (("JJM", jjm_df_std), ("Health", health_df_std)):
    print(f"{dataset_label} - District_Name: {dataset_frame[merge_district_col].isna().sum()}")
    print(f"{dataset_label} - Date: {dataset_frame[merge_date_col].isna().sum()}")

# Discard rows where either key is missing
jjm_df_std = jjm_df_std.dropna(subset=merge_key_cols)
health_df_std = health_df_std.dropna(subset=merge_key_cols)

print("\nAfter removing missing values:")
print(f"JJM rows: {len(jjm_df_std)}")
print(f"Health rows: {len(health_df_std)}")


In [None]:
# Perform the merge
print("Merging datasets...")
print(f"JJM shape: {jjm_df_std.shape}")
print(f"Health shape: {health_df_std.shape}")

# Warn about repeated (district, date) keys before merging. Fuzzy matching can
# map several source names onto one standardized name; repeated keys on either
# side make the join produce one output row per matching pair, inflating the
# panel without any error.
for side_label, side_frame in (("JJM", jjm_df_std), ("Health", health_df_std)):
    repeated_keys = int(side_frame.duplicated(subset=[merge_district_col, merge_date_col]).sum())
    if repeated_keys:
        print(f"⚠ {side_label} has {repeated_keys} rows repeating a "
              f"({merge_district_col}, {merge_date_col}) key; merged rows will be duplicated")

# Merge on District_Name and Date
merged_df = pd.merge(
    jjm_df_std,
    health_df_std,
    on=[merge_district_col, merge_date_col],
    how='outer',  # Use outer join to keep all records
    suffixes=('_jjm', '_health'),  # Disambiguate non-key columns present in both frames
    indicator=True  # Adds '_merge' column: both / left_only / right_only
)

print(f"\n✓ Merge complete!")
print(f"Merged dataset shape: {merged_df.shape}")
print(f"\nMerge statistics:")
print(merged_df['_merge'].value_counts())

# Display first few rows
print("\nFirst few rows of merged dataset:")
display(merged_df.head(10))


In [None]:
# Split the merged frame by the '_merge' indicator once and reuse the pieces
merge_source = merged_df['_merge']
matched_rows = merged_df[merge_source == 'both']
jjm_only_rows = merged_df[merge_source == 'left_only']
health_only_rows = merged_df[merge_source == 'right_only']

print("Merge Quality Check:")
print(f"Total merged rows: {len(merged_df)}")
print(f"Rows with both datasets: {len(matched_rows)}")
print(f"Rows only in JJM: {len(jjm_only_rows)}")
print(f"Rows only in Health: {len(health_only_rows)}")

# List (up to 10) districts that found no partner on the other side
if len(jjm_only_rows) > 0:
    unmatched_jjm = jjm_only_rows[merge_district_col].unique()
    print(f"\nDistricts in JJM but not in Health: {len(unmatched_jjm)}")
    print(unmatched_jjm[:10])

if len(health_only_rows) > 0:
    unmatched_health = health_only_rows[merge_district_col].unique()
    print(f"\nDistricts in Health but not in JJM: {len(unmatched_health)}")
    print(unmatched_health[:10])


## Step 6: Save Final Panel Dataset

Save the merged panel dataset to the processed data directory.


In [None]:
# Remove the merge indicator column before saving. errors='ignore' makes this
# a no-op when '_merge' is absent; either way a new frame is returned and
# merged_df is left untouched.
merged_df_final = merged_df.drop(columns=['_merge'], errors='ignore')

# Save to processed data directory (created if it does not exist yet)
output_file = Path(FILE_PATHS["data"]["processed"]) / "final_panel.csv"
output_file.parent.mkdir(parents=True, exist_ok=True)

merged_df_final.to_csv(output_file, index=False, encoding='utf-8')

print(f"✓ Final panel dataset saved to: {output_file}")
print(f"  Shape: {merged_df_final.shape}")
print(f"  Columns: {list(merged_df_final.columns)}")
print(f"\nDataset summary:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
merged_df_final.info()


In [None]:
# Descriptive statistics of the saved panel
panel_stats = merged_df_final.describe()
print("\nSummary Statistics:")
print(panel_stats)

# Closing report: output location plus basic size and coverage figures
panel_dates = merged_df_final[merge_date_col]
closing_report = [
    "\n✓ Data preparation complete!",
    "\nFinal panel dataset ready for analysis:",
    f"  - File: {output_file}",
    f"  - Rows: {len(merged_df_final):,}",
    f"  - Columns: {len(merged_df_final.columns)}",
    f"  - Districts: {merged_df_final[merge_district_col].nunique()}",
    f"  - Date range: {panel_dates.min()} to {panel_dates.max()}",
]
print("\n".join(closing_report))
