# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# === STEP 1: Load files ===

disease_file = "malaria_filtered.xlsx"
weather_file = "weather_filtered.xlsx"

# For test: Pick one district only
test_district = 'Pauri'  # Replace this with any district name you want to test

# Each workbook is read by sheet name; read_excel raises if the sheet is missing.
disease_df = pd.read_excel(disease_file, sheet_name=test_district)
weather_df = pd.read_excel(weather_file, sheet_name=test_district)

# === STEP 2: Merge disease & weather data ===

# pd.merge defaults to an inner join, so only (Year, Month) pairs present in
# both sheets survive the merge.
merged_df = pd.merge(disease_df, weather_df, on=['Year', 'Month'])

# === STEP 3: Check merged data ===

print("Merged Data Sample:")
print(merged_df.head())

# === STEP 4: Quick correlation heatmap ===

# Identify columns automatically: anything whose name contains 'Disease', or
# that exactly matches one of the known disease names.
known_diseases = ['Malaria','Dengue','TB','Asthma','Typhoid','Diarrhea','Hepatitis','Dehydration']
disease_cols = [col for col in merged_df.columns if 'Disease' in col or col in known_diseases]
weather_cols = ['Specific Humidity', 'Relative Humidity', 'T2M', 'T2M_MAX', 'T2M_MIN']

# Report weather columns the sheet does not provide instead of failing with a
# KeyError before anything is plotted.
available_cols = set(merged_df.columns)
missing_weather = [col for col in weather_cols if col not in available_cols]
if missing_weather:
    print(f"Warning: weather columns not found and skipped: {missing_weather}")

# Compute correlation matrix.
# Only numeric columns are passed to corr(): a text-valued column picked up by
# the 'Disease' name match would otherwise make corr() raise on recent pandas.
corr_cols = disease_cols + [col for col in weather_cols if col in available_cols]
corr = merged_df[corr_cols].select_dtypes(include='number').corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title(f"Correlation Heatmap: {test_district}")
plt.show()


# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your files
disease_file = "malaria_filtered.xlsx"
weather_file = "weather_filtered.xlsx"

# Initialize list to store merged data for all districts
merged_data = []

# Open each workbook exactly once and close it when done. Reading every sheet
# from the already-open handle avoids reopening the file for each district.
with pd.ExcelFile(disease_file) as disease_book, pd.ExcelFile(weather_file) as weather_book:
    # Get all district sheet names
    disease_sheets = disease_book.sheet_names
    weather_sheets = weather_book.sheet_names
    weather_sheet_lookup = set(weather_sheets)  # O(1) membership test in the loop

    # Loop over districts
    for district in disease_sheets:
        if district not in weather_sheet_lookup:
            print(f"Skipping {district} as it's not found in weather file")
            continue

        disease_df = pd.read_excel(disease_book, sheet_name=district)
        weather_df = pd.read_excel(weather_book, sheet_name=district)

        # Merge on Year and Month (inner join: only periods present in both sheets)
        merged_df = pd.merge(disease_df, weather_df, on=['Year', 'Month'])
        merged_df['District'] = district  # keep track of district
        merged_data.append(merged_df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual problem instead.
if not merged_data:
    raise ValueError(
        f"No district sheet is present in both '{disease_file}' and "
        f"'{weather_file}'; nothing to combine"
    )

# Combine all districts into one dataframe
full_df = pd.concat(merged_data, ignore_index=True)

# Inspect combined data
print("Combined dataset shape:", full_df.shape)
print(full_df.head())

# Identify columns: anything whose name contains 'Disease', or that exactly
# matches one of the known disease names.
known_diseases = ['Malaria','Dengue','TB','Asthma','Typhoid','Diarrhea','Hepatitis','Dehydration']
disease_cols = [col for col in full_df.columns if 'Disease' in col or col in known_diseases]
weather_cols = ['Specific Humidity', 'Relative Humidity', 'T2M', 'T2M_MAX', 'T2M_MIN']

# Report weather columns that no sheet provided instead of failing with a
# KeyError before anything is plotted or saved.
available_cols = set(full_df.columns)
missing_weather = [col for col in weather_cols if col not in available_cols]
if missing_weather:
    print(f"Warning: weather columns not found and skipped: {missing_weather}")

# Compute full correlation matrix.
# Only numeric columns are passed to corr(): a text-valued column picked up by
# the 'Disease' name match would otherwise make corr() raise on recent pandas.
corr_cols = disease_cols + [col for col in weather_cols if col in available_cols]
corr = full_df[corr_cols].select_dtypes(include='number').corr()

# Plot heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Overall Correlation Heatmap (All Districts Combined)")
plt.show()

# Optional: save merged data for later modeling
full_df.to_csv("merged_disease_weather_data.csv", index=False)
print("Merged data saved to 'merged_disease_weather_data.csv'")
