In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# === STEP 1: Load files ===

# Single-district trial run: the same sheet name is read from both workbooks.
test_district = 'Pauri'  # Replace this with any district name you want to test

disease_file = "malaria_filtered.xlsx"
weather_file = "weather_filtered.xlsx"

disease_df = pd.read_excel(disease_file, sheet_name=test_district)
weather_df = pd.read_excel(weather_file, sheet_name=test_district)

# Alias the T2M column as 'Temperature'; Temperature Range is intentionally
# not used in the analysis below.
weather_df['Temperature'] = weather_df['T2M']

# === STEP 2: Merge disease & weather data ===

# Default (inner) merge: only Year/Month combinations present in both sheets
# are kept.
merged_df = disease_df.merge(weather_df, on=['Year', 'Month'])

# === STEP 3: Check merged data ===

print("Merged Data Sample:")
print(merged_df.head())

# === STEP 4: Quick correlation heatmap ===

# Weather columns are fixed; disease columns are detected by name, either
# because the header contains 'Disease' or because it is a known disease.
weather_cols = ['Specific Humidity', 'Relative Humidity', 'Temperature']
disease_cols = [
    col
    for col in merged_df.columns
    if 'Disease' in col
    or col in ['Malaria', 'Dengue', 'TB', 'Asthma', 'Typhoid', 'Diarrhea', 'Hepatitis', 'Dehydration']
]

# Correlation matrix over the selected columns only (Temperature Range excluded)
corr = merged_df.loc[:, disease_cols + weather_cols].corr()

# Draw the annotated heatmap and title it with the district under test
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr, cmap='coolwarm', annot=True)
heatmap_ax.set_title(f"Correlation Heatmap: {test_district}")
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your files
disease_file = "malaria_filtered.xlsx"
weather_file = "weather_filtered.xlsx"

# Initialize list to store merged data for all districts
merged_data = []

# Open each workbook exactly once and close it when done. Reading every sheet
# through the already-open ExcelFile avoids re-opening and re-parsing the
# whole workbook for each district, and the `with` block releases the file
# handles that a bare pd.ExcelFile(...) would leave open.
with pd.ExcelFile(disease_file) as disease_book, pd.ExcelFile(weather_file) as weather_book:
    # Get all district sheet names
    disease_sheets = disease_book.sheet_names
    weather_sheets = weather_book.sheet_names
    available_weather = set(weather_sheets)  # O(1) membership checks in the loop

    # Loop over districts
    for district in disease_sheets:
        if district not in available_weather:
            print(f"Skipping {district} as it's not found in weather file")
            continue

        disease_df = pd.read_excel(disease_book, sheet_name=district)
        weather_df = pd.read_excel(weather_book, sheet_name=district)
        weather_df['Temperature'] = weather_df['T2M']

        # Merge on Year and Month (inner join: unmatched rows are dropped)
        merged_df = pd.merge(disease_df, weather_df, on=['Year', 'Month'])
        merged_df['District'] = district  # keep track of district
        merged_data.append(merged_df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not merged_data:
    raise ValueError(
        f"No district sheet is present in both '{disease_file}' and '{weather_file}'"
    )

# Combine all districts into one dataframe
full_df = pd.concat(merged_data, ignore_index=True)

# Inspect combined data
print("Combined dataset shape:", full_df.shape)
print(full_df.head())

# Identify columns: a disease column either contains 'Disease' in its header
# or is one of the known disease names; weather columns are fixed.
known_diseases = ['Malaria', 'Dengue', 'TB', 'Asthma', 'Typhoid', 'Diarrhea', 'Hepatitis', 'Dehydration']
disease_cols = [col for col in full_df.columns if 'Disease' in col or col in known_diseases]
weather_cols = ['Specific Humidity', 'Relative Humidity', 'Temperature']

# Compute full correlation matrix (exclude Temperature Range)
corr = full_df[disease_cols + weather_cols].corr()

# Plot heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Overall Correlation Heatmap (All Districts Combined)")
plt.show()

# Optional: save merged data for later modeling
full_df.to_csv("merged_disease_weather_data.csv", index=False)
print("Merged data saved to 'merged_disease_weather_data.csv'")

In [None]:
# Plot the variation of diseases with temperature for each district over all years
def plot_disease_temperature_trends(district, disease_cols):
    """Plot yearly disease case lines for one district with temperature overlaid.

    Rows of the module-level ``full_df`` whose ``District`` equals *district*
    are selected. Every column named in *disease_cols* is drawn against
    ``Year`` on the left y-axis, and ``Temperature`` is drawn as a dashed
    orange line on a twin right y-axis. The figure is shown immediately.

    Args:
        district: Value to match against the ``District`` column of ``full_df``.
        disease_cols: Iterable of column names to draw as disease case lines.
    """
    rows = full_df.loc[full_df['District'] == district]

    fig, cases_ax = plt.subplots(figsize=(14, 10))

    # Left axis: one labelled line per disease column
    for column in disease_cols:
        sns.lineplot(x='Year', y=column, data=rows, label=column, ax=cases_ax)

    # Set after plotting so these labels replace the ones seaborn assigns
    cases_ax.set(
        title=f"Disease Trends Over Years in {district}",
        xlabel="Year",
        ylabel="Number of Cases",
    )
    cases_ax.legend(loc='upper left')

    # Right axis: temperature shares the x-axis but has its own scale
    temp_ax = cases_ax.twinx()
    sns.lineplot(
        x='Year',
        y='Temperature',
        data=rows,
        label='Temperature',
        color='tab:orange',
        linestyle='--',
        ax=temp_ax,
    )
    temp_ax.set_ylabel("Temperature")
    temp_ax.legend(loc='upper right')

    plt.show()
# Draw one trend figure per distinct district, in order of first appearance
for district in pd.unique(full_df['District']):
    plot_disease_temperature_trends(district=district, disease_cols=disease_cols)

# Suggested Plots to Reveal Correlations Between Diseases and Weather

To better understand the relationships between diseases and weather variables, consider the following visualizations:

1. **Scatter Plots with Regression Lines**  
   For each disease, plot a scatter plot against each weather variable (e.g., Malaria vs. Temperature), with a regression line to visualize linear relationships.

2. **Pairplot (Seaborn)**  
   Use seaborn's `pairplot` to visualize pairwise relationships between all diseases and weather variables for a selected district or the combined dataset.

3. **Monthly/Seasonal Trends**  
   Plot average disease cases and weather variables by month to see if there are seasonal patterns.

4. **Correlation Matrix (Already Done)**  
   The heatmap is good, but you can also show the top correlated pairs as a ranked list.

5. **Lagged Correlation Analysis**  
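   Sometimes weather affects diseases with a delay. Plot correlations between weather variables and disease cases with 1–3 month lags, computing each lag within a single district so that values from one district are never paired with another's.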

6. **Boxplots by Weather Quantiles**  
   Divide a weather variable (e.g., temperature) into quantiles and plot boxplots of disease cases for each quantile.

Below are example code snippets for each of these plots.

In [None]:
# 1. Scatter plots with regression lines for each disease-weather pair
import seaborn as sns
import matplotlib.pyplot as plt

# One small figure per (disease, weather) combination: scatter of the raw
# rows with a fitted regression line on top.
for disease in disease_cols:
    for weather in weather_cols:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.regplot(
            x=weather,
            y=disease,
            data=full_df,
            scatter_kws={'alpha': 0.5},  # semi-transparent points
            ax=ax,
        )
        ax.set_title(f"{disease} vs. {weather}")
        ax.set_xlabel(weather)
        ax.set_ylabel(disease)
        fig.tight_layout()
        plt.show()

In [None]:
# 2. Pairplot over every disease and weather column (one panel per pair of
#    columns, so this may be slow for large data)
sns.pairplot(data=full_df.loc[:, disease_cols + weather_cols])
# y slightly above 1 lifts the title clear of the top row of panels
plt.suptitle(
    "Pairplot: Diseases and Weather Variables",
    y=1.02,
)
plt.show()

In [None]:
# 3. Monthly/Seasonal trends: Average disease cases and weather by month
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# groupby already returns a frame indexed by Month, so no reset/set_index
# round trip is needed.
monthly = full_df.groupby('Month')[disease_cols + weather_cols].mean()

# Put months in calendar order when they are stored as full English names.
# If none of the labels match (e.g. Month holds numbers or abbreviations),
# reindexing would silently replace every row with NaN and produce blank
# plots, so keep groupby's sorted order in that case.
if monthly.index.isin(month_order).any():
    monthly = monthly.reindex(month_order)

# squeeze=False always yields a 2-D array of axes, so a single disease does
# not need special handling.
fig, axes = plt.subplots(
    len(disease_cols), 1,
    figsize=(12, 4 * len(disease_cols)),
    sharex=True,
    squeeze=False,
)
axes = axes[:, 0]

for ax, disease in zip(axes, disease_cols):
    monthly[disease].plot(ax=ax, marker='o', label=disease)
    ax.set_ylabel("Avg Cases")
    ax.set_title(f"Monthly Trend: {disease}")
    ax.legend()

# The x-axis is shared, so label only the bottom panel
axes[-1].set_xlabel("Month")
plt.tight_layout()
plt.show()

In [None]:
# 4. Top correlated disease-weather pairs (absolute value)
# Rank only the weather x disease sub-matrix of the correlation matrix.
# Unstacking the full symmetric matrix would list every pair twice and mix in
# disease-disease and weather-weather pairs, and filtering on "< 1" to remove
# the diagonal would also discard any genuinely perfect correlation.
corr_pairs = (
    corr.loc[weather_cols, disease_cols]
    .abs()
    .unstack()   # Series indexed by (disease, weather)
    .dropna()    # undefined correlations (e.g. constant columns) are not ranked
    .sort_values(ascending=False)
)
print("Top correlated disease-weather pairs:")
print(corr_pairs.head(10))

In [None]:
# 5. Lagged correlation: Weather variable vs. disease cases with 1-month lag
# full_df stacks all districts on top of each other, so a plain shift(1)
# would hand the last row of one district to the first row of the next.
# Shifting inside each District group keeps the lag within a district; the
# first row of every district gets NaN and is ignored by corr().
# NOTE(review): this assumes the rows of each district are already in
# chronological order, as read from the sheets — confirm.
lagged_weather = full_df.groupby('District')[weather_cols].shift(1)

for disease in disease_cols:
    for weather in weather_cols:
        corr_lag = full_df[disease].corr(lagged_weather[weather])
        print(f"Lag-1 correlation between {disease} and previous month's {weather}: {corr_lag:.2f}")

In [None]:
# 6. Boxplots of disease cases by temperature quantiles
# Split Temperature into four equal-frequency bins, labelled Q1 to Q4
temp_quantiles = pd.qcut(full_df['Temperature'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

# One figure per disease: distribution of cases within each temperature bin
for disease in disease_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=temp_quantiles, y=full_df[disease], ax=ax)
    ax.set_title(f"{disease} by Temperature Quantile")
    ax.set_xlabel("Temperature Quantile")
    ax.set_ylabel(disease)
    plt.show()