In [4]:
import pandas as pd

# Load the climate / disease CSV into a DataFrame.
# NOTE(review): the path is an absolute Colab-style location; adjust it when
# running outside that environment.
df = pd.read_csv("/content/climate_disease_dataset.csv")

# Preview the first rows, then show column dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()

   year  month                country   region  avg_temp_c  precipitation_mm  \
0  2000      1  Palestinian Territory  Central   28.132468        152.083870   
1  2000      2  Palestinian Territory  Central   30.886500        119.591418   
2  2000      3  Palestinian Territory  Central   31.366433         95.876124   
3  2000      4  Palestinian Territory  Central   28.481870        175.315731   
4  2000      5  Palestinian Territory  Central   26.890370        191.445990   

   air_quality_index   uv_index  malaria_cases  dengue_cases  \
0         110.487231  12.000000             53           145   
1          83.467928  12.000000            132            48   
2          93.095292  12.000000             34            80   
3         105.530192   9.395894             23           133   
4          60.205979   9.935726             39            74   

   population_density  healthcare_budget  
0                 113               1068  
1                 113               1068  
2    

In [5]:
# Step 1: fill missing values in numeric columns with that column's mean.
# Only columns that have a numeric mean are affected; other columns are left
# untouched by this call.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Step 2: forward-fill whatever is still missing (both steps run, this is not
# an alternative to step 1). DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling, which emits a FutureWarning.
df.ffill(inplace=True)


  df.fillna(method='ffill', inplace=True)


In [13]:
# Make sure a proper datetime column named 'date' exists.
# The CSV as loaded exposes 'year' and 'month' but no 'date' column, so on a
# fresh kernel df['date'] would raise KeyError. When it is absent, assemble it
# from year/month, pinning the day to the 1st of the month.
if 'date' not in df.columns:
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
else:
    df['date'] = pd.to_datetime(df['date'])

# Calendar features derived from the timestamp.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
# ISO week number: dates in early January can belong to week 52/53 of the
# previous ISO year.
df['week'] = df['date'].dt.isocalendar().week

# Display first few rows to confirm changes
print(df.head())


   year  month                country   region  avg_temp_c  precipitation_mm  \
0  2000      1  Palestinian Territory  Central    0.651021          0.313120   
1  2000      2  Palestinian Territory  Central    0.709442          0.246223   
2  2000      3  Palestinian Territory  Central    0.719623          0.197396   
3  2000      4  Palestinian Territory  Central    0.658433          0.360952   
4  2000      5  Palestinian Territory  Central    0.624672          0.394162   

   air_quality_index   uv_index  malaria_cases  dengue_cases  \
0           0.654360  12.000000             53           145   
1           0.494339  12.000000            132            48   
2           0.551357  12.000000             34            80   
3           0.625002   9.395894             23           133   
4           0.356570   9.935726             39            74   

   population_density  healthcare_budget       date  week  
0                 113               1068 2000-01-01    52  
1             

In [15]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# --- Scaling numerical features -------------------------------------------
# Min-max scale the three continuous climate columns in place on df.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split, so the test rows influence the scaling range (data leakage). Consider
# fitting on the training rows only and transforming the test rows with it.
scaled_cols = ['avg_temp_c', 'precipitation_mm', 'air_quality_index']
scaler = MinMaxScaler()
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])
print("After Scaling:\n", df.head(), "\n")

# --- Creating lag features ------------------------------------------------
# Lags must be taken within one (country, region) series; a plain shift() over
# the whole frame would carry the last rows of one series into the first rows
# of the next. Sort each series chronologically first so shift(k) really means
# "k rows earlier in time".
series_keys = ['country', 'region']
df = df.sort_values(series_keys + ['date'], kind='stable')
per_series = df.groupby(series_keys, sort=False)
temp_lag_1 = per_series['avg_temp_c'].shift(1)
precip_lag_2 = per_series['precipitation_mm'].shift(2)
df['avg_temp_c_lag_1'] = temp_lag_1
df['precipitation_mm_lag_2'] = precip_lag_2
# The first rows of every series have no lag value; drop rows containing NaN.
df = df.dropna()
print("After Lag Features:\n", df.head(), "\n")

# --- Encoding categorical 'region' ----------------------------------------
# One-hot encode region; drop_first removes one dummy column to avoid a
# redundant (perfectly collinear) indicator.
df = pd.get_dummies(df, columns=['region'], drop_first=True)
print("After Encoding:\n", df.head(), "\n")

# --- Splitting dataset ----------------------------------------------------
# shuffle=False only preserves time order if the rows are already in time
# order, so sort by date first; the stable sort keeps the per-series order
# for rows that share a date.
df = df.sort_values('date', kind='stable')

# Features: everything except the two targets and the non-numeric identifiers.
X = df.drop(columns=['malaria_cases', 'dengue_cases', 'date', 'country'])
y_malaria = df['malaria_cases']
y_dengue = df['dengue_cases']

# Split once for both targets: the feature partition is identical for malaria
# and dengue, so a single call keeps X_train / X_test unambiguous.
(
    X_train, X_test,
    y_malaria_train, y_malaria_test,
    y_dengue_train, y_dengue_test,
) = train_test_split(
    X, y_malaria, y_dengue, test_size=0.2, shuffle=False  # preserve time order
)

print("Shape of Training Data (Malaria):", X_train.shape)
print("Shape of Testing Data (Malaria):", X_test.shape)
print("Training Features Sample:\n", X_train.head(), "\n")
print("Training Labels Sample (Malaria):\n", y_malaria_train.head())
print("Training Labels Sample (Dengue):\n", y_dengue_train.head())

After Scaling:
    year  month                country   region  avg_temp_c  precipitation_mm  \
0  2000      1  Palestinian Territory  Central    0.651021          0.313120   
1  2000      2  Palestinian Territory  Central    0.709442          0.246223   
2  2000      3  Palestinian Territory  Central    0.719623          0.197396   
3  2000      4  Palestinian Territory  Central    0.658433          0.360952   
4  2000      5  Palestinian Territory  Central    0.624672          0.394162   

   air_quality_index   uv_index  malaria_cases  dengue_cases  \
0           0.654360  12.000000             53           145   
1           0.494339  12.000000            132            48   
2           0.551357  12.000000             34            80   
3           0.625002   9.395894             23           133   
4           0.356570   9.935726             39            74   

   population_density  healthcare_budget       date  week  
0                 113               1068 2000-01-01    52 