In [7]:
!pip install pycountry pycountry-convert

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting pycountry-convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl.metadata (7.2 kB)
Collecting pprintpp>=0.3.0 (from pycountry-convert)
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl.metadata (7.9 kB)
Collecting pytest-mock>=1.6.3 (from pycountry-convert)
  Downloading pytest_mock-3.15.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pytest-cov>=2.5.1 (from pycountry-convert)
  Downloading pytest_cov-7.0.0-py3-none-any.whl.metadata (31 kB)
Collecting repoze.lru>=0.7 (from pycountry-convert)
  Downloading repoze.lru-0.7-py3-none-any.whl.metadata (1.1 kB)
Collecting coverage>=7.10.6 (from coverage[toml]>=7.10.6->pytest-cov>=2.5.1->pycountry-convert)
  Downloading coverage-7.13.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (8.5 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

# **Clean_economic**

In [3]:
import pandas as pd
import numpy as np

df1 = pd.read_csv(
    'clean_economic_data (2).csv',
    na_values=["..", ".", "NA", "N/A", "", " "]
)


print("Dataset shape:", df1.shape)
print("\nColumns:", df1.columns.tolist())
print("\nFirst few rows:")
print(df1.head())
print("\nData types:")
print(df1.dtypes)
print("\nMissing values:")
print(df1.isnull().sum())


print(f"\nYear range: {df1['Year'].min()} to {df1['Year'].max()}")

print("\n=== Handling Missing Values ===")


gdp_columns = [
    'GDP (current US$)',
    'GDP growth (annual %)',
    'GDP per capita (current US$)'
]

for col in gdp_columns:
    missing_before = df1[col].isnull().sum()


    df1[col] = pd.to_numeric(df1[col], errors='coerce')


    df1[col] = df1.groupby('Country Name')[col].transform(
        lambda x: x.interpolate(method='linear', limit_direction='both')
    )

    missing_after = df1[col].isnull().sum()
    print(f"{col}: {missing_before} → {missing_after} missing")


gov_col = 'Government Effectiveness: Estimate'
missing_before = df1[gov_col].isnull().sum()

df1[gov_col] = pd.to_numeric(df1[gov_col], errors='coerce')


df1[gov_col] = df1.groupby('Country Name')[gov_col].transform(
    lambda x: x.interpolate(method='linear', limit_direction='both')
)

missing_after = df1[gov_col].isnull().sum()
print(f"{gov_col}: {missing_before} → {missing_after} missing")


pop_col = 'Population (total)'
missing_before = df1[pop_col].isnull().sum()

df1[pop_col] = pd.to_numeric(df1[pop_col], errors='coerce')


df1[pop_col] = df1.groupby('Country Name')[pop_col].transform(
    lambda x: x.interpolate(method='linear', limit_direction='both')
)

missing_after = df1[pop_col].isnull().sum()
print(f"{pop_col}: {missing_before} → {missing_after} missing")


all_cols = gdp_columns + [gov_col, pop_col]

print("\n=== Second Pass: Year-wise Interpolation ===")

for col in all_cols:
    missing_before = df1[col].isnull().sum()

    df1[col] = df1.groupby('Year')[col].transform(
        lambda x: x.interpolate(method='linear', limit_direction='both')
    )

    missing_after = df1[col].isnull().sum()
    print(f"{col}: {missing_before} → {missing_after} missing (after year-wise interpolation)")

print("\nRemaining missing values after ALL interpolation:")
print(df1.isnull().sum())

print("\n=== Summary Statistics ===")
numeric_columns = [
    'GDP (current US$)',
    'GDP growth (annual %)',
    'GDP per capita (current US$)',
    'Population (total)',
    'Government Effectiveness: Estimate'
]
print(df1[numeric_columns].describe())

print(f"\nUnique countries: {df1['Country Name'].nunique()}")
print(f"Unique years: {sorted(df1['Year'].unique())}")

df1 = df1[(df1['Year'] >= 2000) & (df1['Year'] <= 2023)]
print(f"Filtered economic data to years 2000–2023: {df1.shape[0]} rows")

print("\n=== Sample of Processed Data ===")
print(df1.head(10))

df1.to_csv('clean_economic_data_processed.csv', index=False)

print("\nFinal null counts:")
print(df1.isnull().sum())


Dataset shape: (4522, 8)

Columns: ['Country Name', 'Country Code', 'Year', 'GDP (current US$)', 'GDP growth (annual %)', 'GDP per capita (current US$)', 'Population (total)', 'Government Effectiveness: Estimate']

First few rows:
  Country Name Country Code  Year  GDP (current US$)  GDP growth (annual %)  \
0  Afghanistan          AFG  2000       3.520000e+09                    NaN   
1  Afghanistan          AFG  2001       2.810000e+09              -9.431970   
2  Afghanistan          AFG  2002       3.830000e+09              28.600000   
3  Afghanistan          AFG  2003       4.520000e+09               8.832278   
4  Afghanistan          AFG  2004       5.220000e+09               1.414118   

   GDP per capita (current US$)  Population (total)  \
0                      174.9310          20130327.0   
1                      138.7068          20284307.0   
2                      178.9541          21378117.0   
3                      198.8711          22733049.0   
4                  

# **HDI**

In [4]:
import pandas as pd

hdi_df = pd.read_csv('hdi_data - Sheet1.csv')


print("HDI Data Info:")
print(f"Shape: {hdi_df.shape}")
print(f"Columns: {hdi_df.columns.tolist()}")
print(f"Years: {hdi_df['year'].min()} to {hdi_df['year'].max()}")
print(f"Countries: {hdi_df['country'].nunique()}")


print("\nMissing values before handling:")
print(hdi_df.isnull().sum())


hdi_df_clean = hdi_df.dropna(subset=['country', 'year', 'value'])

print(f"\nRows after removing missing values: {hdi_df_clean.shape[0]}")


hdi_df_final = hdi_df_clean[['countryIsoCode', 'country', 'index', 'year', 'value']].copy()

print(f"\nFinal dataset shape: {hdi_df_final.shape}")
print(f"Years range: {hdi_df_final['year'].min()} - {hdi_df_final['year'].max()}")
print(f"Countries in final data: {hdi_df_final['country'].nunique()}")

hdi_df_final = hdi_df_final[(hdi_df_final['year'] >= 2000) & (hdi_df_final['year'] <= 2023)]
print(f"Filtered HDI data to years 2000-2023: {hdi_df_final.shape[0]} rows")

print("\nFirst 5 rows of cleaned HDI data:")
print(hdi_df_final.head())
hdi_df_final.to_csv('hdi_data_processed.csv', index=False)

HDI Data Info:
Shape: (4748, 5)
Columns: ['countryIsoCode', 'country', 'index', 'year', 'value']
Years: 2000 to 2023
Countries: 204

Missing values before handling:
countryIsoCode    0
country           0
index             0
year              0
value             0
dtype: int64

Rows after removing missing values: 4748

Final dataset shape: (4748, 5)
Years range: 2000 - 2023
Countries in final data: 204
Filtered HDI data to years 2000-2023: 4748 rows

First 5 rows of cleaned HDI data:
  countryIsoCode      country                    index  year  value
0            AFG  Afghanistan  Human Development Index  2000  0.351
1            AFG  Afghanistan  Human Development Index  2001  0.355
2            AFG  Afghanistan  Human Development Index  2002  0.383
3            AFG  Afghanistan  Human Development Index  2003  0.392
4            AFG  Afghanistan  Human Development Index  2004  0.408


## **Disaster**

In [5]:
import pandas as pd
import numpy as np


disaster_df = pd.read_csv("disaster_data - Sheet1.csv")


print("Dataset shape:", disaster_df.shape)
print("\nColumns:", disaster_df.columns.tolist())
print("\nFirst few rows:")
print(disaster_df.head())
print("\nData types:")
print(disaster_df.dtypes)
print("\nMissing values before processing:")
print(disaster_df.isnull().sum())

print(f"\nYear range: {disaster_df['Start Year'].min()} to {disaster_df['Start Year'].max()}")

print("\n=== Handling Missing Values ===")


econ_col = "Total Damage ('000 US$)"
affected_col = "Total Affected"
deaths_col = "Total Deaths"

numeric_cols = [deaths_col, affected_col, econ_col]
for col in numeric_cols:
    if col in disaster_df.columns:
        if disaster_df[col].dtype == object:
            disaster_df[col] = pd.to_numeric(disaster_df[col], errors='coerce')

# 1. Handle Disaster Type
if 'Disaster Type' in disaster_df.columns:
    missing_disaster = disaster_df['Disaster Type'].isnull().sum()
    disaster_df['Disaster Type'].fillna('Unknown', inplace=True)
    print(f"Disaster Type: {missing_disaster} missing values replaced with 'Unknown'")

# 2. Handle Total Deaths
if deaths_col in disaster_df.columns:
    missing_deaths_before = disaster_df[deaths_col].isnull().sum()


    has_economic_data = False
    if econ_col in disaster_df.columns:
        has_economic_data = disaster_df[econ_col].notnull() & (disaster_df[econ_col] > 0)

    has_affected_data = False
    if affected_col in disaster_df.columns:
        has_affected_data = disaster_df[affected_col].notnull() & (disaster_df[affected_col] > 0)


    condition = has_economic_data | has_affected_data
    fill_mask = disaster_df[deaths_col].isnull() & condition
    disaster_df.loc[fill_mask, deaths_col] = 0


    remaining_nulls = disaster_df[deaths_col].isnull().sum()
    if remaining_nulls > 0 and 'Disaster Type' in disaster_df.columns:

        median_by_type = disaster_df.groupby('Disaster Type')[deaths_col].transform('median')

        disaster_df[deaths_col] = disaster_df[deaths_col].fillna(median_by_type)

        overall_median = disaster_df[deaths_col].median()
        disaster_df[deaths_col].fillna(overall_median, inplace=True)

    print(f"Total Deaths: {missing_deaths_before} missing values processed")

# 3. Handle Total Affected
if affected_col in disaster_df.columns:
    missing_before = disaster_df[affected_col].isnull().sum()

    if 'Disaster Type' in disaster_df.columns and missing_before > 0:

        median_by_type = disaster_df.groupby('Disaster Type')[affected_col].transform('median')
        disaster_df[affected_col] = disaster_df[affected_col].fillna(median_by_type)


        still_missing = disaster_df[affected_col].isnull()
        if still_missing.any() and deaths_col in disaster_df.columns:

            non_null = disaster_df[disaster_df[affected_col].notnull() &
                                  disaster_df[deaths_col].notnull()]
            if len(non_null) > 0:

                deaths_nonzero = non_null[deaths_col].replace(0, 1)
                avg_ratio = (non_null[affected_col] / deaths_nonzero).median()
                if pd.notna(avg_ratio) and avg_ratio > 0:

                    disaster_df.loc[still_missing, affected_col] = (
                        disaster_df.loc[still_missing, deaths_col].replace(0, 1) * avg_ratio
                    )


        overall_median = disaster_df[affected_col].median()
        disaster_df[affected_col].fillna(overall_median, inplace=True)

    print(f"Total Affected: {missing_before} missing values processed")

# 4. Handle Economic Damage
if econ_col in disaster_df.columns:
    missing_economic = disaster_df[econ_col].isnull().sum()
    print(f"Economic Damage: {missing_economic} missing values kept as null (not filled with 0)")



print("\n=== Summary Statistics ===")
# Include only non-null columns for description
desc_cols = [col for col in numeric_cols if col in disaster_df.columns]
if desc_cols:
    for col in desc_cols:
        if col in disaster_df.columns:
            stats = disaster_df[col].describe()
            print(f"\n{col}:")
            print(f"  Count: {stats['count']}")
            print(f"  Mean: {stats['mean']:.2f}")
            print(f"  Std: {stats['std']:.2f}")
            print(f"  Min: {stats['min']}")
            print(f"  25%: {stats['25%']}")
            print(f"  50%: {stats['50%']}")
            print(f"  75%: {stats['75%']}")
            print(f"  Max: {stats['max']}")


if econ_col in disaster_df.columns:
    econ_stats = disaster_df[econ_col].describe()
    print(f"\nEconomic Damage Statistics (nulls preserved):")
    print(f"Count (non-null): {econ_stats['count']}")
    print(f"Null count: {disaster_df[econ_col].isnull().sum()}")
    print(f"Mean: {econ_stats['mean']:.2f}")
    print(f"Std: {econ_stats['std']:.2f}")
    print(f"Min: {econ_stats['min']}")
    print(f"25%: {econ_stats['25%']}")
    print(f"50%: {econ_stats['50%']}")
    print(f"75%: {econ_stats['75%']}")
    print(f"Max: {econ_stats['max']}")

print(f"\nUnique countries: {disaster_df['Country'].nunique()}")
print(f"Unique years: {sorted(disaster_df['Start Year'].unique())}")
print(f"Unique disaster types: {disaster_df['Disaster Type'].nunique()}")

print("\n=== Sample of Processed Data (10 rows) ===")

sample_cols = []
for col in ['Disaster Type', deaths_col, affected_col, econ_col]:
    if col in disaster_df.columns:
        sample_cols.append(col)

if sample_cols:
    print(disaster_df[sample_cols].head(10))

disaster_df = disaster_df[(disaster_df['Start Year'] >= 2000) & (disaster_df['Start Year'] <= 2023)]
print(f"Filtered disaster data to years 2000-2023: {disaster_df.shape[0]} rows")

disaster_df.to_csv('disaster_data_processed.csv', index=False)
print("\nProcessed data saved to 'disaster_data_processed.csv'")

Dataset shape: (10548, 7)

Columns: ['ISO', 'Country', 'Start Year', 'Disaster Type', 'Total Deaths', 'Total Affected', "Total Damage ('000 US$)"]

First few rows:
   ISO     Country  Start Year        Disaster Type  Total Deaths  \
0  DJI    Djibouti        2001              Drought           NaN   
1  SDN       Sudan        2000              Drought           NaN   
2  SOM     Somalia        2000              Drought          21.0   
3  AGO      Angola        2000                Flood          31.0   
4  BGD  Bangladesh        2000  Extreme temperature          49.0   

   Total Affected  Total Damage ('000 US$)  
0        100000.0                      NaN  
1       2000000.0                      NaN  
2       1200000.0                      NaN  
3         70000.0                  10000.0  
4             NaN                      NaN  

Data types:
ISO                         object
Country                     object
Start Year                   int64
Disaster Type               objec

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  disaster_df['Disaster Type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  disaster_df[deaths_col].fillna(overall_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

# **Merge**

In [8]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc


df1 = pd.read_csv('clean_economic_data_processed.csv')
hdi_df_final = pd.read_csv('hdi_data_processed.csv')
disaster_df = pd.read_csv('disaster_data_processed.csv')


df1 = df1.rename(columns={'Country Name': 'Country', 'Year': 'Year'})
hdi_df_final = hdi_df_final.rename(columns={'country': 'Country', 'year': 'Year'})
disaster_df = disaster_df.rename(columns={'Start Year': 'Year'})

def impute_hdi_values(df):


    df = df.copy()

    df['Continent'] = df['Country'].apply(get_continent)

    region_avg_hdi = {}
    for continent in df['Continent'].unique():
        if pd.isna(continent):
            continue
        continent_df = df[df['Continent'] == continent]
        avg_hdi = continent_df[continent_df['value'].notna()]['value'].mean()
        if pd.isna(avg_hdi):
            avg_hdi = df['value'].mean()
        region_avg_hdi[continent] = avg_hdi

    df['HDI_Imputed'] = False

    for country in df['Country'].unique():
        country_mask = df['Country'] == country
        country_df = df[country_mask].copy()

        if country_df['value'].notna().any():

            country_df = country_df.sort_values('Year')

            country_df['value'] = country_df['value'].ffill().bfill()

            country_df['value'] = country_df['value'].interpolate(method='linear')

            country_df['value'] = country_df['value'].ffill().bfill()

            original_nan_mask = df.loc[country_mask, 'value'].isna()
            df.loc[country_mask, 'HDI_Imputed'] = original_nan_mask & country_df['value'].notna()
            df.loc[country_mask, 'value'] = country_df['value']

        else:
            continent = df.loc[country_mask, 'Continent'].iloc[0]
            if pd.notna(continent) and continent in region_avg_hdi:
                df.loc[country_mask, 'value'] = region_avg_hdi[continent]
                df.loc[country_mask, 'HDI_Imputed'] = True
            else:
                global_avg = df['value'].mean()
                df.loc[country_mask, 'value'] = global_avg
                df.loc[country_mask, 'HDI_Imputed'] = True

    return df

def get_continent(country_name):
    """
    Map country name to continent using pycountry and pycountry-convert.
    """
    try:
        country_mappings = {
            'Bahamas, The': 'Bahamas',
            'Cabo Verde': 'Cape Verde',
            'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
            'Congo, Rep.': 'Republic of the Congo',
            'Egypt, Arab Rep.': 'Egypt',
            'Gambia, The': 'Gambia',
            'Hong Kong SAR, China': 'Hong Kong',
            'Iran, Islamic Rep.': 'Iran',
            'Korea, Dem. People\'s Rep.': 'North Korea',
            'Korea, Rep.': 'South Korea',
            'Kyrgyz Republic': 'Kyrgyzstan',
            'Lao PDR': 'Laos',
            'Macao SAR, China': 'Macao',
            'Micronesia, Fed. Sts.': 'Micronesia',
            'Russian Federation': 'Russia',
            'Slovak Republic': 'Slovakia',
            'Syrian Arab Republic': 'Syria',
            'Turkiye': 'Turkey',
            'Venezuela, RB': 'Venezuela',
            'Vietnam': 'Viet Nam',
            'Yemen, Rep.': 'Yemen',
        }

        if country_name in country_mappings:
            country_name = country_mappings[country_name]

        country = pycountry.countries.search_fuzzy(country_name)[0]
        country_code = country.alpha_2

        continent_code = pc.country_alpha2_to_continent_code(country_code)

        continent_mapping = {
            'AF': 'Africa',
            'AS': 'Asia',
            'EU': 'Europe',
            'NA': 'North America',
            'SA': 'South America',
            'OC': 'Oceania',
            'AN': 'Antarctica'
        }

        return continent_mapping.get(continent_code, 'Unknown')

    except (LookupError, KeyError):
        special_cases = {
            'Africa Eastern and Southern': 'Africa',
            'Africa Western and Central': 'Africa',
            'Arab World': 'Asia',
            'Caribbean small states': 'North America',
            'Central Europe and the Baltics': 'Europe',
            'Early-demographic dividend': 'Unknown',
            'East Asia & Pacific': 'Asia',
            'East Asia & Pacific (excluding high income)': 'Asia',
            'East Asia & Pacific (IDA & IBRD countries)': 'Asia',
            'Euro area': 'Europe',
            'Europe & Central Asia': 'Europe',
            'Europe & Central Asia (excluding high income)': 'Europe',
            'Europe & Central Asia (IDA & IBRD countries)': 'Europe',
            'European Union': 'Europe',
            'Fragile and conflict affected situations': 'Unknown',
            'Heavily indebted poor countries (HIPC)': 'Unknown',
            'High income': 'Unknown',
            'IBRD only': 'Unknown',
            'IDA & IBRD total': 'Unknown',
            'IDA blend': 'Unknown',
            'IDA only': 'Unknown',
            'IDA total': 'Unknown',
            'Late-demographic dividend': 'Unknown',
            'Latin America & Caribbean': 'South America',
            'Latin America & Caribbean (excluding high income)': 'South America',
            'Latin America & the Caribbean (IDA & IBRD countries)': 'South America',
            'Least developed countries: UN classification': 'Unknown',
            'Low & middle income': 'Unknown',
            'Low income': 'Unknown',
            'Lower middle income': 'Unknown',
            'Middle East & North Africa': 'Asia',
            'Middle East & North Africa (excluding high income)': 'Asia',
            'Middle East & North Africa (IDA & IBRD countries)': 'Asia',
            'Middle income': 'Unknown',
            'North America': 'North America',
            'OECD members': 'Unknown',
            'Other small states': 'Unknown',
            'Pacific island small states': 'Oceania',
            'Post-demographic dividend': 'Unknown',
            'Pre-demographic dividend': 'Unknown',
            'Small states': 'Unknown',
            'South Asia': 'Asia',
            'South Asia (IDA & IBRD)': 'Asia',
            'Sub-Saharan Africa': 'Africa',
            'Sub-Saharan Africa (excluding high income)': 'Africa',
            'Sub-Saharan Africa (IDA & IBRD countries)': 'Africa',
            'Upper middle income': 'Unknown',
            'West Bank and Gaza': 'Asia',
            'World': 'Unknown'
        }

        if country_name in special_cases:
            return special_cases[country_name]

        if any(term in country_name for term in ['Africa', 'Sub-Saharan']):
            return 'Africa'
        elif any(term in country_name for term in ['Asia', 'Middle East', 'Arab']):
            return 'Asia'
        elif any(term in country_name for term in ['Europe', 'Euro', 'European']):
            return 'Europe'
        elif any(term in country_name for term in ['America', 'Caribbean']):
            if 'North' in country_name or 'Caribbean' in country_name:
                return 'North America'
            elif 'South' in country_name or 'Latin' in country_name:
                return 'South America'
            else:
                return 'North America'
        elif any(term in country_name for term in ['Pacific', 'Oceania', 'Island']):
            return 'Oceania'

        return 'Unknown'


disaster_df['Disaster Type'] = disaster_df['Disaster Type'].astype(str)

def split_disaster_types(types_str):
    if pd.isna(types_str) or types_str.lower() == 'none' or types_str.strip() == '':
        return ['None']

    types = [t.strip() for t in types_str.split(',')]
    types = [t for t in types if t]
    return types if types else ['None']

disaster_df['Disaster_Type_List'] = disaster_df['Disaster Type'].apply(split_disaster_types)

expanded_rows = []
for idx, row in disaster_df.iterrows():
    country = row['Country']
    year = row['Year']
    types_list = row['Disaster_Type_List']

    for disaster_type in types_list:
        expanded_rows.append({
            'Country': country,
            'Year': year,
            'Disaster_Type': disaster_type,
            'Total Deaths': row['Total Deaths'],
            'Total Affected': row['Total Affected'],
            "Total Damage ('000 US$)": row["Total Damage ('000 US$)"],
            'Disaster_Count': 1 if disaster_type != 'None' else 0
        })

disaster_expanded = pd.DataFrame(expanded_rows)

disaster_agg = disaster_expanded.groupby(['Country', 'Year', 'Disaster_Type']).agg({
    'Total Deaths': 'sum',
    'Total Affected': 'sum',
    "Total Damage ('000 US$)": 'sum',
    'Disaster_Count': 'sum'
}).reset_index()


merged_df = pd.merge(df1, hdi_df_final, on=['Country', 'Year'], how='left')

print("Applying HDI imputation...")
merged_df = impute_hdi_values(merged_df)

final_df = pd.merge(merged_df, disaster_agg, on=['Country', 'Year'], how='left')

final_df['Disaster_Type'] = final_df['Disaster_Type'].fillna('None')
final_df[['Total Deaths', 'Total Affected', "Total Damage ('000 US$)", 'Disaster_Count']] = \
    final_df[['Total Deaths', 'Total Affected', "Total Damage ('000 US$)", 'Disaster_Count']].fillna(0)


final_df.to_csv('merged_economic_hdi_disaster_long_imputed.csv', index=False)

print("\nDataset saved as 'merged_economic_hdi_disaster_long_imputed.csv'")
print(f"Final dataset shape: {final_df.shape}")


# Show summary statistics
print("\nSummary of Disaster_Count:")
print(final_df['Disaster_Count'].value_counts().head())

Applying HDI imputation...

Dataset saved as 'merged_economic_hdi_disaster_long_imputed.csv'
Final dataset shape: (5741, 18)

Summary of Disaster_Count:
Disaster_Count
0.0    2588
1.0    2116
2.0     548
3.0     216
4.0      97
Name: count, dtype: int64


# **Handle Missing values**

In [10]:
final_df.isnull().sum()

Unnamed: 0,0
Country,0
Country Code,0
Year,0
GDP (current US$),0
GDP growth (annual %),0
GDP per capita (current US$),0
Population (total),0
Government Effectiveness: Estimate,0
countryIsoCode,1641
index,1641


# **Feature Engineering**

In [9]:
import pandas as pd
import numpy as np

df = pd.read_csv("merged_economic_hdi_disaster_long_imputed.csv")


if "Disaster_Type" not in df.columns:
    df["Disaster_Type"] = "None"


df["Disaster_Type"] = df["Disaster_Type"].fillna("None").astype(str)


df["fatalities_per_million"] = (
    df["Total Deaths"] / df["Population (total)"]
) * 1_000_000


df["affected_pct_population"] = (
    df["Total Affected"] / df["Population (total)"]
)


df["Total_Damage_USD"] = df["Total Damage ('000 US$)"] * 1000
df["economic_loss_pct_gdp"] = (
    df["Total_Damage_USD"] / df["GDP (current US$)"]
)


def count_types(x):
    if x == "None":
        return 0
    return len([t for t in str(x).split(",") if t.strip() != ""])

df["exposure_score"] = df["Disaster_Type"].apply(count_types)


severity_weights = {
    "Flood": 1,
    "Drought": 1.5,
    "Storm": 2,
    "Epidemic": 1.2,
    "Earthquake": 3,
    "Wildfire": 1.3,
    "Landslide": 1.1,
    "Volcano": 2.2
}

def calculate_severity(disaster_str):
    if disaster_str == "None" or str(disaster_str).strip() == "":
        return 0

    types = [d.strip() for d in str(disaster_str).split(",") if d.strip() != ""]
    weights = [severity_weights.get(t, 1) for t in types]
    return np.mean(weights)

df["severity_weight"] = df["Disaster_Type"].apply(calculate_severity)


df["DII"] = (
    (df["fatalities_per_million"] + df["affected_pct_population"])
    / df["GDP per capita (current US$)"]
) * df["severity_weight"]


df["GovIndex_normalized"] = (
    df["Government Effectiveness: Estimate"] + 2.5
) / 5


df["adaptive_capacity"] = (
    df["value"] + df["GovIndex_normalized"]
) / 2


df["vulnerability_score"] = (
    df["fatalities_per_million"] + df["economic_loss_pct_gdp"]
)


def calc_cri(row):
    E = row["exposure_score"]
    V = row["vulnerability_score"]
    A = row["adaptive_capacity"]

    if E == 0 or V == 0:
        return A

    return A / (E * V)

df["CRI"] = df.apply(calc_cri, axis=1)


df = df.sort_values(["Country", "Year"])

df["GDPgrowth_pre"] = df.groupby("Country")["GDP growth (annual %)"].shift(1)
df["GDPgrowth_post"] = df.groupby("Country")["GDP growth (annual %)"].shift(-1)
df["Trecovery"] = 1

df["RRS"] = (
    (df["GDPgrowth_post"] - df["GDPgrowth_pre"]) / df["Trecovery"]
) + df["adaptive_capacity"]

df["RRS"] = df["RRS"].fillna(df["adaptive_capacity"])


df.to_csv("final_dataset_with_features.csv", index=False)

print("Feature-engineered dataset saved as final_dataset_with_features.csv")
print(df.head())


Feature-engineered dataset saved as final_dataset_with_features.csv
       Country Country Code  Year  GDP (current US$)  GDP growth (annual %)  \
0  Afghanistan          AFG  2000       3.520000e+09               -9.43197   
1  Afghanistan          AFG  2000       3.520000e+09               -9.43197   
2  Afghanistan          AFG  2001       2.810000e+09               -9.43197   
3  Afghanistan          AFG  2001       2.810000e+09               -9.43197   
4  Afghanistan          AFG  2001       2.810000e+09               -9.43197   

   GDP per capita (current US$)  Population (total)  \
0                      174.9310          20130327.0   
1                      174.9310          20130327.0   
2                      138.7068          20284307.0   
3                      138.7068          20284307.0   
4                      138.7068          20284307.0   

   Government Effectiveness: Estimate countryIsoCode                    index  \
0                            -2.17395        