In [8]:
import pandas as pd

In [9]:
# Load all cleaned datasets
df_fertility_gov = pd.read_csv('../CLEANED_DATASETS/fertility_and_gov_spending_merged.csv')
df_congenital_syphilis = pd.read_excel('../CLEANED_DATASETS/congenital_syphilis_cleaned.xlsx')
df_hospital_beds = pd.read_excel('../CLEANED_DATASETS/hospital_beds_cleaned.xlsx')
df_life_expectancy = pd.read_excel('../CLEANED_DATASETS/life_expectancy_clean.xlsx')
df_malaria = pd.read_excel('../CLEANED_DATASETS/malaria_clean.xlsx')
df_overweight = pd.read_excel('../CLEANED_DATASETS/overweight_clean.xlsx')
df_tuberculosis = pd.read_excel('../CLEANED_DATASETS/tuberculosis_clean.xlsx')

print("All datasets loaded successfully!")

All datasets loaded successfully!


In [10]:
# Rename columns in fertility_and_gov_spending_merged to match standard naming
df_fertility_gov = df_fertility_gov.rename(columns={
    'CountryName': 'country',
    'CountryCode': 'country_code',
    'Region': 'region',
    'IndicatorName': 'indicator',
    'Year': 'year',
    'IndicatorValue': 'indicator_value'
})

print("Columns renamed successfully!")
print(f"New columns: {df_fertility_gov.columns.tolist()}")
df_fertility_gov.head()

Columns renamed successfully!
New columns: ['country', 'country_code', 'region', 'indicator', 'year', 'indicator_value']


Unnamed: 0,country,country_code,region,indicator,year,indicator_value
0,Aruba,ABW,,"Fertility rate, total (births per woman)",1960,4.567
1,Africa Eastern and Southern,AFE,,"Fertility rate, total (births per woman)",1960,6.65031
2,Afghanistan,AFG,Eastern Mediterranean,"Fertility rate, total (births per woman)",1960,7.282
3,Africa Western and Central,AFW,,"Fertility rate, total (births per woman)",1960,6.468887
4,Angola,AGO,Africa,"Fertility rate, total (births per woman)",1960,6.708


In [11]:
# Check columns of all other datasets before standardizing
print("Column names for all datasets:")
print(f"\n1. Fertility & Gov: {df_fertility_gov.columns.tolist()}")
print(f"\n2. Congenital Syphilis: {df_congenital_syphilis.columns.tolist()}")
print(f"\n3. Hospital Beds: {df_hospital_beds.columns.tolist()}")
print(f"\n4. Life Expectancy: {df_life_expectancy.columns.tolist()}")
print(f"\n5. Malaria: {df_malaria.columns.tolist()}")
print(f"\n6. Overweight: {df_overweight.columns.tolist()}")
print(f"\n7. Tuberculosis: {df_tuberculosis.columns.tolist()}")

Column names for all datasets:

1. Fertility & Gov: ['country', 'country_code', 'region', 'indicator', 'year', 'indicator_value']

2. Congenital Syphilis: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']

3. Hospital Beds: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']

4. Life Expectancy: ['region', 'country', 'year', 'indicator', 'indicator_value']

5. Malaria: ['region', 'country', 'year', 'indicator', 'indicator_value']

6. Overweight: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']

7. Tuberculosis: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']


In [12]:
# Add missing country_code column to datasets that don't have it
df_life_expectancy['country_code'] = None
df_malaria['country_code'] = None

# Reorder all datasets to have consistent column order
column_order = ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']

df_fertility_gov = df_fertility_gov[column_order]
df_congenital_syphilis = df_congenital_syphilis[column_order]
df_hospital_beds = df_hospital_beds[column_order]
df_life_expectancy = df_life_expectancy[column_order]
df_malaria = df_malaria[column_order]
df_overweight = df_overweight[column_order]
df_tuberculosis = df_tuberculosis[column_order]

print("All datasets standardized with columns:", column_order)

All datasets standardized with columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']


In [13]:
# Merge all datasets by concatenating vertically
merged_all = pd.concat([
    df_fertility_gov,
    df_congenital_syphilis,
    df_hospital_beds,
    df_life_expectancy,
    df_malaria,
    df_overweight,
    df_tuberculosis
], ignore_index=True)

print(f"Merged dataset shape: {merged_all.shape}")
print(f"\nColumns: {merged_all.columns.tolist()}")
print(f"\nUnique indicators ({merged_all['indicator'].nunique()}):")
for indicator in merged_all['indicator'].unique():
    count = len(merged_all[merged_all['indicator'] == indicator])
    print(f"  - {indicator}: {count} rows")
print(f"\nFirst few rows:")
merged_all.head(10)

Merged dataset shape: (64572, 6)

Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']

Unique indicators (9):
  - Fertility rate, total (births per woman): 17290 rows
  - Domestic general government health expenditure (GGHE-D) per capita in US$: 4585 rows
  - Congenital syphilis number of cases, reported: 1170 rows
  - Hospital beds (per 10 000 population): 2958 rows
  - Estimated malaria incidence (per 1000 population at risk): 2680 rows
  - Life expectancy at birth (years): 12210 rows
  - Life expectancy at age 60 (years): 12210 rows
  - Prevalence of overweight among adults, BMI >= 25 (%): 6567 rows
  - Tuberculosis - Estimated incident cases: 4902 rows

First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,,Aruba,ABW,1960,"Fertility rate, total (births per woman)",4.567
1,,Africa Eastern and Southern,AFE,1960,"Fertility rate, total (births per woman)",6.65031
2,Eastern Mediterranean,Afghanistan,AFG,1960,"Fertility rate, total (births per woman)",7.282
3,,Africa Western and Central,AFW,1960,"Fertility rate, total (births per woman)",6.468887
4,Africa,Angola,AGO,1960,"Fertility rate, total (births per woman)",6.708
5,Europe,Albania,ALB,1960,"Fertility rate, total (births per woman)",6.383
6,Europe,Andorra,AND,1960,"Fertility rate, total (births per woman)",2.545
7,,Arab World,ARB,1960,"Fertility rate, total (births per woman)",6.922033
8,Eastern Mediterranean,United Arab Emirates,ARE,1960,"Fertility rate, total (births per woman)",6.499
9,Americas,Argentina,ARG,1960,"Fertility rate, total (births per woman)",3.136


In [15]:
# Save the merged dataset
merged_all.to_csv('../CLEANED_DATASETS/all_datasets_merged.csv', index=False)
print("Merged dataset saved successfully to 'all_datasets_merged.csv'!")
print(f"Total rows: {len(merged_all)}")
print(f"Total indicators: {merged_all['indicator'].nunique()}")

Merged dataset saved successfully to 'all_datasets_merged.csv'!
Total rows: 64572
Total indicators: 9


In [14]:
# Check column names and shapes for all datasets
datasets = {
    'Fertility & Government Spending': df_fertility_gov,
    'Congenital Syphilis': df_congenital_syphilis,
    'Hospital Beds': df_hospital_beds,
    'Life Expectancy': df_life_expectancy,
    'Malaria': df_malaria,
    'Overweight': df_overweight,
    'Tuberculosis': df_tuberculosis
}

for name, df in datasets.items():
    print(f"\n{'='*60}")
    print(f"{name}")
    print(f"{'='*60}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"First few rows:")
    display(df.head(3))


Fertility & Government Spending
Shape: (21875, 6)
Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']
First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,,Aruba,ABW,1960,"Fertility rate, total (births per woman)",4.567
1,,Africa Eastern and Southern,AFE,1960,"Fertility rate, total (births per woman)",6.65031
2,Eastern Mediterranean,Afghanistan,AFG,1960,"Fertility rate, total (births per woman)",7.282



Congenital Syphilis
Shape: (1170, 6)
Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']
First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,Africa,Burkina Faso,BFA,2022,"Congenital syphilis number of cases, reported",16
1,Africa,Burkina Faso,BFA,2021,"Congenital syphilis number of cases, reported",8
2,Africa,Burkina Faso,BFA,2020,"Congenital syphilis number of cases, reported",11



Hospital Beds
Shape: (2958, 6)
Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']
First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,Africa,Angola,AGO,2019,Hospital beds (per 10 000 population),7.53
1,Africa,Burundi,BDI,2014,Hospital beds (per 10 000 population),6.86
2,Africa,Burundi,BDI,2013,Hospital beds (per 10 000 population),7.11



Life Expectancy
Shape: (2680, 6)
Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']
First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,Africa,Angola,,2024,Estimated malaria incidence (per 1000 populati...,258.9
1,Africa,Angola,,2023,Estimated malaria incidence (per 1000 populati...,255.5
2,Africa,Angola,,2022,Estimated malaria incidence (per 1000 populati...,260.1



Malaria
Shape: (24420, 6)
Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']
First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,Africa,Angola,,2021,Life expectancy at birth (years),62.13
1,Africa,Angola,,2021,Life expectancy at birth (years),60.01
2,Africa,Angola,,2021,Life expectancy at birth (years),64.26



Overweight
Shape: (6567, 6)
Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']
First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,Africa,Angola,AGO,2022,"Prevalence of overweight among adults, BMI >= ...",31.78
1,Africa,Angola,AGO,2021,"Prevalence of overweight among adults, BMI >= ...",31.25
2,Africa,Angola,AGO,2020,"Prevalence of overweight among adults, BMI >= ...",30.73



Tuberculosis
Shape: (4902, 6)
Columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']
First few rows:


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,Africa,Angola,AGO,2024,Tuberculosis - Estimated incident cases,141000
1,Africa,Angola,AGO,2023,Tuberculosis - Estimated incident cases,141000
2,Africa,Angola,AGO,2022,Tuberculosis - Estimated incident cases,135000
