In [2]:
import pandas as pd

# Read the overweight-prevalence workbook; header=2 tells pandas to take the
# column names from the third row of the sheet (row index 2).
df_ow = pd.read_excel(
    'OVERWEIGHT_PERCENTAGE_AMONG_ADULTS_AGE_STANDARDIZED.xlsx',
    header=2,
)

# Profile every column: position, name, share of missing values, and number of
# distinct values, to decide which columns are worth keeping.
print(f"Original shape: {df_ow.shape}")
print(f"\nAll columns ({len(df_ow.columns)}):")
for position, name in enumerate(df_ow.columns):
    series = df_ow[name]
    missing_share = series.isnull().sum() / len(df_ow) * 100
    distinct = series.nunique()
    print(f"  [{position:2d}] {name:35s} | nulls: {missing_share:5.1f}% | unique: {distinct}")

Original shape: (19701, 34)

All columns (34):
  [ 0] IndicatorCode                       | nulls:   0.0% | unique: 1
  [ 1] Indicator                           | nulls:   0.0% | unique: 1
  [ 2] ValueType                           | nulls:   0.0% | unique: 1
  [ 3] ParentLocationCode                  | nulls:   0.0% | unique: 6
  [ 4] ParentLocation                      | nulls:   0.0% | unique: 6
  [ 5] Location type                       | nulls:   0.0% | unique: 1
  [ 6] SpatialDimValueCode                 | nulls:   0.0% | unique: 199
  [ 7] Location                            | nulls:   0.0% | unique: 199
  [ 8] Period type                         | nulls:   0.0% | unique: 1
  [ 9] Period                              | nulls:   0.0% | unique: 33
  [10] IsLatestYear                        | nulls:   0.0% | unique: 2
  [11] Dim1 type                           | nulls:   0.0% | unique: 1
  [12] Dim1                                | nulls:   0.0% | unique: 3
  [13] Dim1ValueCode     

In [3]:
# Report the row count and the distinct values of the sex dimension (Dim1)
# before narrowing the frame.
print(f"Rows before filter: {len(df_ow)}")
sex_categories = df_ow['Dim1'].unique().tolist()
print(f"Sex categories: {sex_categories}")

# Keep only the rows whose Dim1 value is 'Both sexes'; the other categories
# found in Dim1 are dropped.
is_both_sexes = df_ow['Dim1'].eq('Both sexes')
df_ow = df_ow.loc[is_both_sexes]

print(f"Rows after keeping only 'Both sexes': {len(df_ow)}")

Rows before filter: 19701
Sex categories: ['Both sexes', 'Male', 'Female']
Rows after keeping only 'Both sexes': 6567


In [4]:
# Keep only the region, country, year and numeric-value columns, then attach
# the constant indicator code/name as two extra columns.
#
# .assign() returns a brand-new frame instead of writing new columns into the
# result of a column selection. Assigning into such a selection is the pattern
# that triggers pandas' SettingWithCopyWarning (pandas without copy-on-write),
# because pandas cannot tell whether the write was meant for the original frame.
df_ow = df_ow[
    [
        'ParentLocationCode',
        'ParentLocation',
        'SpatialDimValueCode',
        'Location',
        'Period',
        'FactValueNumeric',
    ]
].assign(
    indicator_code='NCD_BMI_25A',
    indicator_name='Prevalence of overweight among adults, BMI >= 25 (%)',
)

print(f"Shape after selecting columns: {df_ow.shape}")
df_ow.head()

Shape after selecting columns: (6567, 8)


Unnamed: 0,ParentLocationCode,ParentLocation,SpatialDimValueCode,Location,Period,FactValueNumeric,indicator_code,indicator_name
0,AFR,Africa,AGO,Angola,2022,31.78,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ..."
3,AFR,Africa,AGO,Angola,2021,31.25,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ..."
6,AFR,Africa,AGO,Angola,2020,30.73,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ..."
9,AFR,Africa,AGO,Angola,2019,30.22,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ..."
12,AFR,Africa,AGO,Angola,2018,29.73,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ..."


In [5]:
# Translate the raw export headers into the snake_case names of the cleaned
# table, then put the columns in their final order (identifiers first, the
# measured value last).
df_ow = (
    df_ow
    .rename(columns=dict(
        ParentLocationCode='region_id',
        ParentLocation='region_name',
        SpatialDimValueCode='country_code',
        Location='country_name',
        Period='year',
        FactValueNumeric='value',
    ))
    .loc[:, [
        'region_id',
        'region_name',
        'country_code',
        'country_name',
        'year',
        'indicator_code',
        'indicator_name',
        'value',
    ]]
)

print("Final columns:", df_ow.columns.tolist())
df_ow.head(10)

Final columns: ['region_id', 'region_name', 'country_code', 'country_name', 'year', 'indicator_code', 'indicator_name', 'value']


Unnamed: 0,region_id,region_name,country_code,country_name,year,indicator_code,indicator_name,value
0,AFR,Africa,AGO,Angola,2022,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",31.78
3,AFR,Africa,AGO,Angola,2021,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",31.25
6,AFR,Africa,AGO,Angola,2020,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",30.73
9,AFR,Africa,AGO,Angola,2019,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",30.22
12,AFR,Africa,AGO,Angola,2018,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",29.73
15,AFR,Africa,AGO,Angola,2017,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",29.25
18,AFR,Africa,AGO,Angola,2016,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",28.77
21,AFR,Africa,AGO,Angola,2015,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",28.31
24,AFR,Africa,AGO,Angola,2014,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",27.85
27,AFR,Africa,AGO,Angola,2013,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",27.4


In [7]:
# Human-readable summary of the cleaned table: shape, dtypes, null counts and
# the coverage of countries, years and regions.
null_counts = df_ow.isnull().sum()

print("VALIDATION — Overweight Cleaned Data")
print(f"\nFinal shape: {df_ow.shape}")
print(f"\nData types:\n{df_ow.dtypes}")
print(f"\nNull values:\n{null_counts}")
print(f"\nCountries: {df_ow['country_name'].nunique()}")
print(f"Years: {df_ow['year'].min()} - {df_ow['year'].max()}")
print(f"Regions: {df_ow['region_name'].unique().tolist()}")

# Enforce the invariants the summary above is meant to confirm, so a broken
# input stops the notebook here instead of being exported silently.
assert null_counts.sum() == 0, (
    f"Unexpected null values:\n{null_counts[null_counts > 0]}"
)
# After the sex filter each country should appear at most once per year.
assert not df_ow.duplicated(subset=['country_code', 'year']).any(), (
    "Duplicate (country_code, year) rows found"
)
# The indicator is a percentage, so every value must lie within 0-100.
assert df_ow['value'].between(0, 100).all(), (
    "Found 'value' entries outside the 0-100 % range"
)

# Fixed seed so the spot-check rows are the same on every run.
df_ow.sample(5, random_state=42)

VALIDATION — Overweight Cleaned Data

Final shape: (6567, 8)

Data types:
region_id          object
region_name        object
country_code       object
country_name       object
year                int64
indicator_code     object
indicator_name     object
value             float64
dtype: object

Null values:
region_id         0
region_name       0
country_code      0
country_name      0
year              0
indicator_code    0
indicator_name    0
value             0
dtype: int64

Countries: 199
Years: 1990 - 2022
Regions: ['Africa', 'Americas', 'Eastern Mediterranean', 'Europe', 'South-East Asia', 'Western Pacific']


Unnamed: 0,region_id,region_name,country_code,country_name,year,indicator_code,indicator_name,value
7497,AMR,Americas,PRI,Puerto Rico,1998,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",56.89
15756,SEAR,South-East Asia,BTN,Bhutan,2017,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",40.24
13386,EUR,Europe,KGZ,Kyrgyzstan,2015,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",53.13
8367,EMR,Eastern Mediterranean,AFG,Afghanistan,2005,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",23.33
5754,AMR,Americas,CRI,Costa Rica,2018,NCD_BMI_25A,"Prevalence of overweight among adults, BMI >= ...",64.98


In [9]:
# Single source of truth for the exported file name.
output_path = 'OVERWEIGHT_CLEAN.xlsx'

# Write the workbook first so the export succeeds even when the Colab-only
# download helper is unavailable (e.g. when run in a local Jupyter kernel).
df_ow.to_excel(output_path, index=False)

try:
    # google.colab exists only inside a Colab runtime.
    from google.colab import files
except ImportError:
    # Outside Colab there is no browser-download helper; the file is already
    # on disk in the working directory.
    print(f"Saved {output_path} — {len(df_ow)} rows, {len(df_ow.columns)} columns")
else:
    files.download(output_path)
    print(f"Downloading {output_path} — {len(df_ow)} rows, {len(df_ow.columns)} columns")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Downloading OVERWEIGHT_CLEAN.xlsx — 6567 rows, 8 columns
