In [6]:
import pandas as pd

# header=2 tells pandas to take the column names from the third row of the sheet.
df_ow = pd.read_excel(
    'OVERWEIGHT_PERCENTAGE_AMONG_ADULTS_AGE_STANDARDIZED.xlsx',
    header=2,
)

print(f"Original shape: {df_ow.shape}")
print(f"\nAll columns ({len(df_ow.columns)}):")

# Per-column profile: percentage of missing values and number of distinct
# values, computed for the whole frame at once and printed one line per column.
null_share = df_ow.isnull().sum() / len(df_ow) * 100
distinct_counts = df_ow.nunique()
for pos, (name, pct, n_distinct) in enumerate(zip(df_ow.columns, null_share, distinct_counts)):
    print(f"  [{pos:2d}] {name:35s} | nulls: {pct:5.1f}% | unique: {n_distinct}")

Original shape: (19701, 34)

All columns (34):
  [ 0] IndicatorCode                       | nulls:   0.0% | unique: 1
  [ 1] Indicator                           | nulls:   0.0% | unique: 1
  [ 2] ValueType                           | nulls:   0.0% | unique: 1
  [ 3] ParentLocationCode                  | nulls:   0.0% | unique: 6
  [ 4] ParentLocation                      | nulls:   0.0% | unique: 6
  [ 5] Location type                       | nulls:   0.0% | unique: 1
  [ 6] SpatialDimValueCode                 | nulls:   0.0% | unique: 199
  [ 7] Location                            | nulls:   0.0% | unique: 199
  [ 8] Period type                         | nulls:   0.0% | unique: 1
  [ 9] Period                              | nulls:   0.0% | unique: 33
  [10] IsLatestYear                        | nulls:   0.0% | unique: 2
  [11] Dim1 type                           | nulls:   0.0% | unique: 1
  [12] Dim1                                | nulls:   0.0% | unique: 3
  [13] Dim1ValueCode     

In [7]:
print(f"Rows before filter: {len(df_ow)}")
print(f"Sex categories: {df_ow['Dim1'].unique().tolist()}")

# Dim1 holds the sex category; keep the combined 'Both sexes' rows and drop
# the rows for the other categories.
is_both_sexes = df_ow['Dim1'].eq('Both sexes')
df_ow = df_ow.loc[is_both_sexes]

print(f"Rows after keeping only 'Both sexes': {len(df_ow)}")

Rows before filter: 19701
Sex categories: ['Both sexes', 'Male', 'Female']
Rows after keeping only 'Both sexes': 6567


In [8]:
# Keep only the location, period and numeric-value columns, and tag every row
# with the indicator label.
#
# The label is added with `.assign` rather than `df_ow['indicator_name'] = ...`:
# `.assign` builds the column on a fresh copy, so nothing is written into a
# frame that pandas still tracks as a slice of the filtered data (the pattern
# that raises SettingWithCopyWarning). The new column is appended last, so the
# column order is the same as with item assignment.
df_ow = df_ow[
    ['ParentLocation', 'SpatialDimValueCode', 'Location', 'Period', 'FactValueNumeric']
].assign(indicator_name='Prevalence of overweight among adults, BMI >= 25 (%)')

print(f"Shape after selecting columns: {df_ow.shape}")
df_ow.head()

Shape after selecting columns: (6567, 6)


Unnamed: 0,ParentLocation,SpatialDimValueCode,Location,Period,FactValueNumeric,indicator_name
0,Africa,AGO,Angola,2022,31.78,"Prevalence of overweight among adults, BMI >= ..."
3,Africa,AGO,Angola,2021,31.25,"Prevalence of overweight among adults, BMI >= ..."
6,Africa,AGO,Angola,2020,30.73,"Prevalence of overweight among adults, BMI >= ..."
9,Africa,AGO,Angola,2019,30.22,"Prevalence of overweight among adults, BMI >= ..."
12,Africa,AGO,Angola,2018,29.73,"Prevalence of overweight among adults, BMI >= ..."


In [9]:
# Map the source column names onto the tidy schema, then put the columns in
# their final order in the same chained expression.
df_ow = df_ow.rename(
    columns={
        'ParentLocation': 'region',
        'Location': 'country',
        'SpatialDimValueCode': 'country_code',
        'Period': 'year',
        'indicator_name': 'indicator',
        'FactValueNumeric': 'indicator_value',
    }
)[['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']]

print("Final columns:", df_ow.columns.tolist())
df_ow.head(n=10)

Final columns: ['region', 'country', 'country_code', 'year', 'indicator', 'indicator_value']


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
0,Africa,Angola,AGO,2022,"Prevalence of overweight among adults, BMI >= ...",31.78
3,Africa,Angola,AGO,2021,"Prevalence of overweight among adults, BMI >= ...",31.25
6,Africa,Angola,AGO,2020,"Prevalence of overweight among adults, BMI >= ...",30.73
9,Africa,Angola,AGO,2019,"Prevalence of overweight among adults, BMI >= ...",30.22
12,Africa,Angola,AGO,2018,"Prevalence of overweight among adults, BMI >= ...",29.73
15,Africa,Angola,AGO,2017,"Prevalence of overweight among adults, BMI >= ...",29.25
18,Africa,Angola,AGO,2016,"Prevalence of overweight among adults, BMI >= ...",28.77
21,Africa,Angola,AGO,2015,"Prevalence of overweight among adults, BMI >= ...",28.31
24,Africa,Angola,AGO,2014,"Prevalence of overweight among adults, BMI >= ...",27.85
27,Africa,Angola,AGO,2013,"Prevalence of overweight among adults, BMI >= ...",27.4


In [11]:
# Summary report for the cleaned frame: shape, dtypes, null counts, and the
# coverage of countries, years and regions. All lines go through one print
# call, joined with newlines.
print(
    "VALIDATION — Overweight Cleaned Data",
    f"\nFinal shape: {df_ow.shape}",
    f"\nData types:\n{df_ow.dtypes}",
    f"\nNull values:\n{df_ow.isna().sum()}",
    f"\nCountries: {df_ow['country'].nunique()}",
    f"Years: {df_ow['year'].min()} – {df_ow['year'].max()}",
    f"Regions: {df_ow['region'].unique().tolist()}",
    sep="\n",
)

# Fixed seed so the displayed spot-check rows are the same on every run.
df_ow.sample(n=5, random_state=42)

VALIDATION — Overweight Cleaned Data

Final shape: (6567, 6)

Data types:
region              object
country             object
country_code        object
year                 int64
indicator           object
indicator_value    float64
dtype: object

Null values:
region             0
country            0
country_code       0
year               0
indicator          0
indicator_value    0
dtype: int64

Countries: 199
Years: 1990 – 2022
Regions: ['Africa', 'Americas', 'Eastern Mediterranean', 'Europe', 'South-East Asia', 'Western Pacific']


Unnamed: 0,region,country,country_code,year,indicator,indicator_value
7497,Americas,Puerto Rico,PRI,1998,"Prevalence of overweight among adults, BMI >= ...",56.89
15756,South-East Asia,Bhutan,BTN,2017,"Prevalence of overweight among adults, BMI >= ...",40.24
13386,Europe,Kyrgyzstan,KGZ,2015,"Prevalence of overweight among adults, BMI >= ...",53.13
8367,Eastern Mediterranean,Afghanistan,AFG,2005,"Prevalence of overweight among adults, BMI >= ...",23.33
5754,Americas,Costa Rica,CRI,2018,"Prevalence of overweight among adults, BMI >= ...",64.98


In [12]:
# Write the cleaned frame to an Excel workbook, then offer it as a browser
# download when the notebook is running inside Google Colab.
output_path = 'overweight_clean.xlsx'

# index=False keeps the non-contiguous row labels left over from the earlier
# row filter out of the exported file.
df_ow.to_excel(output_path, index=False)

try:
    # google.colab is only importable inside a Colab runtime. Importing it here,
    # after the file is written, lets the cell still produce the workbook in a
    # plain Jupyter environment instead of failing with ImportError up front.
    from google.colab import files
except ImportError:
    print(f"Saved {output_path} — {len(df_ow)} rows, {len(df_ow.columns)} columns")
else:
    files.download(output_path)
    print(f"Downloading {output_path} — {len(df_ow)} rows, {len(df_ow.columns)} columns")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading overweight_clean.xlsx — 6567 rows, 6 columns
