In [16]:
# Load the tuberculosis workbook; header=2 takes the column names from the
# row at index 2 of the sheet.
df_tb = pd.read_excel('TUBERCULOSIS_INCIDENT_CASES.xlsx', header=2)

# Row count is the denominator for every per-column null percentage below.
tb_row_count = len(df_tb)

print(f"Original shape: {df_tb.shape}")
print(f"\nAll columns ({len(df_tb.columns)}):")

# One line per column: position, name, share of missing values, distinct values.
for i, col in enumerate(df_tb.columns):
    tb_column = df_tb[col]
    null_pct = tb_column.isnull().sum() / tb_row_count * 100
    print(f"  [{i:2d}] {col:35s} | nulls: {null_pct:5.1f}% | unique: {tb_column.nunique()}")

Original shape: (4902, 34)

All columns (34):
  [ 0] IndicatorCode                       | nulls:   0.0% | unique: 1
  [ 1] Indicator                           | nulls:   0.0% | unique: 1
  [ 2] ValueType                           | nulls:   0.0% | unique: 1
  [ 3] ParentLocationCode                  | nulls:   0.0% | unique: 7
  [ 4] ParentLocation                      | nulls:   0.0% | unique: 7
  [ 5] Location type                       | nulls:   0.0% | unique: 1
  [ 6] SpatialDimValueCode                 | nulls:   0.0% | unique: 197
  [ 7] Location                            | nulls:   0.0% | unique: 197
  [ 8] Period type                         | nulls:   0.0% | unique: 1
  [ 9] Period                              | nulls:   0.0% | unique: 25
  [10] IsLatestYear                        | nulls:   0.0% | unique: 2
  [11] Dim1 type                           | nulls: 100.0% | unique: 0
  [12] Dim1                                | nulls: 100.0% | unique: 0
  [13] Dim1ValueCode      

In [17]:
# Source columns that have a counterpart in our E-R model.
tb_source_columns = [
    'ParentLocationCode',
    'ParentLocation',
    'SpatialDimValueCode',
    'Location',
    'Period',
    'FactValueNumeric',
]

# Narrow the frame and attach the indicator identity in one expression.
# The indicator is constant for this dataset, so both new columns hold a
# single repeated value.
df_tb = df_tb[tb_source_columns].assign(
    indicator_code='TB_e_inc_num',
    indicator_name='Tuberculosis - Estimated incident cases',
)

print(f"Shape after selecting columns: {df_tb.shape}")
print(f"Columns: {list(df_tb.columns)}")
df_tb.head()

Shape after selecting columns: (4902, 8)
Columns: ['ParentLocationCode', 'ParentLocation', 'SpatialDimValueCode', 'Location', 'Period', 'FactValueNumeric', 'indicator_code', 'indicator_name']


Unnamed: 0,ParentLocationCode,ParentLocation,SpatialDimValueCode,Location,Period,FactValueNumeric,indicator_code,indicator_name
0,AFR,Africa,AGO,Angola,2024,141000,TB_e_inc_num,Tuberculosis - Estimated incident cases
1,AFR,Africa,AGO,Angola,2023,141000,TB_e_inc_num,Tuberculosis - Estimated incident cases
2,AFR,Africa,AGO,Angola,2022,135000,TB_e_inc_num,Tuberculosis - Estimated incident cases
3,AFR,Africa,AGO,Angola,2021,128000,TB_e_inc_num,Tuberculosis - Estimated incident cases
4,AFR,Africa,AGO,Angola,2020,125000,TB_e_inc_num,Tuberculosis - Estimated incident cases


In [18]:
# Source header -> E-R model attribute name.
# The two indicator columns already carry their final names; they are listed
# so the mapping mentions every column of the frame.
tb_column_names = {
    'ParentLocationCode': 'region_id',
    'ParentLocation': 'region_name',
    'SpatialDimValueCode': 'country_code',
    'Location': 'country_name',
    'Period': 'year',
    'FactValueNumeric': 'value',
    'indicator_code': 'indicator_code',
    'indicator_name': 'indicator_name',
}

# Column order of the E-R model: region, country, year, indicator, value.
tb_column_order = [
    'region_id', 'region_name',
    'country_code', 'country_name',
    'year',
    'indicator_code', 'indicator_name',
    'value',
]

# Rename first, then select in model order.
df_tb = df_tb.rename(columns=tb_column_names)[tb_column_order]

print("Final columns:")
print(list(df_tb.columns))
df_tb.head(10)

Final columns:
['region_id', 'region_name', 'country_code', 'country_name', 'year', 'indicator_code', 'indicator_name', 'value']


Unnamed: 0,region_id,region_name,country_code,country_name,year,indicator_code,indicator_name,value
0,AFR,Africa,AGO,Angola,2024,TB_e_inc_num,Tuberculosis - Estimated incident cases,141000
1,AFR,Africa,AGO,Angola,2023,TB_e_inc_num,Tuberculosis - Estimated incident cases,141000
2,AFR,Africa,AGO,Angola,2022,TB_e_inc_num,Tuberculosis - Estimated incident cases,135000
3,AFR,Africa,AGO,Angola,2021,TB_e_inc_num,Tuberculosis - Estimated incident cases,128000
4,AFR,Africa,AGO,Angola,2020,TB_e_inc_num,Tuberculosis - Estimated incident cases,125000
5,AFR,Africa,AGO,Angola,2019,TB_e_inc_num,Tuberculosis - Estimated incident cases,126000
6,AFR,Africa,AGO,Angola,2018,TB_e_inc_num,Tuberculosis - Estimated incident cases,126000
7,AFR,Africa,AGO,Angola,2017,TB_e_inc_num,Tuberculosis - Estimated incident cases,126000
8,AFR,Africa,AGO,Angola,2016,TB_e_inc_num,Tuberculosis - Estimated incident cases,126000
9,AFR,Africa,AGO,Angola,2015,TB_e_inc_num,Tuberculosis - Estimated incident cases,125000


In [21]:

# Gather the summary figures first so the report below is plain printing.
tb_null_counts = df_tb.isnull().sum()
tb_country_count = df_tb['country_name'].nunique()
tb_first_year = df_tb['year'].min()
tb_last_year = df_tb['year'].max()
tb_region_names = df_tb['region_name'].unique().tolist()

# Report: structure (shape, dtypes, nulls) followed by coverage
# (countries, year span, regions).
print("Tuberculosis Cleaned Data")
print(f"\nFinal shape: {df_tb.shape}")
print(f"\nData types:\n{df_tb.dtypes}")
print(f"\nNull values:\n{tb_null_counts}")
print(f"\nCountries: {tb_country_count}")
print(f"Years: {tb_first_year} - {tb_last_year}")
print(f"Regions: {tb_region_names}")
print("\nSample data:")

# Fixed random_state keeps the displayed sample the same on every run.
df_tb.sample(5, random_state=42)

Tuberculosis Cleaned Data

Final shape: (4902, 8)

Data types:
region_id         object
region_name       object
country_code      object
country_name      object
year               int64
indicator_code    object
indicator_name    object
value              int64
dtype: object

Null values:
region_id         0
region_name       0
country_code      0
country_name      0
year              0
indicator_code    0
indicator_name    0
value             0
dtype: int64

Countries: 197
Years: 2000 - 2024
Regions: ['Africa', 'Americas', 'Eastern Mediterranean', 'Europe', 'Global', 'South-East Asia', 'Western Pacific']

Sample data:


Unnamed: 0,region_id,region_name,country_code,country_name,year,indicator_code,indicator_name,value
4751,WPR,Western Pacific,SGP,Singapore,2000,TB_e_inc_num,Tuberculosis - Estimated incident cases,3100
3541,EUR,Europe,NLD,Netherlands (Kingdom of the),2017,TB_e_inc_num,Tuberculosis - Estimated incident cases,890
907,AFR,Africa,SSD,South Sudan,2017,TB_e_inc_num,Tuberculosis - Estimated incident cases,38000
2833,EUR,Europe,BLR,Belarus,2005,TB_e_inc_num,Tuberculosis - Estimated incident cases,10000
3106,EUR,Europe,GEO,Georgia,2007,TB_e_inc_num,Tuberculosis - Estimated incident cases,7600


In [None]:
# Export df_ow to OVERWEIGHT_CLEAN.csv and OVERWEIGHT_CLEAN.xlsx (without the
# index) and report the size of each export.
# NOTE(review): this cell directly follows the tuberculosis summary but writes
# df_ow, not df_tb — confirm df_tb is saved elsewhere and that this is not a
# copy-paste leftover from the overweight section.
ow_row_count = len(df_ow)
ow_column_count = len(df_ow.columns)

df_ow.to_csv('../DATA_CLEANING/OVERWEIGHT_CLEAN.csv', index=False)
print(f"Saved OVERWEIGHT_CLEAN.csv — {ow_row_count} rows, {ow_column_count} columns")

df_ow.to_excel('../DATA_CLEANING/OVERWEIGHT_CLEAN.xlsx', index=False)
print(f"Saved OVERWEIGHT_CLEAN.xlsx — {ow_row_count} rows, {ow_column_count} columns")