# Top 5 countries with the sharpest decline in fertility rates since 2019


## 1. Importing the data

In [1]:
import pandas as pd

# Read the raw wide-format World Bank fertility export into a dataframe
df = pd.read_csv(filepath_or_buffer='../data/fertility_global_wide-raw.csv')

## 2. Analysis of dataset shape and columns

In [2]:
# World Bank exports carry many metadata columns ahead of the yearly values,
# so look at the overall shape and only the first ten column names.
n_rows_cols = df.shape
leading_columns = df.columns[:10]
print(n_rows_cols)
print(leading_columns)
df.head()

(265, 105)
Index(['STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'REF_AREA', 'INDICATOR',
       'SEX', 'AGE', 'URBANISATION', 'UNIT_MEASURE'],
      dtype='str')


Unnamed: 0,STRUCTURE,STRUCTURE_ID,ACTION,FREQ,REF_AREA,INDICATOR,SEX,AGE,URBANISATION,UNIT_MEASURE,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,datastructure,WB.DATA360:DS_DATA360(1.3),I,A,TCA,WB_WDI_SP_DYN_TFRT_IN,_T,_T,_T,BR_W,...,1.729,1.718,1.713,1.704,1.677,1.617,1.552,1.501,1.477,1.463
1,datastructure,WB.DATA360:DS_DATA360(1.3),I,A,FRO,WB_WDI_SP_DYN_TFRT_IN,_T,_T,_T,BR_W,...,2.5767,2.4278,2.6393,2.4926,2.5118,2.4341,2.3498,2.325,2.0713,1.8586
2,datastructure,WB.DATA360:DS_DATA360(1.3),I,A,ALB,WB_WDI_SP_DYN_TFRT_IN,_T,_T,_T,BR_W,...,1.721,1.631,1.555,1.486,1.415,1.395,1.371,1.365,1.355,1.348
3,datastructure,WB.DATA360:DS_DATA360(1.3),I,A,SGP,WB_WDI_SP_DYN_TFRT_IN,_T,_T,_T,BR_W,...,1.25,1.24,1.2,1.16,1.14,1.14,1.1,1.12,1.04,0.97
4,datastructure,WB.DATA360:DS_DATA360(1.3),I,A,BOL,WB_WDI_SP_DYN_TFRT_IN,_T,_T,_T,BR_W,...,2.947,2.886,2.829,2.777,2.73,2.688,2.651,2.618,2.584,2.547


In [3]:
# List every column with its dtype; verbose=True asks pandas for the full
# per-column summary instead of a shortened one for this wide (105-column) frame.
df.info(verbose=True)

<class 'pandas.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 105 columns):
 #    Column                  Dtype  
---   ------                  -----  
 0    STRUCTURE               str    
 1    STRUCTURE_ID            str    
 2    ACTION                  str    
 3    FREQ                    str    
 4    REF_AREA                str    
 5    INDICATOR               str    
 6    SEX                     str    
 7    AGE                     str    
 8    URBANISATION            str    
 9    UNIT_MEASURE            str    
 10   COMP_BREAKDOWN_1        str    
 11   COMP_BREAKDOWN_2        str    
 12   COMP_BREAKDOWN_3        str    
 13   AGG_METHOD              str    
 14   UNIT_TYPE               str    
 15   DECIMALS                int64  
 16   DATABASE_ID             str    
 17   TIME_FORMAT             str    
 18   COMMENT_TS              str    
 19   UNIT_MULT               int64  
 20   DATA_SOURCE             str    
 21   OBS_CONF                s

## 3. Converting the dataset to long format

In [39]:
# The raw table is wide (one column per year). Reshape it to long format so that
# every row holds a single (area, year, fertility) observation.
year_columns = list(map(str, range(2000, 2024)))  # keep only the years 2000 through 2023

df_long = (
    pd.melt(
        df,
        id_vars=["REF_AREA_LABEL"],
        value_vars=year_columns,
        var_name="Year",
        value_name="Fertility",
    )
    # Year values come from column names (strings); cast to int so that
    # numeric comparisons such as Year == 2019 work as expected.
    .astype({"Year": int})
    .sort_values(by=["REF_AREA_LABEL", "Year", "Fertility"])
)

## 4. Verify integrity of data

In [40]:
# Preview the first 5 rows to sanity-check the reshaped data
df_long.head(n=5)

Unnamed: 0,REF_AREA_LABEL,Year,Fertility
156,Afghanistan,2000,7.566
421,Afghanistan,2001,7.453
686,Afghanistan,2002,7.32
951,Afghanistan,2003,7.174
1216,Afghanistan,2004,7.018


In [41]:
# Preview the last 5 rows to sanity-check the reshaped data
df_long.tail(n=5)

Unnamed: 0,REF_AREA_LABEL,Year,Fertility
5123,Zimbabwe,2019,3.748
5388,Zimbabwe,2020,3.754
5653,Zimbabwe,2021,3.765
5918,Zimbabwe,2022,3.767
6183,Zimbabwe,2023,3.724


In [7]:
# Count missing entries in the Year column (0 means every row has a year)
df_long['Year'].isna().sum()

np.int64(0)

In [42]:
# Count missing entries in the Fertility column (0 means no gaps in the values)
df_long['Fertility'].isna().sum()

np.int64(0)

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns Year and Fertility, as a quick plausibility check of the value ranges.
df_long.describe()

Unnamed: 0,Year,Fertility
count,6360.0,6360.0
mean,2011.5,2.834922
std,6.922731,1.45657
min,2000.0,0.586
25%,2005.75,1.70988
50%,2011.5,2.324037
75%,2017.25,3.71125
max,2023.0,7.829


## 5. Examine percentage changes in fertility between the years

### 5.1. Identify which countries faced the sharpest decline in fertility between 2019 and 2023

In [43]:
# Pull the 2019 and 2023 fertility values as two separate per-area tables
value_columns = ["REF_AREA_LABEL", "Fertility"]
start = df_long.loc[df_long["Year"] == 2019, value_columns].copy()
end = df_long.loc[df_long["Year"] == 2023, value_columns].copy()

# Join them side by side on the area label; the suffixes mark which year
# each Fertility column belongs to.
merged = pd.merge(start, end, on="REF_AREA_LABEL", suffixes=("_2019", "_2023"))

merged.head()

Unnamed: 0,REF_AREA_LABEL,Fertility_2019,Fertility_2023
0,Afghanistan,5.238,4.84
1,Albania,1.395,1.348
2,Algeria,2.997,2.766
3,American Samoa,2.404,2.286
4,Andorra,1.045,1.082


In [11]:
# Relative change from 2019 to 2023 in percent: (new - old) / old * 100
fertility_2019 = merged["Fertility_2019"]
fertility_2023 = merged["Fertility_2023"]
merged["Percent_Change"] = fertility_2023.sub(fertility_2019).div(fertility_2019).mul(100)

In [13]:
# Order ascending so the most negative change (sharpest decline) comes first,
# then keep the five leading rows.
decline = merged.sort_values(by="Percent_Change", ascending=True)
top5_decline = decline.iloc[:5]

top5_decline

Unnamed: 0,REF_AREA_LABEL,Fertility_2019,Fertility_2023,Percent_Change
147,"Macao SAR, China",0.899,0.586,-34.816463
44,China,1.496,0.999,-33.221925
103,"Hong Kong SAR, China",1.064,0.751,-29.417293
128,Kuwait,2.082,1.524,-26.801153
52,Curaçao,1.6,1.2,-25.0


### 5.2. Identify the percentage change in fertility for the previously determined countries between 2014 and 2018

In [52]:
# Reuse the five areas ranked in section 5.1 rather than retyping their names,
# so this step stays in sync with the ranking if the underlying data changes.
selected_countries = top5_decline["REF_AREA_LABEL"].tolist()

# Row filter shared by both years: only the selected areas
is_selected = df_long["REF_AREA_LABEL"].isin(selected_countries)

# Fertility of the selected areas at the start (2014) and end (2018) of the
# earlier comparison period
start_2014 = df_long[
    (df_long["Year"] == 2014) & is_selected
][["REF_AREA_LABEL", "Fertility"]].copy()

end_2018 = df_long[
    (df_long["Year"] == 2018) & is_selected
][["REF_AREA_LABEL", "Fertility"]].copy()

# Join both years side by side on the area label; the suffixes mark the year
merged_old = start_2014.merge(
    end_2018,
    on="REF_AREA_LABEL",
    suffixes=("_2014", "_2018")
)

In [53]:
# Relative change from 2014 to 2018 in percent: (new - old) / old * 100
fertility_2014 = merged_old["Fertility_2014"]
fertility_2018 = merged_old["Fertility_2018"]
merged_old["Percent_Change"] = fertility_2018.sub(fertility_2014).div(fertility_2014).mul(100)

# Most negative change first
decline_old = merged_old.sort_values(by="Percent_Change", ascending=True)
decline_old.head()

Unnamed: 0,REF_AREA_LABEL,Fertility_2014,Fertility_2018,Percent_Change
4,"Macao SAR, China",1.224,0.915,-25.245098
1,Curaçao,2.0,1.7,-15.0
0,China,1.769,1.539,-13.001696
2,"Hong Kong SAR, China",1.235,1.08,-12.550607
3,Kuwait,2.06,2.118,2.815534


### 5.3. Compare the changes in fertility from 2014-2018 to the changes from 2019-2023

In [56]:
# Keep only the label and the percent change of each period, renaming the
# change column so both periods can sit next to each other in one table.
decline_recent = top5_decline[["REF_AREA_LABEL", "Percent_Change"]].rename(
    columns={"Percent_Change": "Percent_Change_2019_2023"}
)

decline_old = merged_old[["REF_AREA_LABEL", "Percent_Change"]].rename(
    columns={"Percent_Change": "Percent_Change_2014_2018"}
)

# Left join: every area from the 2019-2023 top five is kept, and its
# 2014-2018 change is attached where a matching label exists.
comparison = pd.merge(
    decline_recent,
    decline_old,
    on="REF_AREA_LABEL",
    how="left",
)

# Sharpest recent decline first
comparison = comparison.sort_values(by="Percent_Change_2019_2023")
comparison

Unnamed: 0,REF_AREA_LABEL,Percent_Change_2019_2023,Percent_Change_2014_2018
0,"Macao SAR, China",-34.816463,-25.245098
1,China,-33.221925,-13.001696
2,"Hong Kong SAR, China",-29.417293,-12.550607
3,Kuwait,-26.801153,2.815534
4,Curaçao,-25.0,-15.0


### 5.4. Identify worldwide fertility changes between 2019 and 2023 in %

In [57]:
# Fertility of every area in 2019 and in 2023
is_2019 = df_long["Year"].eq(2019)
is_2023 = df_long["Year"].eq(2023)
start_ww_new = df_long.loc[is_2019, ["REF_AREA_LABEL", "Fertility"]].copy()
end_ww_new = df_long.loc[is_2023, ["REF_AREA_LABEL", "Fertility"]].copy()

# Join both years side by side on the area label
merged_ww_new = pd.merge(
    start_ww_new,
    end_ww_new,
    on="REF_AREA_LABEL",
    suffixes=("_2019", "_2023"),
)

# Relative change per area in percent: (new - old) / old * 100
merged_ww_new["Percent_Change"] = (
    merged_ww_new["Fertility_2023"]
    .sub(merged_ww_new["Fertility_2019"])
    .div(merged_ww_new["Fertility_2019"])
    .mul(100)
)

# NOTE(review): this is an unweighted mean over every REF_AREA_LABEL row, so each
# row counts equally regardless of population; if the source also contains
# regional aggregates they are averaged in as well. Confirm this is intended.
merged_ww_new["Percent_Change"].mean()

np.float64(-6.73083255071215)

### 5.5. Identify worldwide fertility changes between 2014 and 2018 in %

In [35]:
# Fertility of every area in 2014 and in 2018
is_2014 = df_long["Year"].eq(2014)
is_2018 = df_long["Year"].eq(2018)
start_ww_old = df_long.loc[is_2014, ["REF_AREA_LABEL", "Fertility"]].copy()
end_ww_old = df_long.loc[is_2018, ["REF_AREA_LABEL", "Fertility"]].copy()

# Join both years side by side on the area label
merged_ww_old = pd.merge(
    start_ww_old,
    end_ww_old,
    on="REF_AREA_LABEL",
    suffixes=("_2014", "_2018"),
)

# Relative change per area in percent: (new - old) / old * 100
merged_ww_old["Percent_Change"] = (
    merged_ww_old["Fertility_2018"]
    .sub(merged_ww_old["Fertility_2014"])
    .div(merged_ww_old["Fertility_2014"])
    .mul(100)
)

# NOTE(review): this is an unweighted mean over every REF_AREA_LABEL row, so each
# row counts equally regardless of population; if the source also contains
# regional aggregates they are averaged in as well. Confirm this is intended.
merged_ww_old["Percent_Change"].mean()

np.float64(-6.098271051337685)