# World Life Expectancy Data Cleaning

This notebook cleans and prepares the World Life Expectancy dataset for analysis. The dataset provides global life expectancy data across different countries, which will be used to examine potential correlations with air quality, asthma prevalence, and quality of life indicators. The cleaned data will be integrated with other datasets in this project to support a comprehensive analysis of health and environmental factors globally.


In [16]:
# Dependencies
import pandas as pd
from pathlib import Path
import numpy as np

# Location of the raw life expectancy CSV export, relative to the notebook's working directory
life_expectancy_load = Path("Raw_Data/World Life Expectancy.csv")

# Read the file as-is; metadata rows at the top of the sheet are handled in later cells
life_expectancy_df = pd.read_csv(filepath_or_buffer=life_expectancy_load)

# Show the first five rows to check how the sheet was parsed
life_expectancy_df.head(n=5)


Unnamed: 0,Data Source,World Development Indicators,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66
0,,,,,,,,,,,...,,,,,,,,,,
1,Last Updated Date,29/06/2023,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,Country Name,Country Code,Indicator Name,Indicator Code,1960.0,1961.0,1962.0,1963.0,1964.0,1965.0,...,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0
4,Aruba,ABW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,64.152,64.537,64.752,65.132,65.294,65.502,...,75.636,75.601,75.683,75.617,75.903,76.072,76.248,75.723,74.626,


In [17]:
# List every column label so the positions of the placeholder 'Unnamed: N' columns are visible
life_expectancy_df.columns.tolist()

['Data Source',
 'World Development Indicators',
 'Unnamed: 2',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Unnamed: 6',
 'Unnamed: 7',
 'Unnamed: 8',
 'Unnamed: 9',
 'Unnamed: 10',
 'Unnamed: 11',
 'Unnamed: 12',
 'Unnamed: 13',
 'Unnamed: 14',
 'Unnamed: 15',
 'Unnamed: 16',
 'Unnamed: 17',
 'Unnamed: 18',
 'Unnamed: 19',
 'Unnamed: 20',
 'Unnamed: 21',
 'Unnamed: 22',
 'Unnamed: 23',
 'Unnamed: 24',
 'Unnamed: 25',
 'Unnamed: 26',
 'Unnamed: 27',
 'Unnamed: 28',
 'Unnamed: 29',
 'Unnamed: 30',
 'Unnamed: 31',
 'Unnamed: 32',
 'Unnamed: 33',
 'Unnamed: 34',
 'Unnamed: 35',
 'Unnamed: 36',
 'Unnamed: 37',
 'Unnamed: 38',
 'Unnamed: 39',
 'Unnamed: 40',
 'Unnamed: 41',
 'Unnamed: 42',
 'Unnamed: 43',
 'Unnamed: 44',
 'Unnamed: 45',
 'Unnamed: 46',
 'Unnamed: 47',
 'Unnamed: 48',
 'Unnamed: 49',
 'Unnamed: 50',
 'Unnamed: 51',
 'Unnamed: 52',
 'Unnamed: 53',
 'Unnamed: 54',
 'Unnamed: 55',
 'Unnamed: 56',
 'Unnamed: 57',
 'Unnamed: 58',
 'Unnamed: 59',
 'Unnamed: 60',
 'Unnamed: 61',

In [18]:
# Drop unwanted columns and rows with metadata
# Start by selecting relevant rows (data starts from index 3) and dropping unnecessary columns
# Trim the sheet down to the part that is needed:
#   * rows 0-2 are file metadata, so keep everything from row 3 (the real header row) onwards
#   * 'Unnamed: 2' through 'Unnamed: 53' are removed, keeping the two leading columns
#     and every column from 'Unnamed: 54' onwards
columns_to_drop = [f"Unnamed: {position}" for position in range(2, 54)]
life_expectancy_df = life_expectancy_df.iloc[3:].drop(columns=columns_to_drop)

In [19]:
# Inspect the trimmed frame; its first row (index 3) still holds the real header values
life_expectancy_df

Unnamed: 0,Data Source,World Development Indicators,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66
3,Country Name,Country Code,2010.000000,2011.000000,2012.000000,2013.000000,2014.000000,2015.000000,2016.000000,2017.000000,2018.000000,2019.000000,2020.000000,2021.000000,2022.0
4,Aruba,ABW,75.404000,75.465000,75.531000,75.636000,75.601000,75.683000,75.617000,75.903000,76.072000,76.248000,75.723000,74.626000,
5,Africa Eastern and Southern,AFE,58.411150,59.293271,60.050780,60.709870,61.337917,61.856458,62.444050,62.922390,63.365863,63.755678,63.313860,62.454590,
6,Afghanistan,AFG,60.851000,61.419000,61.923000,62.417000,62.545000,62.659000,63.136000,63.016000,63.081000,63.565000,62.575000,61.982000,
7,Africa Western and Central,AFW,54.550169,55.013138,55.340561,55.673406,55.922229,56.195872,56.581678,56.888446,57.189139,57.555796,57.226373,56.988657,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,Kosovo,XKX,77.630000,77.740000,78.280000,78.587000,78.880000,78.922000,78.981000,78.783000,78.696000,79.022000,76.567000,76.806000,
266,"Yemen, Rep.",YEM,67.280000,67.419000,67.343000,67.545000,67.384000,65.873000,66.064000,65.957000,64.575000,65.092000,64.650000,63.753000,
267,South Africa,ZAF,58.899000,60.651000,61.846000,62.533000,63.380000,63.950000,64.747000,65.402000,65.674000,66.175000,65.252000,62.341000,
268,Zambia,ZMB,56.799000,57.771000,58.867000,59.878000,60.699000,61.208000,61.794000,62.120000,62.342000,62.793000,62.380000,61.223000,


In [20]:
# Promote the first remaining row to column labels, then discard that row and renumber from 0
header_row = life_expectancy_df.iloc[0]
life_expectancy_df.columns = header_row
life_expectancy_df = life_expectancy_df.iloc[1:].reset_index(drop=True)

In [21]:
# Check the frame after the header row was promoted and the index was reset
life_expectancy_df

3,Country Name,Country Code,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0
0,Aruba,ABW,75.404000,75.465000,75.531000,75.636000,75.601000,75.683000,75.617000,75.903000,76.072000,76.248000,75.723000,74.626000,
1,Africa Eastern and Southern,AFE,58.411150,59.293271,60.050780,60.709870,61.337917,61.856458,62.444050,62.922390,63.365863,63.755678,63.313860,62.454590,
2,Afghanistan,AFG,60.851000,61.419000,61.923000,62.417000,62.545000,62.659000,63.136000,63.016000,63.081000,63.565000,62.575000,61.982000,
3,Africa Western and Central,AFW,54.550169,55.013138,55.340561,55.673406,55.922229,56.195872,56.581678,56.888446,57.189139,57.555796,57.226373,56.988657,
4,Angola,AGO,56.726000,57.596000,58.623000,59.307000,60.040000,60.655000,61.092000,61.680000,62.144000,62.448000,62.261000,61.643000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,77.630000,77.740000,78.280000,78.587000,78.880000,78.922000,78.981000,78.783000,78.696000,79.022000,76.567000,76.806000,
262,"Yemen, Rep.",YEM,67.280000,67.419000,67.343000,67.545000,67.384000,65.873000,66.064000,65.957000,64.575000,65.092000,64.650000,63.753000,
263,South Africa,ZAF,58.899000,60.651000,61.846000,62.533000,63.380000,63.950000,64.747000,65.402000,65.674000,66.175000,65.252000,62.341000,
264,Zambia,ZMB,56.799000,57.771000,58.867000,59.878000,60.699000,61.208000,61.794000,62.120000,62.342000,62.793000,62.380000,61.223000,


In [22]:
# Remove columns that contain no data at all (the trailing 2022 column appears empty in the
# preview above) instead of always cutting the last column: re-running this cell can then
# never discard a populated year.
def _to_numeric_if_possible(column):
    """Return the column converted to a numeric dtype, or unchanged if it holds non-numeric text."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Text columns such as the country name/code cannot be parsed; keep them as they are
        return column


# pd.to_numeric(errors='ignore') is deprecated; catching the conversion error explicitly
# keeps the same "convert when possible" behaviour. Numeric values are rounded to 2 decimals.
life_expectancy_df = (
    life_expectancy_df
    .dropna(axis="columns", how="all")
    .apply(_to_numeric_if_possible)
    .round(2)
)
life_expectancy_df

3,Country Name,Country Code,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0
0,Aruba,ABW,75.40,75.46,75.53,75.64,75.60,75.68,75.62,75.90,76.07,76.25,75.72,74.63
1,Africa Eastern and Southern,AFE,58.41,59.29,60.05,60.71,61.34,61.86,62.44,62.92,63.37,63.76,63.31,62.45
2,Afghanistan,AFG,60.85,61.42,61.92,62.42,62.54,62.66,63.14,63.02,63.08,63.56,62.58,61.98
3,Africa Western and Central,AFW,54.55,55.01,55.34,55.67,55.92,56.20,56.58,56.89,57.19,57.56,57.23,56.99
4,Angola,AGO,56.73,57.60,58.62,59.31,60.04,60.66,61.09,61.68,62.14,62.45,62.26,61.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,77.63,77.74,78.28,78.59,78.88,78.92,78.98,78.78,78.70,79.02,76.57,76.81
262,"Yemen, Rep.",YEM,67.28,67.42,67.34,67.54,67.38,65.87,66.06,65.96,64.58,65.09,64.65,63.75
263,South Africa,ZAF,58.90,60.65,61.85,62.53,63.38,63.95,64.75,65.40,65.67,66.18,65.25,62.34
264,Zambia,ZMB,56.80,57.77,58.87,59.88,60.70,61.21,61.79,62.12,62.34,62.79,62.38,61.22


In [23]:
def _year_label(label):
    """Turn a float year label such as 2010.0 into the string '2010'; leave other labels untouched."""
    if isinstance(label, float):
        return str(int(label))
    return label


# Apply the relabelling to every column so year headers lose their trailing '.0'
life_expectancy_df.rename(columns=_year_label, inplace=True)
life_expectancy_df

3,Country Name,Country Code,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,75.40,75.46,75.53,75.64,75.60,75.68,75.62,75.90,76.07,76.25,75.72,74.63
1,Africa Eastern and Southern,AFE,58.41,59.29,60.05,60.71,61.34,61.86,62.44,62.92,63.37,63.76,63.31,62.45
2,Afghanistan,AFG,60.85,61.42,61.92,62.42,62.54,62.66,63.14,63.02,63.08,63.56,62.58,61.98
3,Africa Western and Central,AFW,54.55,55.01,55.34,55.67,55.92,56.20,56.58,56.89,57.19,57.56,57.23,56.99
4,Angola,AGO,56.73,57.60,58.62,59.31,60.04,60.66,61.09,61.68,62.14,62.45,62.26,61.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,77.63,77.74,78.28,78.59,78.88,78.92,78.98,78.78,78.70,79.02,76.57,76.81
262,"Yemen, Rep.",YEM,67.28,67.42,67.34,67.54,67.38,65.87,66.06,65.96,64.58,65.09,64.65,63.75
263,South Africa,ZAF,58.90,60.65,61.85,62.53,63.38,63.95,64.75,65.40,65.67,66.18,65.25,62.34
264,Zambia,ZMB,56.80,57.77,58.87,59.88,60.70,61.21,61.79,62.12,62.34,62.79,62.38,61.22


In [26]:
# Missing life expectancy values are deliberately left as NaN instead of being replaced with 0:
# a life expectancy of 0 years is not a real observation and would distort any mean or
# correlation computed from this data, whereas pandas skips NaN in those calculations.

# Filter for years of interest (2015-2021) and reshape the data from wide to long format.
# No var_name is passed, so the year column takes its name from the columns index name
# (later cells refer to it as 3); the measurements land in the default "value" column.
years_of_interest = [str(year) for year in range(2015, 2022)]
melted_data = pd.melt(life_expectancy_df, id_vars="Country Name", value_vars=years_of_interest)


In [27]:
# Order rows alphabetically by country and, within each country, by the year column (labelled 3)
sort_keys = ["Country Name", 3]
sorted_data = melted_data.sort_values(by=sort_keys, ascending=True)

In [28]:
# Give the long-format columns descriptive names
column_labels = {"Country Name": "Country", 3: "Year", "value": "Life Expectancy"}
sorted_data = sorted_data.rename(columns=column_labels)

In [29]:

# Display the cleaned and sorted long-format data (columns: Country, Year, Life Expectancy)
sorted_data

Unnamed: 0,Country,Year,Life Expectancy
2,Afghanistan,2015,62.66
268,Afghanistan,2016,63.14
534,Afghanistan,2017,63.02
800,Afghanistan,2018,63.08
1066,Afghanistan,2019,63.56
...,...,...,...
797,Zimbabwe,2017,60.71
1063,Zimbabwe,2018,61.41
1329,Zimbabwe,2019,61.29
1595,Zimbabwe,2020,61.12


In [30]:
# Define output path for storing the cleaned life expectancy data
cleaned_data_output_path = Path("Cleaned_Data/cleaned_life_expectancy_data.csv")

# Create the output folder if it is missing, so the export also works on a fresh checkout
# where "Cleaned_Data" has not been created yet (to_csv does not create directories itself)
cleaned_data_output_path.parent.mkdir(parents=True, exist_ok=True)

# Export the cleaned and sorted DataFrame to a CSV file without the index
sorted_data.to_csv(cleaned_data_output_path, index=False)
