In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [3]:
# Load the dataset as a CSV directly from Google Drive.
# `file_id` is the Drive file identifier; it is interpolated into the
# `uc?id=` URL that pandas then reads over the network.
file_id = '1kirGsHGc5vQZPvqi8Kb1vXkc57etuDmR'
download_url = f'https://drive.google.com/uc?id={file_id}'
data1 = pd.read_csv(download_url)

In [4]:
# Preview the first three rows of the freshly loaded frame
data1.head(3)

Unnamed: 0,iso2,name,capital,region,currency,gdp,gdp_per_capita,gdp_growth,life_expectancy_female,life_expectancy_male,...,homicide_rate,refugees,threatened_species,primary_school_enrollment_female,primary_school_enrollment_male,secondary_school_enrollment_female,secondary_school_enrollment_male,post_secondary_enrollment_female,post_secondary_enrollment_male,co2_emissions
0,AF,Afghanistan,Kabul,Southern Asia,"{'code': 'AFN', 'name': 'Afghani'}",20514.0,551.9,-1.7,65.8,62.8,...,6.7,2826.4,40.0,82.9,124.2,40.0,70.1,4.9,14.2,130.85
1,AL,Albania,Tirana,Southern Europe,"{'code': 'ALL', 'name': 'Lek'}",15059.0,5223.8,4.1,80.1,76.7,...,2.3,4.3,142.0,109.1,105.2,95.7,95.2,67.6,43.0,4.3
2,DZ,Algeria,Algiers,Northern Africa,"{'code': 'DZD', 'name': 'Algerian Dinar'}",173757.0,4114.7,1.4,77.8,75.4,...,1.4,99.5,155.0,107.3,112.4,101.5,97.8,64.4,38.8,130.5


In [5]:
# List every column with its non-null count and dtype to spot missing values
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 38 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   iso2                                178 non-null    object 
 1   name                                179 non-null    object 
 2   capital                             179 non-null    object 
 3   region                              179 non-null    object 
 4   currency                            179 non-null    object 
 5   gdp                                 179 non-null    float64
 6   gdp_per_capita                      179 non-null    float64
 7   gdp_growth                          179 non-null    float64
 8   life_expectancy_female              179 non-null    float64
 9   life_expectancy_male                179 non-null    float64
 10  fertility                           179 non-null    float64
 11  infant_mortality                    179 non-n

In [6]:
# Check whether female and male life expectancy cover a similar range by
# printing the mean, median and quartiles of each column.
life_expectancy_views = [
    ("Life Expectancy (Female):", 'life_expectancy_female'),
    ("\nLife Expectancy (Male):", 'life_expectancy_male'),
]

for heading, column in life_expectancy_views:
    values = data1[column]
    print(heading)
    print("Mean:", values.mean())
    print("Median:", values.median())
    print("1st Quartile (25%):", values.quantile(0.25))
    print("3rd Quartile (75%):", values.quantile(0.75))

Life Expectancy (Female):
Mean: 74.82094972067038
Median: 77.1
1st Quartile (25%): 68.6
3rd Quartile (75%): 80.65

Life Expectancy (Male):
Mean: 69.9695530726257
Median: 70.7
1st Quartile (25%): 64.55
3rd Quartile (75%): 75.55000000000001


In [7]:
# prompt: Do the same for all the other male, female columns

# Assumes the `data1` DataFrame has already been loaded by the earlier cells

def print_stats(column_name, frame=None):
    """Print the mean, median and quartiles of one column.

    Parameters
    ----------
    column_name : str
        Name of the column to summarise.
    frame : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level
        ``data1`` frame is used, so existing ``print_stats(col)`` calls
        keep working unchanged.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of the frame.
    """
    if frame is None:
        # Fall back to the notebook's working frame (original behaviour).
        frame = data1
    print(f"\n{column_name}:")
    values = frame[column_name]
    print("Mean:", values.mean())
    print("Median:", values.median())
    print("1st Quartile (25%):", values.quantile(0.25))
    print("3rd Quartile (75%):", values.quantile(0.75))


# Female/male column pairs to summarise; extend the list with further
# gendered columns as needed.
columns_to_process = [
    'life_expectancy_female',
    'life_expectancy_male',
    'primary_school_enrollment_female',
    'primary_school_enrollment_male',
    'secondary_school_enrollment_female',
    'secondary_school_enrollment_male',
    'post_secondary_enrollment_female',
    'post_secondary_enrollment_male',
]

for col in columns_to_process:
    # Warn and move on when a listed column is absent from the frame.
    if col not in data1.columns:
        print(f"\nWarning: Column '{col}' not found in the DataFrame.")
        continue
    print_stats(col)


life_expectancy_female:
Mean: 74.82094972067038
Median: 77.1
1st Quartile (25%): 68.6
3rd Quartile (75%): 80.65

life_expectancy_male:
Mean: 69.9695530726257
Median: 70.7
1st Quartile (25%): 64.55
3rd Quartile (75%): 75.55000000000001

primary_school_enrollment_female:
Mean: 101.4231843575419
Median: 101.8
1st Quartile (25%): 98.05
3rd Quartile (75%): 106.55

primary_school_enrollment_male:
Mean: 103.41843575418996
Median: 102.8
1st Quartile (25%): 99.275
3rd Quartile (75%): 108.7

secondary_school_enrollment_female:
Mean: 83.53240223463688
Median: 91.6
1st Quartile (25%): 55.65
3rd Quartile (75%): 102.75

secondary_school_enrollment_male:
Mean: 83.79720670391062
Median: 89.3
1st Quartile (25%): 61.95
3rd Quartile (75%): 103.1

post_secondary_enrollment_female:
Mean: 45.173463687150836
Median: 41.2
1st Quartile (25%): 12.2
3rd Quartile (75%): 69.75

post_secondary_enrollment_male:
Mean: 35.67178770949721
Median: 33.8
1st Quartile (25%): 14.3
3rd Quartile (75%): 53.2


In [8]:
# The male and female columns cover a similar range, so each pair is replaced
# by a single `<metric>_avg` column holding the row-wise mean of the two values.
# (pandas' mean skips NaN by default, so a row with only one of the two values
# present keeps that value as its average.)
gendered_metrics = [
    'life_expectancy',
    'primary_school_enrollment',
    'secondary_school_enrollment',
    'post_secondary_enrollment',
]

for metric in gendered_metrics:
    avg_column = f'{metric}_avg'
    # Already merged on a previous run of this cell: skip, so re-running the
    # cell does not raise KeyError on the dropped source columns.
    if avg_column in data1.columns:
        continue
    pair = [f'{metric}_male', f'{metric}_female']
    data1[avg_column] = data1[pair].mean(axis=1)
    # Drop the original male/female columns now that the average exists.
    data1 = data1.drop(columns=pair)


In [9]:
# Check the frame's dimensions after the male/female columns were merged
data1.shape

(179, 34)

In [10]:
# Preview the frame again; the *_avg columns now sit at the end
data1.head(3)

Unnamed: 0,iso2,name,capital,region,currency,gdp,gdp_per_capita,gdp_growth,fertility,infant_mortality,...,forested_area,internet_users,homicide_rate,refugees,threatened_species,co2_emissions,life_expectancy_avg,primary_school_enrollment_avg,secondary_school_enrollment_avg,post_secondary_enrollment_avg
0,AF,Afghanistan,Kabul,Southern Asia,"{'code': 'AFN', 'name': 'Afghani'}",20514.0,551.9,-1.7,4.6,51.7,...,2.1,13.5,6.7,2826.4,40.0,130.85,64.3,103.55,55.05,9.55
1,AL,Albania,Tirana,Southern Europe,"{'code': 'ALL', 'name': 'Lek'}",15059.0,5223.8,4.1,1.6,8.0,...,28.2,71.8,2.3,4.3,142.0,4.3,78.4,107.15,95.45,55.3
2,DZ,Algeria,Algiers,Northern Africa,"{'code': 'DZD', 'name': 'Algerian Dinar'}",173757.0,4114.7,1.4,3.0,21.2,...,0.8,49.0,1.4,99.5,155.0,130.5,76.6,109.85,99.65,51.6


The columns `iso2`, `capital` and `currency` are not required for the analysis, so we drop them before moving on to the modeling part.

In [11]:
# Remove the identifier-style columns that the analysis does not use
data = data1.drop(['iso2', 'capital', 'currency'], axis='columns')
data.head(3)

Unnamed: 0,name,region,gdp,gdp_per_capita,gdp_growth,fertility,infant_mortality,sex_ratio,population,pop_growth,...,forested_area,internet_users,homicide_rate,refugees,threatened_species,co2_emissions,life_expectancy_avg,primary_school_enrollment_avg,secondary_school_enrollment_avg,post_secondary_enrollment_avg
0,Afghanistan,Southern Asia,20514.0,551.9,-1.7,4.6,51.7,105.4,38928.0,2.5,...,2.1,13.5,6.7,2826.4,40.0,130.85,64.3,103.55,55.05,9.55
1,Albania,Southern Europe,15059.0,5223.8,4.1,1.6,8.0,103.7,2878.0,-0.1,...,28.2,71.8,2.3,4.3,142.0,4.3,78.4,107.15,95.45,55.3
2,Algeria,Northern Africa,173757.0,4114.7,1.4,3.0,21.2,102.1,43851.0,2.0,...,0.8,49.0,1.4,99.5,155.0,130.5,76.6,109.85,99.65,51.6


In [12]:
# Use the country name as the row index.
# The guard keeps the cell safe to re-run: once 'name' has become the index it
# is no longer a column, and calling set_index('name') again raises KeyError.
if data.index.name != 'name':
    data = data.set_index('name')
data.head(3)

Unnamed: 0_level_0,region,gdp,gdp_per_capita,gdp_growth,fertility,infant_mortality,sex_ratio,population,pop_growth,pop_density,...,forested_area,internet_users,homicide_rate,refugees,threatened_species,co2_emissions,life_expectancy_avg,primary_school_enrollment_avg,secondary_school_enrollment_avg,post_secondary_enrollment_avg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,Southern Asia,20514.0,551.9,-1.7,4.6,51.7,105.4,38928.0,2.5,59.6,...,2.1,13.5,6.7,2826.4,40.0,130.85,64.3,103.55,55.05,9.55
Albania,Southern Europe,15059.0,5223.8,4.1,1.6,8.0,103.7,2878.0,-0.1,105.0,...,28.2,71.8,2.3,4.3,142.0,4.3,78.4,107.15,95.45,55.3
Algeria,Northern Africa,173757.0,4114.7,1.4,3.0,21.2,102.1,43851.0,2.0,18.4,...,0.8,49.0,1.4,99.5,155.0,130.5,76.6,109.85,99.65,51.6


In [13]:
# Only one categorical column (region); the rest are numeric
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, Afghanistan to Zimbabwe
Data columns (total 30 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   region                           179 non-null    object 
 1   gdp                              179 non-null    float64
 2   gdp_per_capita                   179 non-null    float64
 3   gdp_growth                       179 non-null    float64
 4   fertility                        179 non-null    float64
 5   infant_mortality                 179 non-null    float64
 6   sex_ratio                        179 non-null    float64
 7   population                       179 non-null    float64
 8   pop_growth                       179 non-null    float64
 9   pop_density                      179 non-null    float64
 10  urban_population                 179 non-null    float64
 11  urban_population_growth          179 non-null    float64
 12  imports     

In [14]:
# Keep just the numeric features
data_numeric = data.select_dtypes(include='number')
data_numeric.head(3)

Unnamed: 0_level_0,gdp,gdp_per_capita,gdp_growth,fertility,infant_mortality,sex_ratio,population,pop_growth,pop_density,urban_population,...,forested_area,internet_users,homicide_rate,refugees,threatened_species,co2_emissions,life_expectancy_avg,primary_school_enrollment_avg,secondary_school_enrollment_avg,post_secondary_enrollment_avg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,20514.0,551.9,-1.7,4.6,51.7,105.4,38928.0,2.5,59.6,25.8,...,2.1,13.5,6.7,2826.4,40.0,130.85,64.3,103.55,55.05,9.55
Albania,15059.0,5223.8,4.1,1.6,8.0,103.7,2878.0,-0.1,105.0,61.2,...,28.2,71.8,2.3,4.3,142.0,4.3,78.4,107.15,95.45,55.3
Algeria,173757.0,4114.7,1.4,3.0,21.2,102.1,43851.0,2.0,18.4,73.2,...,0.8,49.0,1.4,99.5,155.0,130.5,76.6,109.85,99.65,51.6


In [15]:
# Keep just the object-typed (categorical) column(s)
data_categorical = data.select_dtypes(include='object')
data_categorical.head(3)

Unnamed: 0_level_0,region
name,Unnamed: 1_level_1
Afghanistan,Southern Asia
Albania,Southern Europe
Algeria,Northern Africa


In [16]:
# One-hot encode 'region'; drop_first removes the first category so it acts as
# the baseline, and the indicators are cast to 0/1 integers.
region_indicators = pd.get_dummies(data_categorical, drop_first=True)
data_dummies = region_indicators.astype(int)
data_dummies.head(3)


Unnamed: 0_level_0,region_Central America,region_Central Asia,region_Eastern Africa,region_Eastern Asia,region_Eastern Europe,region_Middle Africa,region_Northern Africa,region_Northern America,region_Northern Europe,region_Oceania,region_South America,region_South-Eastern Asia,region_Southern Africa,region_Southern Asia,region_Southern Europe,region_Western Africa,region_Western Asia,region_Western Europe
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Albania,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
Algeria,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Standardise the numeric columns and rebuild a labelled DataFrame.
# NOTE(review): the scaler is fit on every row and on every numeric column,
# including `life_expectancy_avg`, before the train/test split further down —
# confirm that scaling the target and using test rows in the fit is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_numeric)
data_scaled = pd.DataFrame(
    scaled_values,
    index=data_numeric.index,
    columns=data_numeric.columns,
)
data_scaled.head(3)

Unnamed: 0_level_0,gdp,gdp_per_capita,gdp_growth,fertility,infant_mortality,sex_ratio,population,pop_growth,pop_density,urban_population,...,forested_area,internet_users,homicide_rate,refugees,threatened_species,co2_emissions,life_expectancy_avg,primary_school_enrollment_avg,secondary_school_enrollment_avg,post_secondary_enrollment_avg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,-0.233153,-0.595869,-1.578889,1.485904,1.486176,0.145774,-0.027865,1.100765,-0.143343,-1.509992,...,-1.25762,-1.4453,-0.004969,2.001153,-0.583202,-0.062217,-1.050849,0.085382,-0.966303,-1.087416
Albania,-0.235986,-0.418838,0.254261,-0.836024,-0.681715,0.068196,-0.263291,-1.220468,-0.121318,0.064323,...,-0.105159,0.569624,-0.442269,-0.386262,-0.33665,-0.220148,0.77948,0.357589,0.397978,0.524021
Algeria,-0.153553,-0.460865,-0.599102,0.247543,-0.026883,-0.004818,0.004285,0.654374,-0.163329,0.597989,...,-1.315022,-0.218373,-0.531716,-0.305726,-0.305227,-0.062654,0.545821,0.561745,0.53981,0.393697


In [18]:
# Place the scaled numeric columns and the region dummies side by side
# (data_scaled + data_dummies) to form the final dataset
frames_to_join = [data_scaled, data_dummies]
data_final = pd.concat(frames_to_join, axis=1)
data_final.head(3)

Unnamed: 0_level_0,gdp,gdp_per_capita,gdp_growth,fertility,infant_mortality,sex_ratio,population,pop_growth,pop_density,urban_population,...,region_Northern Europe,region_Oceania,region_South America,region_South-Eastern Asia,region_Southern Africa,region_Southern Asia,region_Southern Europe,region_Western Africa,region_Western Asia,region_Western Europe
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,-0.233153,-0.595869,-1.578889,1.485904,1.486176,0.145774,-0.027865,1.100765,-0.143343,-1.509992,...,0,0,0,0,0,1,0,0,0,0
Albania,-0.235986,-0.418838,0.254261,-0.836024,-0.681715,0.068196,-0.263291,-1.220468,-0.121318,0.064323,...,0,0,0,0,0,0,1,0,0,0
Algeria,-0.153553,-0.460865,-0.599102,0.247543,-0.026883,-0.004818,0.004285,0.654374,-0.163329,0.597989,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Persist the processed dataset; the country-name index is written as the first column
data_final.to_csv(path_or_buf='life_data_processed.csv', index=True)

**TRAIN AND TEST SPLIT**

In [19]:
# Separate the target column from the predictors
target_column = 'life_expectancy_avg'
y = data_final[target_column]  # Target variable
X = data_final.drop(columns=[target_column])  # Features

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")

Training set size: 143 rows
Testing set size: 36 rows


In [20]:
# Print the shape of the full data and of each split, in a fixed order
for part in (X, y, X_train, X_test, y_train, y_test):
    print(part.shape)

(179, 46)
(179,)
(143, 46)
(36, 46)
(143,)
(36,)
