# Import libraries

In [29]:
import pandas as pd
import seaborn as sns
import session_info
from sklearn import compose
from sklearn.linear_model import Ridge
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [30]:
try:
    session_info_output = session_info.show()
except Exception as error:
    print("The session info has been requested already:")
    # Every exception type is caught here, so also show the real cause instead
    # of hiding an unrelated failure behind the fixed message above.
    print(f"{type(error).__name__}: {error}")
    # Reuse the report from an earlier successful run when there is one; on a
    # fresh kernel fall back to None so the last line cannot raise NameError.
    session_info_output = globals().get("session_info_output")

session_info_output

The session info has been requested already:


  mod_version = _find_version(mod.__version__)


In [31]:
import janitor # This library generates an error when I execute session_info()

# Import external files

- Extension of the "missing" functions from Pandas
- DataFrame and dictionary functions
- Statistical functions

In [32]:
# Execute the three helper notebooks before anything else in this notebook runs.
# NOTE(review): get_columns, get_indicator_name, df_categorical_values and
# remove_prefix are used below but never defined in this notebook — presumably
# they are created by these files; confirm.
%run utils/u.0.0-pandas_missing_extension.ipynb
%run utils/u.0.1-df_functions.ipynb
%run utils/u.0.2-statistical-functions.ipynb

# Get the processed data (V2)

In [33]:
# Load version 2 of the processed dataset and convert the column names to snake_case.
arg_di_df = (
    pd.read_csv('../data/processed/WDICSV_PROCESSED_V2.csv')
    .clean_names(case_type="snake")
)

# Quick sanity check: dimensions first, then dtypes and non-null counts per column.
print(arg_di_df.shape)
arg_di_df.info()

(54, 28)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    54 non-null     int64  
 1   it_mlt_main             54 non-null     float64
 2   ny_gdp_mktp_kd_zg       54 non-null     float64
 3   ny_gdp_pcap_kd          54 non-null     float64
 4   ny_gdp_pcap_kd_zg       54 non-null     float64
 5   sp_dyn_le00_in          53 non-null     float64
 6   it_cel_sets             54 non-null     float64
 7   sh_dth_imrt             53 non-null     float64
 8   sp_pop_0014_to_zs       54 non-null     float64
 9   sp_pop_1564_to_zs       54 non-null     float64
 10  sp_pop_65_up_to_zs      54 non-null     float64
 11  sp_pop_totl             54 non-null     float64
 12  sm_pop_refg             54 non-null     float64
 13  sm_pop_refg_or          44 non-null     float64
 14  se_pre_enrr             45 non-null

# Get columns names and types

In [34]:
# Split the frame's columns into the groups used throughout the notebook.
(
    indicators_names,
    categorical_cols,
    numeric_cols,
    columns_with_missing_values,
) = get_columns(arg_di_df)

In [35]:
# Work with a plain list from here on: later cells remove entries from it in place.
numeric_cols = list(numeric_cols)

# Report how many columns fall into each group.
print(f'Total Columns: {len(indicators_names)}')
print(f'Categorical Columns: {len(categorical_cols)}')
print(f'Numeric Columns: {len(numeric_cols)}')

Total Columns: 28
Categorical Columns: 6
Numeric Columns: 22


In [36]:
# Display the first value returned by get_columns: one "code, description" string per column.
indicators_names

['year, Year',
 'it_mlt_main, Fixed telephone subscriptions',
 'ny_gdp_mktp_kd_zg, GDP growth (annual %)',
 'ny_gdp_pcap_kd, GDP per capita (constant 2015 US$)',
 'ny_gdp_pcap_kd_zg, GDP per capita growth (annual %)',
 'sp_dyn_le00_in, Life expectancy at birth, total (years)',
 'it_cel_sets, Mobile cellular subscriptions',
 'sh_dth_imrt, Number of infant deaths',
 'sp_pop_0014_to_zs, Population ages 0-14 (% of total population)',
 'sp_pop_1564_to_zs, Population ages 15-64 (% of total population)',
 'sp_pop_65_up_to_zs, Population ages 65 and above (% of total population)',
 'sp_pop_totl, Population, total',
 'sm_pop_refg, Refugee population by country or territory of asylum',
 'sm_pop_refg_or, Refugee population by country or territory of origin',
 'se_pre_enrr, School enrollment, preprimary (% gross)',
 'se_prm_enrr, School enrollment, primary (% gross)',
 'se_sec_enrr, School enrollment, secondary (% gross)',
 'se_ter_enrr, School enrollment, tertiary (% gross)',
 'sp_urb_totl_in_zs,

In [37]:
# Display the columns that get_columns reported as containing missing values.
columns_with_missing_values

['sm_pop_refg_or',
 'se_ter_enrr',
 'se_pre_enrr',
 'se_prm_enrr',
 'se_sec_enrr',
 'sp_dyn_le00_in',
 'sh_dth_imrt']

# Target variable:

sp_dyn_le00_in (Life expectancy at birth, total (years))

In [38]:
# Column code of the variable to predict (life expectancy at birth, total, in years).
target_variable = "sp_dyn_le00_in"
# NOTE(review): get_indicator_name is defined outside this notebook; presumably it
# returns the descriptive name for the code. The meaning of the True flag is not
# visible from here — confirm against the helper.
target_variable_name = get_indicator_name(target_variable, True)

# Process data

## Ordinal coding

This method transforms every category into an integer. It is useful when the categories have a natural order.

### Apply to a group of columns

Column transformer: Applies transformers to columns of an array or pandas DataFrame.

make_column_transformer(): Construct a ColumnTransformer from the given transformers. This is a shorthand for the ColumnTransformer constructor; it does not require, and does not permit, naming the transformers. Instead, they will be given names automatically based on their types.


In [39]:
# Build one ordered category list per categorical variable, taken from the keys of
# its colour mapping, with "Unknown" moved to the front so it is encoded as 0.
categorical_variables_ordered_values = []
for color_map in df_categorical_values["Indicator Colors"]:
    labels = list(color_map.keys())
    # Raises ValueError when a mapping has no "Unknown" key.
    labels.remove("Unknown")
    categorical_variables_ordered_values.append(["Unknown", *labels])

In [40]:
# Pass explicit category lists so each integer code follows the order of the
# list for its column (position 0 is "Unknown").
ordinal_encoder = OrdinalEncoder(
    categories=categorical_variables_ordered_values,
)

In [41]:
# make_column_transformer takes (transformer, columns) tuples: the ordinal encoder
# is applied to the columns listed in df_categorical_values["Indicator Name"].
# remainder="passthrough" keeps every column not named in a tuple and concatenates
# it, unchanged, with the output of the transformers. For DataFrames, extra columns
# not seen during fit are excluded from the output of transform.
categorical_transformer = compose.make_column_transformer(
    (ordinal_encoder, df_categorical_values["Indicator Name"]),
    remainder="passthrough",
)

In [42]:
# Fit the transformer, rebuild a labelled DataFrame from its output and pass every
# generated column name through remove_prefix.
# NOTE(review): remove_prefix is defined outside this notebook; presumably it
# strips the "<transformer>__" prefix added by get_feature_names_out — confirm.
# Keyword order matters: `data` is evaluated first, so the transformer is already
# fitted when get_feature_names_out() is called.
arg_di_df_transformed = pd.DataFrame(
    data=categorical_transformer.fit_transform(arg_di_df),
    index=arg_di_df.index,
    columns=categorical_transformer.get_feature_names_out(),
).rename(columns=remove_prefix)

In [43]:
# Display the fitted column transformer.
categorical_transformer

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [44]:
# Retrieve the fitted ordinal encoder by the name make_column_transformer gave it.
encoder = categorical_transformer.named_transformers_['ordinalencoder']

# Print the label -> integer code mapping learned for each encoded column.
for col_name, ordered_labels in zip(list(df_categorical_values["Indicator Name"]), encoder.categories_):
    print(f"{col_name}:")
    for code, label in enumerate(ordered_labels):
        print(f"  {label} → {code}")

ny_gdp_mktp_kd_zg_cat:
  Unknown → 0
  Negative growth → 1
  Low growth → 2
  Moderate growth → 3
  High growth → 4
sp_dyn_le00_in_cat:
  Unknown → 0
  Low life expectancy → 1
  Medium life expectancy → 2
  High life expectancy → 3
ny_gdp_pcap_kd_cat:
  Unknown → 0
  Low income → 1
  Middle income → 2
  High income → 3
se_sec_enrr_cat:
  Unknown → 0
  Low → 1
  Medium → 2
  High → 3
ny_gdp_mktp_kd_grw_cat:
  Unknown → 0
  High volatility → 1
  Medium volatility → 2
  Low volatility → 3
it_cel_sets_cat:
  Unknown → 0
  Low → 1
  Medium → 2
  High → 3
  Very High → 4


In [45]:
# Peek at the first rows of the categorical columns after ordinal encoding.
arg_di_df_transformed[categorical_cols].head(5)

Unnamed: 0,ny_gdp_mktp_kd_zg_cat,sp_dyn_le00_in_cat,it_cel_sets_cat,ny_gdp_pcap_kd_cat,se_sec_enrr_cat,ny_gdp_mktp_kd_grw_cat
0,3.0,2.0,1.0,2.0,2.0,0.0
1,4.0,2.0,0.0,2.0,2.0,0.0
2,2.0,2.0,0.0,2.0,2.0,2.0
3,3.0,2.0,0.0,2.0,2.0,3.0
4,4.0,2.0,0.0,2.0,2.0,3.0


In [46]:
# Summary statistics of the encoded categorical columns.
arg_di_df_transformed[categorical_cols].describe()

Unnamed: 0,ny_gdp_mktp_kd_zg_cat,sp_dyn_le00_in_cat,it_cel_sets_cat,ny_gdp_pcap_kd_cat,se_sec_enrr_cat,ny_gdp_mktp_kd_grw_cat
count,54.0,54.0,54.0,54.0,54.0,54.0
mean,2.481481,2.648148,1.944444,2.296296,2.407407,1.277778
std,1.342214,0.587846,1.47196,0.460911,0.961887,0.626962
min,1.0,0.0,0.0,2.0,0.0,0.0
25%,1.0,2.0,1.0,2.0,2.0,1.0
50%,3.0,3.0,1.0,2.0,3.0,1.0
75%,4.0,3.0,4.0,3.0,3.0,1.75
max,4.0,3.0,4.0,3.0,3.0,3.0


In [47]:
print("There is no categorical variables in the transformed data")
# An empty Index here backs the message above: no object-dtype columns remain.
arg_di_df_transformed.select_dtypes(include=['object']).columns

There is no categorical variables in the transformed data


Index([], dtype='object')

## Scale Numerical Features

In [48]:
# 'year' is left out of the list of columns that get scaled and imputed below.
try:
    numeric_cols.remove('year')
except ValueError:
    # Already absent (for example when this cell is run again): nothing to do.
    pass

In [49]:
# Learn the scaling parameters on the numeric columns, then replace those columns
# with their standardised values. The fitted scaler stays available as `scaler`.
scaler = StandardScaler().fit(arg_di_df_transformed[numeric_cols])
arg_di_df_transformed[numeric_cols] = scaler.transform(arg_di_df_transformed[numeric_cols])

In [50]:
# Check the scaling: every column should show a mean close to 0 and a std close to 1.
arg_di_df_transformed[numeric_cols].describe()

Unnamed: 0,it_mlt_main,ny_gdp_mktp_kd_zg,ny_gdp_pcap_kd,ny_gdp_pcap_kd_zg,sp_dyn_le00_in,it_cel_sets,sh_dth_imrt,sp_pop_0014_to_zs,sp_pop_1564_to_zs,sp_pop_65_up_to_zs,...,sm_pop_refg,sm_pop_refg_or,se_pre_enrr,se_prm_enrr,se_sec_enrr,se_ter_enrr,sp_urb_totl_in_zs,year_of_dictatorship,it_cel_sets_pct,dem_dep_cat
count,54.0,54.0,54.0,54.0,53.0,54.0,53.0,54.0,54.0,54.0,...,54.0,44.0,45.0,47.0,48.0,44.0,54.0,54.0,54.0,54.0
mean,6.476301e-17,-5.756712e-17,-5.345518e-16,2.4671620000000003e-17,2.853064e-15,-2.878356e-17,-1.717704e-16,8.552829e-16,3.569161e-15,1.727014e-16,...,5.139921e-17,-4.1633360000000003e-17,1.97373e-16,1.758641e-15,-3.376928e-16,-1.917658e-16,-1.110223e-16,6.167906000000001e-17,7.401487e-17,-1.069104e-16
std,1.00939,1.00939,1.00939,1.00939,1.00957,1.00939,1.00957,1.00939,1.00939,1.00939,...,1.00939,1.011561,1.0113,1.010811,1.010582,1.011561,1.00939,1.00939,1.00939,1.00939
min,-1.346503,-2.381731,-1.609576,-2.332194,-1.924073,-0.7638043,-1.295161,-2.067495,-1.406246,-1.770959,...,-0.8275976,-1.005516,-2.068652,-1.369023,-1.656433,-1.442387,-2.114291,-0.5345225,-0.7747653,-1.852406
25%,-0.9994729,-0.7555036,-0.7684132,-0.7776206,-0.8991756,-0.7638043,-0.7975029,-0.9073865,-0.9643474,-0.8476192,...,-0.7487734,-0.8050253,-0.7112004,-1.063677,-1.036507,-1.021893,-0.7660271,-0.5345225,-0.7747653,-0.871078
50%,0.2626002,0.1253846,-0.3636115,0.08627998,0.2163184,-0.7133432,-0.1985796,0.3552099,0.007998749,0.1219308,...,-0.1884167,-0.4760948,0.1388185,0.01853656,-0.03954943,-0.01211492,0.2697168,-0.5345225,-0.7136044,-0.02923399
75%,1.097088,0.7512209,0.9845795,0.7374341,0.9088391,1.281157,0.4151685,0.8451986,0.8644281,0.755034,...,0.08914206,0.722935,0.842989,0.8755528,0.7982536,0.8403892,0.8528474,-0.5345225,1.254396,0.9644444
max,1.370511,1.598804,1.932067,1.684094,1.369678,1.778019,1.939047,1.165843,1.903152,1.763496,...,2.697312,2.521871,1.477965,1.596394,1.631297,2.030766,1.264111,1.870829,1.831949,1.430109


# Impute missing values

MICE (Multiple Imputation by Chained Equations) is a technique for imputing missing data, where multiple possible values are generated for each missing entry using iterative regression models.

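If the predictor variables contain NA values, a linear regression cannot produce predictions for those rows, so the missing predictor values are imputed first (the target variable itself is left untouched).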

In [51]:
def impute_with_mice(data, target_variable, numeric_cols):
    """Impute missing values in the numeric feature columns with MICE.

    An IterativeImputer with a Ridge estimator is fitted on
    ``data[numeric_cols]`` after the target column has been taken out of the
    selection, so the target is neither imputed nor used as a predictor.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``numeric_cols``.
    target_variable : str
        Column to leave out of the imputation.
    numeric_cols : list of str
        Columns to impute. NOTE: this list is modified in place —
        ``target_variable`` is removed from it when present. A caller that
        reuses the same list to select the columns receiving the result
        depends on that side effect.

    Returns
    -------
    pandas.DataFrame
        Imputed values for the remaining ``numeric_cols``, carrying the same
        index as ``data``.
    """
    if target_variable in numeric_cols:
        numeric_cols.remove(target_variable)

    mice_imputer = IterativeImputer(estimator=Ridge())
    imputed_values = mice_imputer.fit_transform(data[numeric_cols])

    # Reuse data's index: with the default RangeIndex the result would be
    # misaligned (or introduce NaN) when assigned back to a frame whose index
    # is not 0..n-1.
    return pd.DataFrame(imputed_values, columns=numeric_cols, index=data.index)

In [52]:
# impute_with_mice removes target_variable from numeric_cols in place. Python
# evaluates the right-hand side before the subscript on the left, so when the
# assignment target is resolved numeric_cols already matches the columns of the
# returned frame and the target column is left untouched.
arg_di_df_transformed[numeric_cols] = impute_with_mice(arg_di_df_transformed, target_variable, numeric_cols)

In [53]:
print("Check missing values in all columns")
print("The unique Nan value is the target variable in one row")
# Keep only the rows that still hold at least one NaN in any column.
arg_di_df_transformed.loc[arg_di_df_transformed.isna().any(axis=1)]

Check missing values in all columns
The unique Nan value is the target variable in one row


Unnamed: 0,ny_gdp_mktp_kd_zg_cat,sp_dyn_le00_in_cat,ny_gdp_pcap_kd_cat,se_sec_enrr_cat,ny_gdp_mktp_kd_grw_cat,it_cel_sets_cat,year,it_mlt_main,ny_gdp_mktp_kd_zg,ny_gdp_pcap_kd,...,sm_pop_refg,sm_pop_refg_or,se_pre_enrr,se_prm_enrr,se_sec_enrr,se_ter_enrr,sp_urb_totl_in_zs,year_of_dictatorship,it_cel_sets_pct,dem_dep_cat
53,1.0,0.0,3.0,0.0,1.0,4.0,2023.0,0.412407,-0.66078,1.011066,...,-0.68224,-0.184756,1.424528,-0.03874,1.578337,1.890704,1.264111,-0.534522,1.418794,-1.852406


In [26]:
# Compute the row labels whose target value is missing right here: the name is
# not assigned anywhere at notebook level, so on a fresh kernel this cell would
# otherwise raise NameError.
rows_with_na_target_variable = arg_di_df.index[arg_di_df[target_variable].isna()].tolist()

# The list holds index labels, not positions, so select with .loc.
arg_di_df.loc[rows_with_na_target_variable, target_variable]

53   NaN
Name: sp_dyn_le00_in, dtype: float64

# Save the processed data (V3)

In [27]:
# Persist the encoded, scaled and imputed frame as version 3 of the processed
# dataset; index=False leaves the row index out of the file.
arg_di_df_transformed.to_csv('../data/processed/WDICSV_PROCESSED_V3.csv', index=False)