# Import libraries

In [28]:
import pandas as pd
import seaborn as sns
import session_info
from sklearn import compose
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

In [29]:
# Show the versions of the packages loaded in this session (for reproducibility).
# Start from None so the final display line never hits an undefined name when
# session_info.show() raises.
session_info_output = None
try:
    session_info_output = session_info.show()
except Exception as error:
    print("The session info has been requested already:")
    # Surface the actual failure instead of silently discarding it.
    print(error)

session_info_output

  mod_version = _find_version(mod.__version__)


In [30]:
# Deliberately imported after the session-info cell rather than with the other imports.
# NOTE(review): presumably this is what provides the `clean_names` DataFrame method used when loading the data — confirm.
import janitor # This library generates an error when session_info.show() is executed with it already imported

# Import external files

- Extension of the "missing" functions from Pandas
- DataFrame and dictionary functions
- Statistical functions

In [31]:
# Execute the helper notebooks so that what they define is available in this notebook.
# NOTE(review): names used below but not defined here (get_columns, get_indicator_name,
# remove_prefix, df_categorical_values) presumably come from these notebooks — confirm.
%run utils/u.0.0-pandas_missing_extension.ipynb
%run utils/u.0.1-df_functions.ipynb
%run utils/u.0.2-statistical-functions.ipynb

# Get the processed data (V2)

In [32]:
# Load the V2 processed dataset and normalise its column names to snake_case.
arg_di_df = (
    pd.read_csv('../data/processed/WDICSV_PROCESSED_V2.csv')
    .clean_names(case_type="snake")
)

# Quick sanity check: dimensions first, then per-column dtypes and non-null counts.
print(arg_di_df.shape)
arg_di_df.info()

(54, 28)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    54 non-null     int64  
 1   it_mlt_main             54 non-null     float64
 2   ny_gdp_mktp_kd_zg       54 non-null     float64
 3   ny_gdp_pcap_kd          54 non-null     float64
 4   ny_gdp_pcap_kd_zg       54 non-null     float64
 5   sp_dyn_le00_in          53 non-null     float64
 6   it_cel_sets             54 non-null     float64
 7   sh_dth_imrt             53 non-null     float64
 8   sp_pop_0014_to_zs       54 non-null     float64
 9   sp_pop_1564_to_zs       54 non-null     float64
 10  sp_pop_65_up_to_zs      54 non-null     float64
 11  sp_pop_totl             54 non-null     float64
 12  sm_pop_refg             54 non-null     float64
 13  sm_pop_refg_or          44 non-null     float64
 14  se_pre_enrr             45 non-null

# Get columns names and types

In [33]:
# Split the dataset's columns into the groups used in the rest of the notebook.
(
    indicators_names,
    categorical_cols,
    numeric_cols,
    columns_with_missing_values,
) = get_columns(arg_di_df)

In [34]:
# Report how many columns fall into each group, one count per line.
print(
    f'Total Columns: {len(indicators_names)}',
    f'Categorical Columns: {len(categorical_cols)}',
    f'Numeric Columns: {len(numeric_cols)}',
    sep="\n",
)

Total Columns: 28
Categorical Columns: 6
Numeric Columns: 22


In [35]:
# Display the "<column code>, <indicator description>" labels returned by get_columns.
indicators_names

['year, Year',
 'it_mlt_main, Fixed telephone subscriptions',
 'ny_gdp_mktp_kd_zg, GDP growth (annual %)',
 'ny_gdp_pcap_kd, GDP per capita (constant 2015 US$)',
 'ny_gdp_pcap_kd_zg, GDP per capita growth (annual %)',
 'sp_dyn_le00_in, Life expectancy at birth, total (years)',
 'it_cel_sets, Mobile cellular subscriptions',
 'sh_dth_imrt, Number of infant deaths',
 'sp_pop_0014_to_zs, Population ages 0-14 (% of total population)',
 'sp_pop_1564_to_zs, Population ages 15-64 (% of total population)',
 'sp_pop_65_up_to_zs, Population ages 65 and above (% of total population)',
 'sp_pop_totl, Population, total',
 'sm_pop_refg, Refugee population by country or territory of asylum',
 'sm_pop_refg_or, Refugee population by country or territory of origin',
 'se_pre_enrr, School enrollment, preprimary (% gross)',
 'se_prm_enrr, School enrollment, primary (% gross)',
 'se_sec_enrr, School enrollment, secondary (% gross)',
 'se_ter_enrr, School enrollment, tertiary (% gross)',
 'sp_urb_totl_in_zs,

In [36]:
# Display the columns that get_columns reported as having missing values.
columns_with_missing_values

['sm_pop_refg_or',
 'se_ter_enrr',
 'se_pre_enrr',
 'se_prm_enrr',
 'se_sec_enrr',
 'sp_dyn_le00_in',
 'sh_dth_imrt']

# Target variable:

sp_dyn_le00_in (Life expectancy at birth, total (years))

In [37]:
# Column code of the variable to predict: life expectancy at birth, total (years).
target_variable = "sp_dyn_le00_in"
# Human-readable label for the target column, looked up through the helper.
# NOTE(review): the meaning of the second argument (True) is not visible in this notebook — confirm in the helper notebook.
target_variable_name = get_indicator_name(target_variable, True)

# Process data

## Ordinal encoding

This method transforms every category into an integer. It is useful when the categories have a natural order.

### Apply to a group of columns

Column transformer: Applies transformers to columns of an array or pandas DataFrame.

make_column_transformer(): Construct a ColumnTransformer from the given transformers. This is a shorthand for the ColumnTransformer constructor; it does not require, and does not permit, naming the transformers. Instead, they will be given names automatically based on their types.


In [38]:
# Ordered category labels, one list per categorical column, in the order the
# ordinal encoder should map them to 0, 1, 2, ...
# "Unknown" is always placed first so that it is encoded as 0. Filtering it out
# of the remaining keys (instead of remove + insert) avoids a ValueError when a
# column's mapping has no "Unknown" key, and leaves no loop variable behind.
categorical_variables_ordered_values = [
    ["Unknown", *(label for label in item.keys() if label != "Unknown")]
    for item in df_categorical_values["Indicator Colors"]
]

In [39]:
# Encoder that maps each label to its position in the matching ordered list.
ordinal_encoder = OrdinalEncoder(
    categories=categorical_variables_ordered_values,
)

In [40]:
# Column transformer that ordinal-encodes only the categorical columns.
#   - Each positional argument is a (transformer, columns) tuple naming the
#     subset of columns that transformer is applied to.
#   - remainder="passthrough" keeps every column that is not listed in a
#     transformer but is present in the data passed to fit, and concatenates it
#     with the transformers' output. For DataFrames, extra columns not seen
#     during fit are excluded from the output of transform.
categorical_transformer = compose.make_column_transformer(
    (ordinal_encoder, df_categorical_values["Indicator Name"]),
    remainder="passthrough",
)

In [41]:
# Fit the transformer on the data and encode it, then read the output column
# names from the fitted transformer (this must happen after fitting).
transformed_values = categorical_transformer.fit_transform(arg_di_df)
transformed_columns = categorical_transformer.get_feature_names_out()

# Rebuild a DataFrame on the original row index; remove_prefix is applied to
# every column name.
# NOTE(review): presumably it strips the transformer-name prefixes added by get_feature_names_out — confirm in the helper notebook.
arg_di_df_transformed = pd.DataFrame(
    transformed_values,
    columns=transformed_columns,
    index=arg_di_df.index,
).rename(columns=remove_prefix)

In [42]:
# Display the fitted column transformer.
categorical_transformer

In [43]:
# Retrieve the fitted ordinal encoder from the column transformer by its auto-generated name.
encoder = categorical_transformer.named_transformers_['ordinalencoder']

# For every encoded column, list each label next to the integer it was mapped to
# (the integer is the label's position in the encoder's category list).
for column, ordered_labels in zip(df_categorical_values["Indicator Name"], encoder.categories_):
    print(f"{column}:")
    for code, label in enumerate(ordered_labels):
        print(f"  {label} → {code}")

ny_gdp_mktp_kd_zg_cat:
  Unknown → 0
  Negative growth → 1
  Low growth → 2
  Moderate growth → 3
  High growth → 4
sp_dyn_le00_in_cat:
  Unknown → 0
  Low life expectancy → 1
  Medium life expectancy → 2
  High life expectancy → 3
it_cel_sets_cat:
  Unknown → 0
  Low → 1
  Medium → 2
  High → 3
  Very High → 4
ny_gdp_pcap_kd_cat:
  Unknown → 0
  Low income → 1
  Middle income → 2
  High income → 3
se_sec_enrr_cat:
  Unknown → 0
  Low → 1
  Medium → 2
  High → 3
ny_gdp_mktp_kd_grw_cat:
  Unknown → 0
  Low volatility → 1
  Medium volatility → 2
  High volatility → 3


In [44]:
# Preview the first rows of the categorical columns after ordinal encoding.
arg_di_df_transformed[categorical_cols].head(5)

Unnamed: 0,ny_gdp_mktp_kd_zg_cat,sp_dyn_le00_in_cat,it_cel_sets_cat,ny_gdp_pcap_kd_cat,se_sec_enrr_cat,ny_gdp_mktp_kd_grw_cat
0,3.0,2.0,1.0,2.0,2.0,0.0
1,4.0,2.0,0.0,2.0,2.0,0.0
2,2.0,2.0,0.0,2.0,2.0,2.0
3,3.0,2.0,0.0,2.0,2.0,1.0
4,4.0,2.0,0.0,2.0,2.0,1.0


In [45]:
# Summary statistics of the encoded categorical columns (min/max show the range of codes in use).
arg_di_df_transformed[categorical_cols].describe()

Unnamed: 0,ny_gdp_mktp_kd_zg_cat,sp_dyn_le00_in_cat,it_cel_sets_cat,ny_gdp_pcap_kd_cat,se_sec_enrr_cat,ny_gdp_mktp_kd_grw_cat
count,54.0,54.0,54.0,54.0,54.0,54.0
mean,2.481481,2.648148,1.944444,2.296296,2.407407,2.574074
std,1.342214,0.587846,1.47196,0.460911,0.961887,0.7673
min,1.0,0.0,0.0,2.0,0.0,0.0
25%,1.0,2.0,1.0,2.0,2.0,2.0
50%,3.0,3.0,1.0,2.0,3.0,3.0
75%,4.0,3.0,4.0,3.0,3.0,3.0
max,4.0,3.0,4.0,3.0,3.0,3.0


In [46]:
# Check whether any object-dtype (string) columns remain after encoding, and
# only report success when the check actually passes.
remaining_object_cols = arg_di_df_transformed.select_dtypes(include=['object']).columns
if len(remaining_object_cols) == 0:
    print("There is no categorical variables in the transformed data")
else:
    print(f"Categorical variables still present: {list(remaining_object_cols)}")
remaining_object_cols

There is no categorical variables in the transformed data


Index([], dtype='object')

## Scale Numerical Features

In [47]:
# Rescale every numeric column with a MinMaxScaler, overwriting the original
# values in the transformed frame.
scaler = MinMaxScaler()
scaled_numeric_values = scaler.fit_transform(arg_di_df_transformed[numeric_cols])
arg_di_df_transformed[numeric_cols] = scaled_numeric_values

In [48]:
# Summary statistics of the numeric columns after scaling.
arg_di_df_transformed[numeric_cols].describe()

Unnamed: 0,year,it_mlt_main,ny_gdp_mktp_kd_zg,ny_gdp_pcap_kd,ny_gdp_pcap_kd_zg,sp_dyn_le00_in,it_cel_sets,sh_dth_imrt,sp_pop_0014_to_zs,sp_pop_1564_to_zs,...,sm_pop_refg,sm_pop_refg_or,se_pre_enrr,se_prm_enrr,se_sec_enrr,se_ter_enrr,sp_urb_totl_in_zs,year_of_dictatorship,it_cel_sets_pct,dem_dep_cat
count,54.0,54.0,54.0,54.0,54.0,53.0,54.0,53.0,54.0,54.0,...,54.0,44.0,45.0,47.0,48.0,44.0,54.0,54.0,54.0,54.0
mean,0.5,0.495582,0.598345,0.454471,0.580684,0.584159,0.300495,0.400457,0.639431,0.424925,...,0.234785,0.28506,0.583275,0.461663,0.503823,0.415296,0.625826,0.222222,0.297219,0.564325
std,0.296833,0.371507,0.253581,0.285006,0.251324,0.306511,0.397113,0.312154,0.312182,0.305007,...,0.286359,0.286773,0.285145,0.340866,0.30738,0.291252,0.298777,0.419643,0.387227,0.307505
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.127725,0.408545,0.237506,0.387067,0.311164,0.0,0.153873,0.358796,0.133528,...,0.022362,0.056838,0.382746,0.102969,0.188558,0.12107,0.399083,0.0,0.0,0.298956
50%,0.5,0.592232,0.629844,0.351804,0.602166,0.649834,0.019852,0.339057,0.749289,0.427342,...,0.181333,0.150089,0.622416,0.467914,0.491793,0.411808,0.705661,0.0,0.023463,0.555419
75%,0.75,0.899366,0.787068,0.732472,0.764295,0.860087,0.804525,0.528825,0.900832,0.686129,...,0.260075,0.490009,0.820963,0.756917,0.746621,0.657263,0.878267,0.0,0.778437,0.858138
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Save the processed data (V3)

In [50]:
# Persist the encoded and scaled dataset as version 3, without writing the row index.
arg_di_df_transformed.to_csv(
    '../data/processed/WDICSV_PROCESSED_V3.csv',
    index=False,
)