In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import session_info
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
import sklearn.metrics as metrics
import statsmodels.api as sm

In [None]:
# Capture the environment report (package versions) for reproducibility.
try:
    session_info_output = session_info.show()
except Exception as error:
    # Report the actual failure instead of discarding it.
    print(f"The session info has been requested already: {error}")
    # Reuse the report from an earlier run of this cell when there is one;
    # otherwise fall back to None so the display line below cannot raise
    # NameError on a fresh kernel.
    session_info_output = globals().get("session_info_output")

session_info_output

In [None]:
import janitor # This library generates an error when I execute session_info()

# Import external files

- Extension of the "missing" functions from Pandas
- DataFrame and dictionary functions
- Statistical functions

In [None]:
# Execute the three helper notebooks so that the names they define become
# available in this kernel.
# NOTE(review): get_columns and get_indicator_name used further down are not
# defined in this notebook, so they presumably come from these helpers — confirm.
%run utils/u.0.0-pandas_missing_extension.ipynb
%run utils/u.0.1-df_functions.ipynb
%run utils/u.0.2-statistical-functions.ipynb

# Get the processed data (V3)

In [None]:
# Read the processed (V3) dataset and normalise its column names to snake_case
# through the `clean_names` DataFrame method.
arg_di_df_processed = (
    pd.read_csv('../data/processed/WDICSV_PROCESSED_V3.csv')
    .clean_names(case_type="snake")
)

# Quick sanity check: dimensions first, then per-column non-null counts and dtypes.
print(arg_di_df_processed.shape)
arg_di_df_processed.info()

(54, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ny_gdp_mktp_kd_zg_cat   54 non-null     float64
 1   sp_dyn_le00_in_cat      54 non-null     float64
 2   ny_gdp_pcap_kd_cat      54 non-null     float64
 3   se_sec_enrr_cat         54 non-null     float64
 4   ny_gdp_mktp_kd_grw_cat  54 non-null     float64
 5   it_cel_sets_cat         54 non-null     float64
 6   year                    54 non-null     float64
 7   it_mlt_main             54 non-null     float64
 8   ny_gdp_pcap_kd          54 non-null     float64
 9   sp_dyn_le00_in          53 non-null     float64
 10  sh_dth_imrt             54 non-null     float64
 11  sm_pop_refg             54 non-null     float64
 12  sm_pop_refg_or          54 non-null     float64
 13  se_pre_enrr             54 non-null     float64
 14  se_sec_enrr             54 non-null

# Get column names and types

In [None]:
# Split the frame's columns into the groups used by the rest of the notebook.
# NOTE(review): get_columns is not defined in the visible cells; the meaning of
# the second argument (True) should be confirmed in the helper that defines it.
(
    indicators_names,
    categorical_cols,
    numeric_cols,
    columns_with_missing_values,
) = get_columns(arg_di_df_processed, True)

In [None]:
# Report how many columns fall into each group.
for label, columns in (
    ('Total Columns', indicators_names),
    ('Categorical Columns', categorical_cols),
    ('Numeric Columns', numeric_cols),
):
    print(f'{label}: {len(columns)}')

# Materialise the numeric column names as a list so entries can be removed later.
numeric_cols = list(numeric_cols)

Total Columns: 19
Categorical Columns: 6
Numeric Columns: 13


In [None]:
# Display the dtype of every column of the processed frame.
arg_di_df_processed.dtypes

ny_gdp_mktp_kd_zg_cat     float64
sp_dyn_le00_in_cat        float64
ny_gdp_pcap_kd_cat        float64
se_sec_enrr_cat           float64
ny_gdp_mktp_kd_grw_cat    float64
it_cel_sets_cat           float64
year                      float64
it_mlt_main               float64
ny_gdp_pcap_kd            float64
sp_dyn_le00_in            float64
sh_dth_imrt               float64
sm_pop_refg               float64
sm_pop_refg_or            float64
se_pre_enrr               float64
se_sec_enrr               float64
sp_urb_totl_in_zs         float64
year_of_dictatorship      float64
it_cel_sets_pct           float64
dem_dep_pct               float64
dtype: object

In [None]:
# Display the indicator list returned by get_columns.
indicators_names

['ny_gdp_mktp_kd_zg_cat, GDP growth categories',
 'sp_dyn_le00_in_cat, Life Expectancy categories',
 'ny_gdp_pcap_kd_cat, GDP Per Capita Categories',
 'se_sec_enrr_cat, Secondary school enrollment categories',
 'ny_gdp_mktp_kd_grw_cat, GDP Growth Volatility',
 'it_cel_sets_cat, Mobile Subscription Categories',
 'year, Year',
 'it_mlt_main, Fixed telephone subscriptions',
 'ny_gdp_pcap_kd, GDP per capita (constant 2015 US$)',
 'sp_dyn_le00_in, Life expectancy at birth, total (years)',
 'sh_dth_imrt, Number of infant deaths',
 'sm_pop_refg, Refugee population by country or territory of asylum',
 'sm_pop_refg_or, Refugee population by country or territory of origin',
 'se_pre_enrr, School enrollment, preprimary (% gross)',
 'se_sec_enrr, School enrollment, secondary (% gross)',
 'sp_urb_totl_in_zs, Urban population (% of total population)',
 'year_of_dictatorship, It was a year of military dictatorship',
 'it_cel_sets_pct, Mobile Subscription (% of total population)',
 'dem_dep_pct, Perce

In [None]:
# Display the columns that get_columns reported as having missing values.
columns_with_missing_values

['sp_dyn_le00_in']

# Split target variable

Target variable: sp_dyn_le00_in (Life expectancy at birth, total (years))

In [None]:
# Column code of the variable to be modelled.
target_variable = "sp_dyn_le00_in"
# Look up the label for that code.
# NOTE(review): get_indicator_name is not defined in the visible cells; the
# meaning of the True flag should be confirmed in the helper that defines it.
target_variable_name = get_indicator_name(target_variable, True)

In [None]:
# Neither the target itself nor the 'year' column should be used as a predictor;
# drop each from the numeric column list when it is present.
for excluded in (target_variable, 'year'):
    if excluded in numeric_cols:
        numeric_cols.remove(excluded)

## Split rows with missing values in the target variable

In [None]:
# Boolean mask of the rows whose target value is missing.
target_is_missing = arg_di_df_processed[target_variable].isna()

# Keep an independent copy of those rows so they can be inspected or used
# separately without touching the original frame.
rows_with_na = arg_di_df_processed[target_is_missing].copy()
rows_with_na

Unnamed: 0,ny_gdp_mktp_kd_zg_cat,sp_dyn_le00_in_cat,ny_gdp_pcap_kd_cat,se_sec_enrr_cat,ny_gdp_mktp_kd_grw_cat,it_cel_sets_cat,year,it_mlt_main,ny_gdp_pcap_kd,sp_dyn_le00_in,sh_dth_imrt,sm_pop_refg,sm_pop_refg_or,se_pre_enrr,se_sec_enrr,sp_urb_totl_in_zs,year_of_dictatorship,it_cel_sets_pct,dem_dep_pct
53,1.0,0.0,3.0,0.0,1.0,4.0,2023.0,0.412407,1.011066,,-0.924576,-0.68224,-0.184756,1.036592,1.152048,1.264111,-0.534522,1.418794,-1.852406


In [None]:
# Work only with the rows whose target is known.
complete_cases = arg_di_df_processed.drop(index=rows_with_na.index)

# Predictors are the remaining numeric columns; the response is the target column.
X = complete_cases[numeric_cols]
y = complete_cases[target_variable]

X.shape, y.shape

((53, 11), (53,))

# Apply Logistic Regression

In [None]:
# Logistic regression is a classifier and scikit-learn rejects a continuous
# target, so the class labels are taken from the categorical version of the
# target column ("<target>_cat") for the same rows as X.
# NOTE(review): confirm that `sp_dyn_le00_in_cat` is the intended class label.
y_class = arg_di_df_processed.loc[X.index, f"{target_variable}_cat"].astype(int)

# Hold out part of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.2, random_state=42
)

# Baseline classifier with scikit-learn's default settings.
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)
predictions = logistic_reg.predict(X_test)


In [None]:
# Confusion matrix of the baseline classifier on the held-out rows, with rows
# and columns ordered by the classes the fitted model knows about.
cm = confusion_matrix(y_test, predictions, labels=logistic_reg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logistic_reg.classes_)
disp.plot(cmap="gray")
plt.show()

# The same matrix as an annotated heatmap.
sns.heatmap(cm, annot=True, square=True, cmap="coolwarm")
plt.show()

In [None]:

# Share of held-out rows that the baseline classifier labelled correctly.
print(metrics.accuracy_score(y_test, predictions))

# Per-class probabilities for the held-out rows, kept for later inspection.
test_probabilities = logistic_reg.predict_proba(X_test)

# Coefficients from the first row of `coef_`, labelled with the predictor names.
# NOTE(review): with more than two classes `coef_` has one row per class, so
# this shows a single class only — confirm that is what is wanted.
weights = pd.Series(logistic_reg.coef_[0], index=X.columns.values)
weights


In [None]:
# Standardise the predictors. The scaler is fitted on the training rows only
# and the same fitted transformation is then applied to the test rows.
# Both names are re-bound to the arrays returned by the scaler.
st_x = StandardScaler()
X_train = st_x.fit_transform(X_train)
X_test = st_x.transform(X_test)


In [None]:

def logistic_model(C_, solver_, multiclass_):
    """Build an unfitted ``LogisticRegression`` for one hyper-parameter combination.

    Parameters
    ----------
    C_ : value forwarded as ``C``.
    solver_ : value forwarded as ``solver``.
    multiclass_ : value forwarded as ``multi_class``.

    Returns
    -------
    A new ``LogisticRegression`` created with ``random_state=42`` and
    ``n_jobs=-1`` in addition to the three arguments above.

    NOTE(review): ``multi_class`` is reported as deprecated in recent
    scikit-learn releases — confirm against the installed version.
    """
    hyperparameters = {
        "C": C_,
        "solver": solver_,
        "multi_class": multiclass_,
        "random_state": 42,
        "n_jobs": -1,
    }
    return LogisticRegression(**hyperparameters)


In [None]:

# Evaluate every multi-class strategy / solver pair and record its test accuracy.
multiclass = ['ovr', 'multinomial']
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
scores = []
params = []
for multiclass_mode in multiclass:
    for solver_name in solver_list:
        try:
            model = logistic_model(1, solver_name, multiclass_mode)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
        except ValueError as error:
            # scikit-learn raises ValueError for combinations it does not support;
            # report and skip those. Any other exception (for example a missing
            # name) propagates instead of being silently swallowed.
            print(f"Skipping {multiclass_mode}-{solver_name}: {error}")
            continue

        # Score first and append both lists together, so `params` and `scores`
        # always have the same length for the plot that follows.
        accuracy = accuracy_score(y_test, predictions)
        params.append(multiclass_mode + "-" + solver_name)
        scores.append(accuracy)


In [None]:

# Compare the test accuracy of every multi-class strategy / solver pair.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=params, y=scores, ax=ax)
# The previous title ("Beans Accuracy") referred to a different dataset.
ax.set_title("Logistic regression accuracy by multi-class strategy and solver")
ax.set_xlabel("multi_class-solver")
ax.set_ylabel("Accuracy")
# The combination names are long, so rotate them to keep them readable.
ax.tick_params(axis="x", labelrotation=90)
plt.show()