# __Predicting Extreme Poverty on a country-scale__

In [36]:
import pandas as pd 
import warnings
import plots as plot
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
import category_encoders as ec
from sklearn import preprocessing
import numpy as np

# NOTE(review): no category or module filter is given, so this silences every
# warning for the rest of the notebook (pandas / scikit-learn deprecations included).
warnings.filterwarnings('ignore')

## Index
1. Data Exploration and Data Preparation
    * 1.1 Data Transformation
    * 1.2 Target Column
    * 1.3 Feature Overview
    * 1.4 Data Imputation
    * 1.5 Data Visualization & Exploration
2. Answering the 3 Questions
    * 2.1 What percentage of the world population lives in extreme poverty?
    * 2.2 Which characteristics are predictive for countries with large populations living in extreme poverty?
    * 2.3 Which characteristics are predictive for populations emerging from extreme poverty?
3. Conclusions
    * 3.1 Comparing Q1 and Q2
4. Future Work
    * 4.1 More Data

# 1 Data Exploration and Data Preparation

## The Data Set
blabla

## 1.1 Data Transformation
TODO How was it transformed

In [2]:
# READ SOURCE CSV
# The code below relies on the columns LOCATION, TIME, DEMO_IND and Value.
raw = pd.read_csv("unesco_poverty_dataset.csv")
keys = raw.DEMO_IND.unique()

# DEFINE BASE TABLE
# One row per (LOCATION, TIME) pair. De-duplicating up front keeps every merge
# below from carrying the repeated key rows along; the resulting rows and their
# order are the same, only the row labels become consecutive.
base = raw[['LOCATION', 'TIME']].drop_duplicates()

# FOR EVERY VAR JOIN ON LOCATION & TIME
for key in keys:
    # Rename the value column before merging so that only this indicator's
    # column is touched (no substring replacement across all column names).
    indicator_values = (
        raw.loc[raw.DEMO_IND == key, ['LOCATION', 'TIME', 'Value']]
        .rename(columns={'Value': key})
    )
    base = pd.merge(base, indicator_values, how='left', on=['LOCATION', 'TIME'])

# DROP DUPLICATES
# Still needed when an indicator repeats the same value for one (LOCATION, TIME) pair.
base = base.drop_duplicates()

## 1.2 Target Column
_what is the target column supposed to be?_    
There are 3 different metrics to count the GNI of a state: __LCU, Atlas, PPP__, so which is the correct one?   

The Poverty Threshold changed throughout the years: 
* 1/day in 1996 (measure unknown)   
* 1.25/day in 2005 (measure unknown, presumably Atlas)   
* 1.9/day in 2015 (PPP)   
_the $ values being average per capita income of a person per day_   

[World Bank Press Release, October 2015](https://www.worldbank.org/en/news/press-release/2015/10/04/world-bank-forecasts-global-poverty-to-fall-below-10-for-first-time-major-hurdles-remain-in-goal-to-end-poverty-by-2030)

In [3]:
# SEPARATE INTO 3 SUB-TABLES: 1970-2004, 2005-2014, 2015-2019
# .copy() gives each period its own frame, so adding the target column below
# is a plain column assignment instead of a write to a slice of `base`.
sub_0 = base[base['TIME'] < 2005].copy()
sub_1 = base[(base['TIME'] >= 2005) & (base['TIME'] < 2015)].copy()
sub_2 = base[base['TIME'] >= 2015].copy()

# WRITE TARGET VARIABLES
# Yearly per-capita GNI is converted to a per-day figure and compared with the
# threshold used for that period. A missing GNI value compares as False, i.e.
# such rows are labelled "not poor".
sub_0['poverty'] = (sub_0['NY_GNP_PCAP_CN'] / 365) < 1
sub_1['poverty'] = (sub_1['NY_GNP_PCAP_CD'] / 365) < 1.25
sub_2['poverty'] = (sub_2['NY_GNP_PCAP_PP_CD'] / 365) < 1.9

# RE-CONCAT SUB-DATAFRAMES
base = pd.concat([sub_0, sub_1, sub_2])

# SHOW HOW MANY COUNTRIES WERE POOR AT LEAST ONCE
poor = base[base['poverty']]
n_poor_countries = poor['LOCATION'].drop_duplicates().shape[0]
n_countries = base['LOCATION'].drop_duplicates().shape[0]
perc_poor_countries_ever = round(n_poor_countries / n_countries * 100, 2)

print('From 1970-2019, all countries considered,', perc_poor_countries_ever, '% have lived in extreme poverty at least once.')

From 1970-2019, all countries considered, 29.61 % have lived in extreme poverty at least once.


In [6]:
# PLOT
# plot.combined_line_chart(base.copy())

## 1.3 Feature Overview
bla bla

In [8]:
# GET DATA PER COLUMN
# Missing-value count and share, plus the observed minimum and maximum of every column in `base`.
na_total = [base[col].isna().sum() for col in base.columns]
na_percent = [round(n_missing / base.shape[0] * 100, 2) for n_missing in na_total]
minimum = [base[col].min() for col in base.columns]
maximum = [base[col].max() for col in base.columns]

# GET VARIABLE DESCRIPTIONS
# Look each column up by its indicator code instead of relying on list positions
# (the previous hard-coded insert index only worked for one specific column count).
# Columns without an indicator entry (LOCATION, TIME, poverty) are described by their own name.
indicator_by_code = (
    raw.drop_duplicates('DEMO_IND')
    .set_index('DEMO_IND')['Indicator']
    .to_dict()
)
descriptions = [indicator_by_code.get(col, col) for col in base.columns]

features = pd.DataFrame(
    {'descriptions': descriptions,
    'na_percent': na_percent,
    'na_total': na_total,
    'minimum': minimum,
    'maximum': maximum},
    index=base.columns)

# features

## 1.4 Data Imputation

In [23]:
# READ TRANSFORMED CSV FILE
raw = pd.read_csv("transformed.csv")
feature_descriptions = pd.read_csv("feature_descriptions.csv")

# FEATURES WITH LESS THAN 50% MISSING VALUES
# where() blanks out the rows that fail the threshold and dropna() removes them
# (together with any row that has a NaN in another column). The axis is passed
# by keyword; recent pandas releases do not accept it positionally.
features = feature_descriptions.where(feature_descriptions['na_percent'] <= 50.0).dropna(axis=0)

# ONLY DEMOGRAPHIC FEATURES!
#cols_to_drop = 7:13 + 18:25
cols = features['Unnamed: 0'].tolist()
cols = cols[0:7] + cols[13:18] + [cols[25]]
dataset = raw[cols]

# Columns that identify a row or hold the target; they are never interpolated.
key_cols = ['LOCATION', 'TIME', 'poverty']
value_cols = [col for col in cols if col not in key_cols]

# Column means over all countries. They do not depend on the country being
# processed, so they are computed once instead of once per group.
global_means = dataset.drop(labels=['LOCATION'], axis=1).mean(numeric_only=True)

by_country = dataset.groupby(by=dataset['LOCATION'])
zero_filled_parts = []   # per-country frames for dataset_full
mean_filled_parts = []   # per-country frames for dataset_full2

for name, group in by_country:
    # Variant 1: features without a single observation for this country become 0,
    # the others are interpolated and remaining gaps get the country's own mean.
    all_null = group.isna().all()
    null_cols = all_null[all_null].index.tolist()
    cols_to_int = [col for col in all_null[~all_null].index if col not in key_cols]

    tdf = group.copy()
    tdf[null_cols] = 0
    tdf[cols_to_int] = group[cols_to_int].interpolate(method='linear', axis=0)
    # numeric_only keeps the string LOCATION column out of the mean computation
    tdf = tdf.fillna(tdf.mean(numeric_only=True))
    zero_filled_parts.append(tdf)

    # Variant 2: interpolate forward, then backward, and fill whatever is still
    # missing (features with no observation for this country) with the global mean.
    tdf2 = group.copy()
    tdf2[value_cols] = (
        group[value_cols]
        .interpolate(method='linear', limit_direction='forward', axis=0)
        .interpolate(method='linear', limit_direction='backward', axis=0)
    )
    tdf2 = tdf2.fillna(global_means)
    mean_filled_parts.append(tdf2)

# Concatenate once instead of growing the frames inside the loop.
# NA -> mean
dataset_full2 = (pd.concat(mean_filled_parts) if mean_filled_parts else pd.DataFrame(columns=cols)).sort_index()
# NA -> 0
dataset_full = (pd.concat(zero_filled_parts) if zero_filled_parts else pd.DataFrame(columns=cols)).sort_index()

# dataset_full2.head(100)

## 1.5 Visualizations & Data Exploration

In [17]:
# TODO: placeholder cell, nothing is implemented for this section yet
pass

# 2 Answering the Questions

## 2.1 - What percentage of the world population lives in extreme poverty?
bla bla

In [20]:
# TODO: placeholder cell, nothing is implemented for this section yet
pass

## 2.2 Which characteristics are predictive for countries with large populations living in extreme poverty?

bla bla

### Training the model

In [62]:
# GROUND TRUTH AS NUMERIC
# Target and features are read from the same frame so that their rows are
# aligned by construction rather than by two separately sorted frames.
y = (dataset_full2['poverty'] == True).astype(int)
X_2 = dataset_full2.drop(labels=['LOCATION', 'poverty'], axis=1)

# FUNCTIONS FOR ML 
def print_performance(classifier, X, y, scores=('accuracy', 'precision', 'recall'), model='', cv=10):
    """Print the cross-validated mean and standard deviation of each score.

    Parameters
    ----------
    classifier : estimator handed to ``cross_val_score``.
    X, y : features and target handed to ``cross_val_score``.
    scores : iterable of scoring names; one line is printed per entry.
    model : label printed in front of every line.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 10).

    Returns
    -------
    None. The results are only printed.
    """
    for score in scores:
        fold_scores = cross_val_score(classifier, X, y, cv=cv, scoring=score)
        mean_score = fold_scores.mean()
        std_score = fold_scores.std()
        print(model + ' ' + score + " : " + str(round(mean_score, 5)) + ' +- ' + str(round(std_score, 5)))

def r_classifier(X, y, alpha=1.0, fit_intercept=True, normalize=True, solver='auto', max_iter=1000, tol=0.0001):
    """Cross-validate and then fit a ridge classifier.

    The accuracy of the configured classifier is printed via
    ``print_performance`` before the classifier is fitted on all of ``X``/``y``.

    Parameters
    ----------
    X, y : features and target.
    alpha, fit_intercept, normalize, solver, max_iter, tol :
        forwarded unchanged to ``linear_model.RidgeClassifier``.

    Returns
    -------
    The fitted ``RidgeClassifier``.
    """
    # `tol` and `solver` are now forwarded; previously the arguments were ignored
    # in favour of the hard-coded values 0.001 and 'auto'.
    # NOTE(review): callers relying on the defaults now get tol=0.0001 instead of
    # 0.001; presumably this only matters for iterative solvers - confirm.
    # NOTE(review): newer scikit-learn releases no longer accept `normalize` -
    # confirm against the installed version.
    reg = linear_model.RidgeClassifier(
        alpha=alpha,
        fit_intercept=fit_intercept,
        normalize=normalize,
        max_iter=max_iter,
        tol=tol,
        solver=solver,
        random_state=30,
    )
    print_performance(reg, X, y, model='Ridge Calssifier', scores=['accuracy'])
    reg.fit(X, y)
    return reg

# Cross-validate and fit the ridge classifier on the imputed features
ridge_2 = r_classifier(
    X_2,
    y,
    alpha=0.1,
)

Ridge Calssifier accuracy : 0.94262 +- 0.00034


### Finding the most predictive features

In [59]:
# Coefficients of the fitted model and each coefficient's signed share of the
# summed absolute coefficients
R2_coef = np.array(ridge_2.coef_)
X2_cols = X_2.columns
R2_relativ = R2_coef / np.abs(R2_coef).sum()

# One entry per feature column, taken from the first row of the coefficient arrays
positions = range(len(X2_cols))
table = {
    'col': X2_cols,
    'absolute': [round(R2_coef[0, pos], 6) for pos in positions],
    'relative': [round(R2_relativ[0, pos], 6) for pos in positions],
}

weights = pd.DataFrame.from_dict(table)
weights

Unnamed: 0,col,absolute,relative
0,TIME,-0.001943,-0.075534
1,SP_DYN_TFRT_IN,0.012889,0.501145
2,SP_DYN_LE00_IN,-0.003791,-0.147391
3,SP_DYN_IMRT_IN,0.000964,0.037497
4,SP_POP_GROW,-0.00555,-0.215784
5,SP_RUR_TOTL_ZS,-0.00058,-0.022568
6,200101,0.0,0.0
7,200151,1e-06,3.2e-05
8,200345,-0.0,-1e-05
9,200343,1e-06,2.8e-05


In [63]:
# TODO visualize

## 2.3 - Which characteristics are predictive for populations emerging from extreme poverty?
bla bla

In [22]:
# TODO: placeholder cell, nothing is implemented for this section yet
pass