# Financial Inclusion in Africa

Here we describe and implement a solution for predicting whether an individual, described by a set
of features, is likely to have a bank account or not.

In [145]:
# for data processing
import pandas as pd
import seaborn as sns

# for display
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# for machine learning
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from skopt import forest_minimize
from category_encoders import TargetEncoder

# for io
import warnings

warnings.filterwarnings('ignore')

##### Read files

In [146]:
# Load the labelled training set and the unlabelled test set into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('assets/Train.csv', 'assets/Test.csv')
)

##### A Preview of the training data

In [147]:
# Show the first five rows to check the column names and value formats.
train.head(n=5)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


## Combine train and test set for easy preprocessing

In [148]:
# Encode the target as integers: 'No' -> 0, 'Yes' -> 1.
# The guard makes re-running this cell a no-op: applying .map() a second time
# to the already-encoded column would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(train['bank_account']):
    train['bank_account'] = train['bank_account'].map({'No': 0, 'Yes': 1})

In [149]:
# Stack the training rows on top of the test rows so both go through the same
# preprocessing. ntrain records the boundary used later to split them again.
ntrain = len(train)

all_data = pd.concat([train, test], ignore_index=True)
print(f'The shape of the combined dataframe is: {all_data.shape}')

The shape of the combined dataframe is: (33610, 13)


### Feature Engineering

In [150]:
# String-valued columns of the raw data. Kept for reference only: the encoder
# below is applied to every feature column, not just these.
cat_cols = [
    'country',
    'location_type',
    'cellphone_access',
    'gender_of_respondent',
    'relationship_with_head',
    'marital_status',
    'education_level',
    'job_type'
]

# Every column except the row identifier and the target is used as a feature.
main_cols = all_data.columns.difference(['uniqueid', 'bank_account'])

# Target-encode the features.
# The encoder is supervised, so it is fitted only on rows that have a label:
# the rows that came from the test file carry NaN in 'bank_account' and must
# not take part in fitting. The fitted encoder is then applied to all rows.
# NOTE(review): the hold-out split is made after this step, so the encoding
# still sees the targets of the rows later used for validation.
labelled = all_data['bank_account'].notna()
encoder = TargetEncoder(cols=list(main_cols))
encoder.fit(all_data.loc[labelled, main_cols], all_data.loc[labelled, 'bank_account'])

temp = encoder.transform(all_data[main_cols])
temp['bank_account'] = all_data['bank_account']
all_data = temp
all_data.head()

Unnamed: 0,age_of_respondent,cellphone_access,country,education_level,gender_of_respondent,household_size,job_type,location_type,marital_status,relationship_with_head,year,bank_account
0,0.144128,0.183797,0.250659,0.232773,0.106795,0.142602,0.131738,0.116503,0.173598,0.106595,0.208349,1.0
1,0.112727,0.017133,0.250659,0.038981,0.106795,0.149667,0.202429,0.116503,0.090103,0.177149,0.208349,0.0
2,0.18374,0.183797,0.250659,0.570361,0.189696,0.149667,0.131738,0.178739,0.121007,0.098802,0.208349,1.0
3,0.164751,0.183797,0.250659,0.085451,0.106795,0.149667,0.541232,0.116503,0.173598,0.177149,0.208349,0.0
4,0.18374,0.017133,0.250659,0.085451,0.189696,0.114514,0.079507,0.178739,0.121007,0.086586,0.208349,0.0


In [151]:
# The first ntrain rows of the combined frame are the labelled training rows;
# everything after them came from the test file.
train_df, test_df = all_data.iloc[:ntrain], all_data.iloc[ntrain:]

We select the columns that make up our input data set.
- We remove `uniqueid` because it is only a row identifier and carries no information about the output.
- We also remove `bank_account` from the inputs because it is the target we are trying to predict.
- Removing `year` is also worth considering, because time-related information may cause the model to overfit; note that `main_cols` as defined above still includes `year`.

In [152]:
# Feature matrix and target vector for the labelled rows.
x, y = train_df[main_cols], train_df['bank_account']

# Hold out 30% of the labelled rows for validation (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)
x_train.head()

Unnamed: 0,age_of_respondent,cellphone_access,country,education_level,gender_of_respondent,household_size,job_type,location_type,marital_status,relationship_with_head,year
19929,0.147727,0.183797,0.091692,0.085451,0.106795,0.118378,0.094974,0.178739,0.121007,0.106595,0.091692
13449,0.166667,0.183797,0.114825,0.232773,0.106795,0.144214,0.116706,0.116503,0.090103,0.177149,0.114825
5449,0.144385,0.183797,0.250659,0.232773,0.189696,0.144214,0.131738,0.178739,0.173598,0.177149,0.208349
6956,0.136691,0.183797,0.114825,0.038981,0.106795,0.149667,0.116706,0.116503,0.090103,0.177149,0.114825
2737,0.144128,0.017133,0.250659,0.085451,0.106795,0.149667,0.116706,0.116503,0.173598,0.106595,0.208349


### Model Training & Optimization

We now start modelling and training. We are going to use XGBoost's `XGBClassifier` to construct and
fit a model on our input data set.

We will place the modelling process into its own function so that we can later optimize the hyper
params.

In [153]:
def modeller(params):
    """Score one hyper-parameter configuration with 10-fold cross-validation.

    Args:
        params: indexable sequence holding, in this order, learning_rate,
            n_estimators, max_depth, min_child_weight, gamma, subsample and
            colsample_bytree.

    Returns:
        The negated mean ROC-AUC over the folds, so that a minimiser searching
        over `params` ends up maximising the AUC. The mean AUC is also printed.
    """
    hyper_names = (
        'learning_rate',
        'n_estimators',
        'max_depth',
        'min_child_weight',
        'gamma',
        'subsample',
        'colsample_bytree',
    )
    hyper = {name: params[pos] for pos, name in enumerate(hyper_names)}
    classifier = XGBClassifier(seed=42, verbosity=0, **hyper)

    # Cross-validate on the training split only (x_train / y_train are globals).
    fold_scores = cross_val_score(classifier, x_train, y_train, cv=10, scoring='roc_auc')
    mean_auc = fold_scores.mean()
    print(mean_auc)
    return -mean_auc

# Search space for the optimiser: one entry per hyper-parameter, listed in the
# same order in which modeller() reads them from `params`.
space = [
    (1e-3, 1e-1, 'log-uniform'),  # learning_rate
    (100, 2400),                  # n_estimators
    (1, 10),                      # max_depth
    (1, 6.),                      # min_child_weight
    (0, 0.5),                     # gamma
    (0.1, 1.),                    # subsample
    (0.1, 1.),                    # colsample_bytree
]

# Hard-coded configuration, presumably the output of an earlier run of the
# search below, so the optimisation does not have to be repeated on every run:
# forest_minimize(modeller, space, random_state = 42, n_random_starts = 20, n_calls  = 20, verbose = 1).x
result = [
    0.0124869455295219,   # learning_rate
    2180,                 # n_estimators
    6,                    # max_depth
    6.,                   # min_child_weight
    0.01297998739050575,  # gamma
    0.2336707906620142,   # subsample
    0.2859693229517501,   # colsample_bytree
]
# print(result)

### Making predictions of the test set and creating a submission file

Once an optimized hyperparameter configuration is found we then model it and fit our data.

In [154]:
# Final classifier, built from the tuned hyper-parameters stored in `result`.
model = XGBClassifier(learning_rate = result[0],
                        n_estimators = result[1],
                        max_depth = result[2],
                        min_child_weight = result[3],
                        gamma = result[4],
                        subsample = result[5],
                        colsample_bytree = result[6],
                        seed = 42)

# Track the loss on the rows the model is fitted on ('validation_0') and on the
# hold-out rows ('validation_1'). The first entry has to be (x_train, y_train):
# the full (x, y) also contains the hold-out rows, which would blur the curve
# that is reported as the training loss.
eval_s = [(x_train, y_train), (x_test, y_test)]
model.fit(x_train, y_train, eval_set=eval_s, verbose=0)

y_pred = model.predict(x_test)

# accuracy_score expects (y_true, y_pred)
print(f'XGB model accuracy: {accuracy_score(y_test, y_pred)*100}%')

# Make prediction on the test set.
# test_df was sliced from the already-encoded combined frame, so its features
# are used as they are. Re-fitting the encoder here would encode the encoded
# values a second time, against a target that is entirely NaN for these rows.
test_data = test_df[main_cols].copy()
test_data['bank_account'] = model.predict(test_data).astype(int)

# Create a submission file
sub_file = test_data.copy()
test_df.head()

XGB model accuracy: 89.31708699348258%


Unnamed: 0,age_of_respondent,cellphone_access,country,education_level,gender_of_respondent,household_size,job_type,location_type,marital_status,relationship_with_head,year,bank_account
23524,0.167614,0.183797,0.250659,0.232773,0.189696,0.142602,0.775194,0.178739,0.173598,0.177149,0.208349,
23525,0.154206,0.183797,0.250659,0.570361,0.189696,0.126429,0.541232,0.178739,0.173598,0.177149,0.208349,
23526,0.107143,0.017133,0.250659,0.038981,0.106795,0.142602,0.094974,0.116503,0.173598,0.059853,0.208349,
23527,0.144385,0.017133,0.250659,0.085451,0.106795,0.132156,0.094974,0.116503,0.173598,0.177149,0.208349,
23528,0.010823,0.017133,0.250659,0.232773,0.189696,0.142602,0.094974,0.178739,0.121007,0.086586,0.208349,


In [None]:
# Build the submission identifier "<uniqueid> x <country>" from the raw test
# frame: sub_file no longer has a 'uniqueid' column and its 'country' column
# holds target-encoded numbers, not country names.
assert len(test) == len(sub_file), 'sub_file must have exactly one row per test row'

# .to_numpy() assigns by position; sub_file's index starts at ntrain while
# test's index starts at 0, so index alignment would produce only NaN.
sub_file['unique_id'] = (
    test['uniqueid'].astype(str) + ' x ' + test['country'].astype(str)
).to_numpy()
sub_file = sub_file.filter(['unique_id', 'bank_account'], axis=1)
# Check the distribution of your predictions
# sns.countplot(x=sub_file['bank_account']);

In [None]:
# Write the submission to disk, without the index column, ready to upload to Zindi.
sub_file.to_csv(path_or_buf='baseline.csv', index=False)
# files.download('baseline.csv')

### Monitoring the Learning Curve

We plot the log-loss recorded during training for each evaluation set; this learning curve helps us tune and optimize our learning algorithm.

In [None]:
# Per-round metrics recorded for each eval_set entry passed to model.fit().
results = model.evals_result()

# One log-loss curve per evaluation set, in the order they were passed to fit().
for eval_key, curve_label in (('validation_0', 'train'), ('validation_1', 'test')):
    plt.plot(results[eval_key]['logloss'], label=curve_label)

# show the legend, then render the figure
plt.legend()
plt.show()