<a href="https://colab.research.google.com/github/aissam-out/Financial-Inclusion-in-Africa/blob/master/financial_inclusion_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Resources

In [0]:
import warnings
import numpy as np
import pandas as pd 
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
%matplotlib inline

In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive #if needed
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Train data
downloaded_train = drive.CreateFile({'id':'********************'}) 
downloaded_train.GetContentFile('Train_v2.csv')

# Test data
downloaded_train = drive.CreateFile({'id':'********************'}) 
downloaded_train.GetContentFile('Test_v2.csv')

# Submission file
downloaded_train = drive.CreateFile({'id':'********************'}) 
downloaded_train.GetContentFile('SubmissionFile.csv')

# Variable definitions
downloaded_train = drive.CreateFile({'id':'*********************'}) 
downloaded_train.GetContentFile('VariableDefinitions.csv')

# loading the dataset
df_train = pd.read_csv("Train_v2.csv")
df_test = pd.read_csv("Test_v2.csv")
submission_file = pd.read_csv("SubmissionFile.csv")
variables = pd.read_csv("VariableDefinitions.csv")

In [0]:
data = df_train.copy()
test = df_test.copy()

# Data preprocessing

In [0]:
data.head(1)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed


In [0]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
country                   23524 non-null object
year                      23524 non-null int64
uniqueid                  23524 non-null object
bank_account              23524 non-null object
location_type             23524 non-null object
cellphone_access          23524 non-null object
household_size            23524 non-null int64
age_of_respondent         23524 non-null int64
gender_of_respondent      23524 non-null object
relationship_with_head    23524 non-null object
marital_status            23524 non-null object
education_level           23524 non-null object
job_type                  23524 non-null object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB
None


In [6]:
# Convert target label to numerical Data
le = LabelEncoder()
data['bank_account'] = le.fit_transform(data['bank_account'])
data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,0,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,1,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,0,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,0,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [0]:
# Convert the following numerical labels from integer to float
data['year_'] = data['year']
test['year_'] = test['year']

float_array = data[['household_size', 'age_of_respondent', 'year_']].values.astype(float)
float_array = test[['household_size', 'age_of_respondent', 'year_']].values.astype(float)

In [8]:
# convert categorical features to numerical features - One Hot Encoding
data['country_'] = data['country']
test['country_'] = test['country']

categ = ['relationship_with_head', 'marital_status', 'education_level', 'job_type', 'country_']

# One Hot Encoding conversion
data = pd.get_dummies(data, prefix_sep='_', columns = categ)
test = pd.get_dummies(test, prefix_sep='_', columns = categ)

# Labelncoder conversion
data['location_type'] = le.fit_transform(data['location_type'])
data['cellphone_access'] = le.fit_transform(data['cellphone_access'])
data['gender_of_respondent'] = le.fit_transform(data['gender_of_respondent'])


test['location_type'] = le.fit_transform(test['location_type'])
test['cellphone_access'] = le.fit_transform(test['cellphone_access'])
test['gender_of_respondent'] = le.fit_transform(test['gender_of_respondent'])


data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,year_,relationship_with_head_Child,relationship_with_head_Head of Household,relationship_with_head_Other non-relatives,relationship_with_head_Other relative,relationship_with_head_Parent,relationship_with_head_Spouse,marital_status_Divorced/Seperated,marital_status_Dont know,marital_status_Married/Living together,marital_status_Single/Never Married,marital_status_Widowed,education_level_No formal education,education_level_Other/Dont know/RTA,education_level_Primary education,education_level_Secondary education,education_level_Tertiary education,education_level_Vocational/Specialised training,job_type_Dont Know/Refuse to answer,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed,country__Kenya,country__Rwanda,country__Tanzania,country__Uganda
0,Kenya,2018,uniqueid_1,1,0,1,3,24,0,2018,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
1,Kenya,2018,uniqueid_2,0,0,0,5,70,0,2018,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
2,Kenya,2018,uniqueid_3,1,1,1,5,26,1,2018,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,Kenya,2018,uniqueid_4,0,0,1,5,34,0,2018,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
4,Kenya,2018,uniqueid_5,0,1,0,8,26,1,2018,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [0]:
# Separate training features from target
X_train = data.drop(['year', 'uniqueid', 'bank_account', 'country'], axis=1)
y_train = data['bank_account']

X_test = test.drop(['year', 'uniqueid', 'country'], axis=1)

#rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_rescaled = scaler.fit_transform(X_train)
X_test_rescaled = scaler.fit_transform(X_test)

In [11]:
data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,year_,relationship_with_head_Child,relationship_with_head_Head of Household,relationship_with_head_Other non-relatives,relationship_with_head_Other relative,relationship_with_head_Parent,relationship_with_head_Spouse,marital_status_Divorced/Seperated,marital_status_Dont know,marital_status_Married/Living together,marital_status_Single/Never Married,marital_status_Widowed,education_level_No formal education,education_level_Other/Dont know/RTA,education_level_Primary education,education_level_Secondary education,education_level_Tertiary education,education_level_Vocational/Specialised training,job_type_Dont Know/Refuse to answer,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed,country__Kenya,country__Rwanda,country__Tanzania,country__Uganda
0,Kenya,2018,uniqueid_1,1,0,1,3,24,0,2018,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
1,Kenya,2018,uniqueid_2,0,0,0,5,70,0,2018,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
2,Kenya,2018,uniqueid_3,1,1,1,5,26,1,2018,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,Kenya,2018,uniqueid_4,0,0,1,5,34,0,2018,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
4,Kenya,2018,uniqueid_5,0,1,0,8,26,1,2018,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [0]:
# Split train data with stratification
X_Train, X_val, y_Train, y_val = train_test_split(X_train_rescaled, y_train, stratify = y_train, test_size = 0.2, random_state=42)

# Training

In [17]:
# import XGBClassifier
from xgboost import XGBClassifier

my_model = XGBClassifier()

# model paramaters 
param_grid = {'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

my_model2 = GridSearchCV(my_model, param_grid)
my_model2.fit(X_Train, y_Train)
print(my_model2.best_params_)



{'colsample_bytree': 0.8, 'gamma': 2, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.6}


In [23]:
# fit and Evaluate model
my_model3 = XGBClassifier(min_child_weight = 5, gamma = 0.5, subsample = 0.6, colsample_bytree = 0.6, max_depth = 5)
my_model3.fit(X_Train, y_Train)
y_pred = my_model3.predict(X_val)

# Get error rate
print("Error rate of Random Forest classifier: ", 1 - accuracy_score(y_val, y_pred))

# Get confusion matrix
confusion_matrix(y_pred, y_val)

Error rate of Random Forest classifier:  0.11052072263549417


array([[3942,  419],
       [ 101,  243]])

In [0]:
# Get the predicted result for the test Data
test.bank_account = my_model3.predict(X_test_rescaled)

In [0]:
# create submission DataFrame
submission_df3 = pd.DataFrame({"uniqueid": test["uniqueid"] + " x " + test["country"], "bank_account": test.bank_account})

In [26]:
submission_df3.head()

Unnamed: 0,uniqueid,bank_account
0,uniqueid_6056 x Kenya,1
1,uniqueid_6060 x Kenya,1
2,uniqueid_6065 x Kenya,0
3,uniqueid_6072 x Kenya,0
4,uniqueid_6073 x Kenya,0


# Save results

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

submission_df3.to_csv("submission.csv", header=True, index=False)

%cp submission.csv gdrive/'My Drive'/'Colab Notebooks'/zindi