In [None]:
!unzip financial-inclusion-in-africa.zip

Archive:  financial-inclusion-in-africa.zip
  inflating: StarterNotebook.ipynb   
  inflating: Train.csv               
  inflating: Test.csv                
  inflating: VariableDefinitions.csv  
  inflating: SampleSubmission.csv    


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.1-py2.py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.4.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.4.0 scikit-optimize-0.10.1


In [None]:
import pandas as pd
from catboost import CatBoostClassifier,Pool,utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
import matplotlib.pyplot as plt
import numpy as np
from functools import partial
from skopt import space
from skopt import gp_minimize
from sklearn import model_selection

In [None]:
# Load the training and test tables that were extracted from the archive.
train = pd.read_csv(filepath_or_buffer='Train.csv')
test = pd.read_csv(filepath_or_buffer='Test.csv')

In [None]:
# Composite submission key "<uniqueid> x <country>"; built from the raw test frame,
# i.e. before `country` is replaced by integer codes.
unique_ids = test["uniqueid"].add(" x ").add(test["country"])

In [None]:
def preprocess(df):
  """Integer-encode the categorical columns and drop the identifier columns.

  The input frame is left untouched; all work is done on a copy.

  Each column is encoded on its own with ``astype("category").cat.codes``, so the
  codes are derived from the values present in *this* frame. Two frames only get
  matching codes when they contain the same set of values in that column.

  Args:
    df: DataFrame holding the raw survey columns. ``bank_account`` is optional
      (it is encoded only when present); ``uniqueid`` and ``year`` must exist.

  Returns:
    A new DataFrame with the categorical columns replaced by their integer codes
    and without the ``uniqueid`` and ``year`` columns.

  Raises:
    KeyError: if one of the required columns is missing.
  """
  categorical_columns = [
      "country",
      "gender_of_respondent",
      "relationship_with_head",
      "marital_status",
      "education_level",
      "job_type",
      "cellphone_access",
      "location_type",
  ]
  # The target only exists in the training frame.
  if "bank_account" in df.columns:
    categorical_columns = ["bank_account"] + categorical_columns

  # Work on a copy so the caller's frame is not modified as a side effect.
  encoded = df.copy()
  for column in categorical_columns:
    encoded[column] = encoded[column].astype("category").cat.codes
  return encoded.drop(columns=["uniqueid","year"])

In [None]:
# Run the same encoding routine on both frames (each frame is encoded independently).
train = train.pipe(preprocess)
test = test.pipe(preprocess)

In [None]:
# Target column first, then every remaining column as the feature matrix.
y = train["bank_account"]
X = train.drop(columns="bank_account")

In [None]:
def optimize(param,param_names,X,y):
  """Objective for the hyper-parameter search: mean cross-validated MAE.

  Builds a CatBoostClassifier from the given values, evaluates it with 5-fold
  cross-validation and returns the average error, which the caller minimises.

  Args:
    param: hyper-parameter values, in the same order as ``param_names``.
    param_names: CatBoostClassifier keyword names matching ``param``.
    X: feature DataFrame (rows are selected positionally with ``.iloc``).
    y: target Series aligned row by row with ``X``.

  Returns:
    Mean over the folds of the mean absolute error between the held-out labels
    and the predicted labels.
  """
  params = dict(zip(param_names,param))
  params["verbose"] = False
  params["task_type"]="GPU"

  # Plain KFold without shuffling cuts the rows into contiguous blocks, which makes
  # the folds depend on the order of the file. Shuffled, stratified folds keep the
  # class balance in every fold; the fixed seed keeps the folds identical between
  # calls so that scores of different parameter sets are comparable.
  kf = model_selection.StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
  scores = []
  for train_idx,test_idx in kf.split(X,y):
    X_train,X_test = X.iloc[train_idx],X.iloc[test_idx]
    y_train,y_test = y.iloc[train_idx],y.iloc[test_idx]
    # A fresh model per fold so that no state is carried over between fits.
    model = CatBoostClassifier(**params)
    model.fit(Pool(X_train,y_train))
    pred = model.predict(X_test)
    scores.append(mae(y_test,pred))
  return np.mean(scores)

In [None]:
# Search space handed to gp_minimize; every dimension carries the name of the
# CatBoostClassifier keyword argument it tunes.
params_space = [
    space.Integer(100, 500, name="iterations"),
    space.Real(0.1, 1, prior="uniform", name="learning_rate"),
    space.Integer(6, 16, name="depth"),
    space.Categorical(["Logloss"], name="loss_function"),
    space.Categorical(["SymmetricTree", "Depthwise", "Lossguide"], name="grow_policy"),
]
# Keyword names in the same order as the dimensions above.
param_names = [dimension.name for dimension in params_space]

In [None]:
# Bind the data and the parameter names so that the optimiser only has to supply
# the list of hyper-parameter values.
optimization_function = partial(
    optimize,
    X=X,
    y=y,
    param_names=param_names,
)

In [None]:
# Bayesian search over params_space. n_initial_points has to stay below n_calls:
# when both are equal every evaluation is a random draw and the Gaussian-process
# surrogate never proposes a point. random_state makes the sampled points repeatable.
result = gp_minimize(
    optimization_function,
    dimensions=params_space,
    n_calls=10,
    n_initial_points=5,
    random_state=42,
    verbose=True,
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 26.3960
Function value obtained: 0.1481
Current minimum: 0.1481
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 8.7168
Function value obtained: 0.1302
Current minimum: 0.1302
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 122.2368
Function value obtained: 0.1379
Current minimum: 0.1302
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 24.2717
Function value obtained: 0.1348
Current minimum: 0.1302
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 28.0683
Function value obtained: 0.1447
Current minimum: 0.1302
Iteration No: 6 started. Evalu

# **Train**

In [None]:
# Pair every tuned keyword name with the best value found by the search.
paramters = {name: value for name, value in zip(param_names, result.x)}

In [None]:
# Add the fixed (non-tuned) settings, then show the complete keyword dict.
paramters.update(verbose=False, task_type="GPU")
paramters

{'iterations': 179,
 'learning_rate': 0.10545197631229736,
 'depth': 8,
 'loss_function': 'Logloss',
 'grow_policy': 'Lossguide',
 'verbose': False,
 'task_type': 'GPU'}

In [None]:
# Hold out 20% of the training rows for validation; the seed fixes the split.
x_train, x_val, y_train, y_val = train_test_split(
    train.drop(columns="bank_account"),
    train["bank_account"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap both splits in CatBoost Pool objects (features plus labels).
train_pool = Pool(data=x_train, label=y_train)
val_pool = Pool(data=x_val, label=y_val)

In [None]:
# Final classifier configured with the keyword dict assembled in `paramters`.
model = CatBoostClassifier(**paramters)

In [None]:
# Fit on the training split only; val_pool is not passed here, so no evaluation
# set is used during training.
model.fit(train_pool)

<catboost.core.CatBoostClassifier at 0x7e976b588d60>

In [None]:
# Mean absolute error between the held-out labels and the predicted labels.
preds = model.predict(x_val)
mae(y_true=y_val, y_pred=preds)

0.1075451647183847

In [None]:
# Predict labels for the preprocessed test frame; this rebinds `preds`, replacing
# the validation predictions computed earlier.
preds = model.predict(test)

In [None]:
# Submission table: the composite ids built from the raw test frame next to the
# predicted labels.
df = pd.DataFrame(
    dict(unique_id=unique_ids, bank_account=preds)
)

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
df.to_csv("submission.csv",index=False)

In [None]:
# Display the submission frame as a final visual check.
df

Unnamed: 0,unique_id,bank_account
0,uniqueid_6056 x Kenya,1
1,uniqueid_6060 x Kenya,1
2,uniqueid_6065 x Kenya,0
3,uniqueid_6072 x Kenya,0
4,uniqueid_6073 x Kenya,0
...,...,...
10081,uniqueid_2998 x Uganda,0
10082,uniqueid_2999 x Uganda,0
10083,uniqueid_3000 x Uganda,0
10084,uniqueid_3001 x Uganda,0
