# Build Model

### Import

In [1]:
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
def wrangle(file_path):
    df = pd.read_csv(file_path)
    # drop columns which are not useful in the model
    df = df.drop(columns=['year', 'uniqueid'])
    
    return df

In [3]:
# data = wrangle("Train.csv")
data = wrangle("/content/drive/MyDrive/Financial Inclusion in Africa/Train.csv") ## when file is in drive
print(data.shape)
data.head()

(23524, 11)


Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


## Encoding Categorical Features

In [4]:
# Identify columns with categorical values for encoding
categorical_columns = [x for x in data.columns if type(data[x][1]) == str]
print(categorical_columns)
print(f"Our dataframe has {len(categorical_columns)} categorical columns")

# Instantiate label encoder
label_encoder = LabelEncoder()

# Transfor data
for column in data.columns:
  data[column] = label_encoder.fit_transform(data[column])
data.head()

['country', 'bank_account', 'location_type', 'cellphone_access', 'gender_of_respondent', 'relationship_with_head', 'marital_status', 'education_level', 'job_type']
Our dataframe has 9 categorical columns


Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,0,1,0,1,2,8,0,5,2,3,9
1,0,0,0,0,4,54,0,1,4,0,4
2,0,1,1,1,4,10,1,3,3,5,9
3,0,0,0,1,4,18,0,1,2,2,3
4,0,0,1,0,7,10,1,0,3,2,5


## Split
Feature matrix and Target Vector


In [5]:
target = "bank_account"
X_train = data.drop(columns=target)
y_train = data[target]
print(X_train.shape)
print(y_train.shape)

(23524, 10)
(23524,)


## Resample Data
We will use **Oversampling** since we had an imbalanced class of target vector

In [6]:
# Instantiate RandomOverSampler
over_sampler = RandomOverSampler(random_state=42)

X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)
print(X_train_over.shape)

(40424, 10)


## Build Model


### Baseline

In [7]:
baseline_accuracy = y_train.value_counts(normalize=True).max()
baseline_accuracy

0.859207617752083

### Iterate

In [8]:
# Instantiate "classifier" as "clf"
clf = RandomForestClassifier(random_state=42)

# Perform cross-validation with your classifier, using the over-sampled training data.
# We want five folds, so set cv to 5.
# We also want to speed up training, to set n_jobs to -1
cv_acc_scores = cross_val_score(clf, X_train_over, y_train_over, cv=5, n_jobs=-1)
print(cv_acc_scores)

# The above step is not important

[0.67495362 0.90760668 0.90748299 0.92962276 0.72748639]


In [9]:
# Instantiate "classifier" as "clf" (could be a pipeline)
clf = RandomForestClassifier(random_state=42)

# Define a hyperparameter grid for model tuning
params = {
    "randomforestclassifier__n_estimators": range(25, 100, 25),
    "randomforestclassifier__max_depth": range(10, 50, 10)
}
params

{'randomforestclassifier__n_estimators': range(25, 100, 25),
 'randomforestclassifier__max_depth': range(10, 50, 10)}

Perform a hyperparameter grid search

In [14]:
model = GridSearchCV(
    clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)
model

In [15]:
# Train Model
model.fit(X_train_over, y_train_over)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


ValueError: ignored