# 1. Import your data and perform the basic data exploration phase.

In [1]:
import pandas as pd

# Survey fields used in this notebook.
# "bank_account" is the target variable; every other entry is a predictor.
columns_to_keep = [
    "country",
    "location_type",
    "cellphone_access",
    "household_size",
    "age_of_respondent",
    "gender_of_respondent",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
    "bank_account",
]

# Read only the selected columns from the CSV (path is relative to the
# notebook's working directory).
data = pd.read_csv(
    "Financial_inclusion_dataset.csv",
    usecols=columns_to_keep,
)

# Last expression of the cell: render the loaded frame
data

Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,No,Rural,Yes,4,48,Female,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,Uganda,No,Rural,Yes,2,27,Female,Head of Household,Single/Never Married,Secondary education,Other Income
23521,Uganda,No,Rural,Yes,5,27,Female,Parent,Widowed,Primary education,Other Income
23522,Uganda,No,Urban,Yes,7,30,Female,Parent,Divorced/Seperated,Secondary education,Self employed


### - Display General Information about the Dataset.

In [2]:
# Print a summary of the frame: index range, each column's non-null count
# and dtype, and the approximate memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   bank_account            23524 non-null  object
 2   location_type           23524 non-null  object
 3   cellphone_access        23524 non-null  object
 4   household_size          23524 non-null  int64 
 5   age_of_respondent       23524 non-null  int64 
 6   gender_of_respondent    23524 non-null  object
 7   relationship_with_head  23524 non-null  object
 8   marital_status          23524 non-null  object
 9   education_level         23524 non-null  object
 10  job_type                23524 non-null  object
dtypes: int64(2), object(9)
memory usage: 2.0+ MB


### - Handle Missing and Corrupted Values.
- There are no missing values to handle.

In [3]:
# Count the missing entries in each column (isna is the same check as isnull)
data.isna().sum()

country                   0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

### - Remove Duplicates.

In [4]:
# Remove rows that are exact repeats of an earlier row, keeping the first
# occurrence of each (this drops rows; it does not merely check for them).
# NOTE(review): only a subset of the CSV columns is loaded, so rows that match
# on these columns may still be distinct respondents - confirm this is intended.
data = data.drop_duplicates(keep="first")

### - Handle Outliers.
- There are no outliers in the dataset.

### - Encode Categorical Features.

In [5]:
from sklearn.preprocessing import LabelEncoder

# Text-valued predictor columns to convert to integer codes.
# The target column "bank_account" is intentionally left as Yes/No strings.
categorical_columns = [
    "country", "location_type", "cellphone_access", "gender_of_respondent",
    "relationship_with_head", "marital_status", "education_level", "job_type",
]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# retain only the last column's mapping, which makes it impossible to encode
# new records the same way when the saved model is used for prediction.
label_encoders = {}
encoded_columns = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    encoded_columns[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# Replace the columns wholesale instead of writing through `.loc[:, col]`:
# `.loc` assignment writes into the existing object-dtype column, so the codes
# would stay stored as Python objects. `assign` returns a new frame whose
# encoded columns carry a proper integer dtype (and avoids copy warnings).
data = data.assign(**encoded_columns)

In [6]:
# Display the frame again after the encoding step
data

Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,0,Yes,0,1,3,24,0,5,2,3,9
1,0,No,0,0,5,70,0,1,4,0,4
2,0,Yes,1,1,5,26,1,3,3,5,9
3,0,No,0,1,5,34,0,1,2,2,3
4,0,No,1,0,8,26,1,0,3,2,5
...,...,...,...,...,...,...,...,...,...,...,...
23519,3,No,0,1,4,48,0,1,0,0,7
23520,3,No,0,1,2,27,0,1,3,3,7
23521,3,No,0,1,5,27,0,4,4,2,7
23522,3,No,1,1,7,30,0,4,0,3,9


# 2. Train and Test a Machine Learning Classifier.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate the predictors from the target column
features = data.drop("bank_account", axis=1)
label = data["bank_account"]

# Fixed seed so the split, the fitted forest and the reported accuracy are
# the same on every Restart & Run All
RANDOM_STATE = 42

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# Yes/No proportion the same in the training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    train_size=0.8,
    stratify=label,
    random_state=RANDOM_STATE,
)

# Train the model
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Evaluate on the held-out rows
y_pred = model.predict(x_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 0.8277035873265253


In [8]:
import pickle

# Serialize the fitted classifier to "model.pkl" in the working directory.
# The context manager closes the handle even if pickling fails.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)