In [None]:
# This notebook is a solution to the Zindi challenge at https://zindi.africa/competitions/financial-inclusion-in-africa
# Refer to the challenge page for more details and the dataset.

In [5]:
# Import deps

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder


In [6]:
# Load the training split

# Columns the notebook treats as unnecessary are removed right after reading
columns_to_drop = ['uniqueid']
train_df = pd.read_csv('../data/financial-inclusion-in-africa/Train.csv').drop(columns=columns_to_drop)

# Preview the last rows of the loaded frame
train_df.tail()

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
23519,Uganda,2018,No,Rural,Yes,4,48,Female,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,Uganda,2018,No,Rural,Yes,2,27,Female,Head of Household,Single/Never Married,Secondary education,Other Income
23521,Uganda,2018,No,Rural,Yes,5,27,Female,Parent,Widowed,Primary education,Other Income
23522,Uganda,2018,No,Urban,Yes,7,30,Female,Parent,Divorced/Seperated,Secondary education,Self employed
23523,Uganda,2018,No,Rural,Yes,10,20,Male,Child,Single/Never Married,Secondary education,No Income


In [30]:
# Preprocessing of features

def preprocess(data, reference=None):
    """Frequency-encode the categorical survey columns.

    Every value in a categorical column is replaced by the number of rows
    in which that value occurs. The input frame is left untouched; an
    encoded copy is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical columns listed in the body.
    reference : pandas.DataFrame, optional
        Frame on which the frequencies are counted. Defaults to ``data``
        itself. Pass the raw training frame when encoding other data so
        that both frames share one encoding.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose categorical columns hold frequencies.
        Values that never occur in ``reference`` are encoded as 0;
        missing values stay missing.

    Raises
    ------
    KeyError
        If a categorical column is absent from ``data`` or ``reference``.
    """
    categorical_columns = (
        'country',
        'location_type',
        'cellphone_access',
        'gender_of_respondent',
        'relationship_with_head',
        'marital_status',
        'education_level',
        'job_type',
    )

    source = data if reference is None else reference

    # Work on a copy so the caller's frame is never modified in place
    encoded = data.copy()

    for column in categorical_columns:
        frequencies = source[column].value_counts().to_dict()
        mapped = data[column].map(frequencies)

        # A non-missing value absent from the reference occurred zero times there
        unseen = data[column].notna() & mapped.isna()
        if unseen.any():
            mapped = mapped.mask(unseen, 0)

        encoded[column] = mapped

    return encoded

# Frequency-encode the categorical feature columns
train_df = preprocess(train_df)

# Label-encode the bank_account target column into integer codes
label_encoder = LabelEncoder()
train_df['bank_account'] = label_encoder.fit_transform(train_df['bank_account'])

# Preview the encoded frame. Do not call preprocess() again here: running it on
# already-encoded columns re-counts the counts and can merge distinct categories
# that happen to share the same frequency.
train_df.head()
    

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,6068,2018,1,14343,17454,3,24,13877,6520,10749,4223,6437
1,6068,2018,0,14343,6070,5,70,13877,12831,2708,4515,247
2,6068,2018,1,9181,17454,5,26,9647,668,7983,803,6437
3,6068,2018,0,14343,17454,5,34,13877,12831,10749,12791,1055
4,6068,2018,0,9181,6070,8,26,9647,2229,7983,12791,5597


In [31]:
# Split the encoded training frame into model inputs and target

# Target: the encoded bank_account column
labels = train_df["bank_account"]

# Inputs: every column except the target
columns_to_drop = ['bank_account']
features = train_df.drop(columns=columns_to_drop)



In [32]:
# Build the (not yet fitted) logistic regression classifier

model = LogisticRegression(
    max_iter=1000,  # explicit cap on solver iterations
)

In [33]:
# Fit the classifier on the encoded training features and their labels
model.fit(X=features, y=labels)

In [36]:
# Load the test split; test_df keeps the raw columns for later use
test_df = pd.read_csv('../data/financial-inclusion-in-africa/Test.csv')

# Remove the columns the notebook treats as unnecessary, then frequency-encode the rest
# NOTE(review): preprocess() counts frequencies on the frame it receives, so these
# encodings come from the test data and differ from the values the model was fitted
# on (compare the country column in the train and test previews) - confirm intended.
columns_to_drop = ['uniqueid']
testing_df = preprocess(test_df.drop(columns=columns_to_drop))

testing_df.head()


Unnamed: 0,country,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,2601,2018,3897,7559,3,30,4239,5487,4663,1902,166
1,2601,2018,3897,7559,7,51,4239,5487,4663,359,444
2,2601,2018,6189,2527,3,77,5847,465,4663,1836,1107
3,2601,2018,6189,2527,6,39,5847,5487,4663,5479,1107
4,2601,2018,3897,2527,3,16,4239,962,3447,1902,1107


In [43]:

# Predict the bank_account class for every row of the encoded test frame
y_pred = model.predict(X=testing_df)

print(y_pred)

[0 0 0 ... 0 0 0]


In [23]:
# Inspect the sample submission to see the expected output layout

submission_csv = pd.read_csv(
    '../data/financial-inclusion-in-africa/SampleSubmission.csv'
)

# Show the last five rows
submission_csv.tail(5)

Unnamed: 0,unique_id,bank_account
33605,uniqueid_2998 x Uganda,0
33606,uniqueid_2999 x Uganda,0
33607,uniqueid_3000 x Uganda,0
33608,uniqueid_3001 x Uganda,0
33609,uniqueid_3002 x Uganda,0


In [64]:
# Preparing a submission sample

# Build the frame once from the whole prediction array. Looping over y_pred and
# passing a single scalar per iteration would rebuild the frame each time and
# leave every row holding only the last prediction.
submission_df = pd.DataFrame({
    "unique_id": test_df["uniqueid"] + " x " + test_df["country"],  # same "<uniqueid> x <country>" form as the sample submission
    "bank_account": y_pred,  # one prediction per test row, in row order
})

submission_df.to_csv('../data/financial-inclusion-in-africa/submission-1.csv', encoding="utf-8", index=False)

print(submission_df.count())

unique_id       10086
bank_account    10086
dtype: int64
