In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
train_data=pd.read_csv("datasets/Train_v2.csv")
test_data=pd.read_csv("datasets/Test_v2.csv")

In [None]:
train_data.head()

## 2. Data Wrangling

1. Filling missing values
2. Check for duplicates

In [None]:
test_data.head()

In [None]:
train_data.info()

In [None]:
train_data.isnull().sum()

In [None]:
test_data.isnull().sum()

## Exploratory Data Analysis 
1. Data Profiling
2. Data Relationships
3. Feature Engineering

In [None]:
all_data = pd.concat([train_data, test_data])

In [None]:
all_data

In [None]:
sns.countplot(x="age_of_respondent", data=train_data)

In [None]:
features = ["household_size", "age_of_respondent", "location_type", "cellphone_access"]
fig=plt.subplots(figsize=(12, 9))

for index,feature in enumerate(features):
    plt.subplot(2,2,index+1)
    plt.subplots_adjust(hspace=1.0)
    sns.histplot(x=train_data[feature])
  

In [None]:
features = ["bank_account", "education_level", "location_type", "cellphone_access"]
fig=plt.subplots(figsize=(12, 9))

for index,feature in enumerate(features):
    plt.subplot(2,2,index+1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(data=train_data, x=feature, hue="country")

# Pre-processing and Training Data Development

* Create Dummy or Indicator features for Categorical variables

*   Standardize the magnitude of numeric features

*   Split into testing and training of data sets



In [None]:
all_data.drop(columns="bank_account", inplace=True)

In [46]:
country_dummies_train = pd.get_dummies(train_data["country"])

In [47]:
train_data = train_data.join(country_dummies_train)

In [48]:
country_dummies_test = pd.get_dummies(test_data["country"])
test_data = test_data.join(country_dummies_test)

In [None]:
all_data.info()

In [None]:
all_data.head()

In [49]:
to_train_data = train_data[["Kenya", "Rwanda", "Tanzania", "Uganda", "age_of_respondent", "household_size"]]

In [50]:
to_test_data = test_data[["Kenya", "Rwanda", "Tanzania", "Uganda", "age_of_respondent", "household_size"]]

In [70]:
to_test_data

Unnamed: 0,Kenya,Rwanda,Tanzania,Uganda,age_of_respondent,household_size
0,1,0,0,0,30,3
1,1,0,0,0,51,7
2,1,0,0,0,77,3
3,1,0,0,0,39,6
4,1,0,0,0,16,3
...,...,...,...,...,...,...
10081,0,0,0,1,62,2
10082,0,0,0,1,42,8
10083,0,0,0,1,39,1
10084,0,0,0,1,28,6


In [51]:
to_train_data.shape

(23524, 6)

In [54]:
X = to_train_data.values

In [64]:
from sklearn.preprocessing import LabelEncoder

income_encoder=LabelEncoder()
#invoking fit_transform method on object
train_data['bank_account'] = income_encoder.fit_transform(train_data['bank_account'])

train_data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,Kenya,Rwanda,Tanzania,Uganda
0,Kenya,2018,uniqueid_1,1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed,1,0,0,0
1,Kenya,2018,uniqueid_2,0,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent,1,0,0,0
2,Kenya,2018,uniqueid_3,1,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed,1,0,0,0
3,Kenya,2018,uniqueid_4,0,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private,1,0,0,0
4,Kenya,2018,uniqueid_5,0,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed,1,0,0,0


In [65]:
y = train_data['bank_account']

In [53]:
to_test_data.shape

(10086, 6)

In [66]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.15)

## Modelling 



*   Fit Models with Training Data set (Hint: RandomForest regressor, SupportVector, XGBoostRegressor)

*   Review Model Outcomes – Iterate over additional models as needed

*   Identify the Final Model



In [67]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB() # Instantiate the object
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [68]:
y_pred = classifier.predict(X_test)

In [69]:
# evaluating performance : Accuracy, Precision, Recall
from sklearn import metrics

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
print("Recall: ", metrics.recall_score(y_test, y_pred))
print("F1 Score: ", metrics.f1_score(y_test, y_pred))

Accuracy:  0.8447152167752905
Precision:  0.3178294573643411
Recall:  0.08183632734530938
F1 Score:  0.13015873015873014


In [71]:
y_submission = classifier.predict(to_test_data)

In [73]:
y_submission

array([0, 0, 0, ..., 0, 0, 0])

In [76]:
submission_file = pd.read_csv("datasets/SubmissionFile.csv", index_col=None)
submission_file.head()

Unnamed: 0,uniqueid,bank_account
0,uniqueid_7867 x Kenya,1.0
1,uniqueid_6722 x Kenya,0.0
2,uniqueid_6714 x Kenya,1.0
3,uniqueid_8103 x Kenya,1.0
4,uniqueid_8657 x Kenya,1.0


In [77]:
submission_file["bank_account"] = y_submission

In [81]:
submission_file.to_csv("submission_1.csv", index= False)