**Center for Microfinance Dataset**

A microfinance dataset from the Centre for Micro Finance (CMF) at the Institute for Financial Management Research (IFMR) in Chennai, India.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span><ul class="toc-item"><li><span><a href="#Exploration-Data" data-toc-modified-id="Exploration-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Exploration Data</a></span></li><li><span><a href="#Modeling-Data" data-toc-modified-id="Modeling-Data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Modeling Data</a></span></li></ul></li><li><span><a href="#Data-Exploration" data-toc-modified-id="Data-Exploration-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Exploration</a></span><ul class="toc-item"><li><span><a href="#Full-Data-Visualization" data-toc-modified-id="Full-Data-Visualization-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Full Data Visualization</a></span></li><li><span><a href="#Modeling-Data-Preparation" data-toc-modified-id="Modeling-Data-Preparation-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Modeling Data Preparation</a></span></li></ul></li><li><span><a href="#Standard-Models" data-toc-modified-id="Standard-Models-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Standard Models</a></span><ul class="toc-item"><li><span><a href="#Two-Model" data-toc-modified-id="Two-Model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Two Model</a></span></li><li><span><a href="#Interaction-Term" data-toc-modified-id="Interaction-Term-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Interaction Term</a></span></li><li><span><a href="#Class-Transformations" data-toc-modified-id="Class-Transformations-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Class Transformations</a></span></li></ul></li><li><span><a href="#Generalized-Random-Forest" data-toc-modified-id="Generalized-Random-Forest-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Generalized Random Forest</a></span></li><li><span><a href="#Evaluation" 
data-toc-modified-id="Evaluation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#Iterations" data-toc-modified-id="Iterations-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Iterations</a></span></li><li><span><a href="#Visual" data-toc-modified-id="Visual-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Visual</a></span></li><li><span><a href="#Variance" data-toc-modified-id="Variance-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Variance</a></span></li><li><span><a href="#Econometric" data-toc-modified-id="Econometric-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Econometric</a></span></li></ul></li><li><span><a href="#Summary" data-toc-modified-id="Summary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Summary</a></span></li></ul></div>

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from causeinfer.data import cmf_micro
from causeinfer.utils import plot_unit_distributions, train_test_split, over_sample
from causeinfer.standard_algorithms import TwoModel, InteractionTerm
from causeinfer.standard_algorithms import BinaryTransformation
from causeinfer.standard_algorithms import QuaternaryTransformation
from causeinfer.evaluation import qini_score, auuc_score
from causeinfer.evaluation import plot_cum_effect, plot_cum_gain, plot_qini
from causeinfer.evaluation import plot_batch_responses, signal_to_noise
from causeinfer.evaluation import iterate_model, eval_table

# Notebook display configuration: cap the number of printed rows at 16 and
# lift the cap on printed columns.
pd.set_option("display.max_rows", 16)
pd.set_option('display.max_columns', None)
# `display` and `HTML` are part of the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS that sets `.container` elements to 99% width.
display(HTML("<style>.container { width:99% !important; }</style>"))

In [None]:
# Show the kernel's current working directory.
# NOTE(review): the relative "datasets/cmf_micro" path passed to the loaders
# below is presumably resolved against this directory - confirm.
os.getcwd()

In [None]:
def head_shape(df, n=5):
    """Display the first ``n`` rows of ``df`` and return its shape.

    Defined locally because ``helpers_py`` is never imported or defined in
    this notebook, so ``helpers_py.head_shape`` raised ``NameError`` on a
    fresh kernel. The behavior mirrors the ``display(df.head())`` followed by
    ``df.shape`` pattern used in the cells of this notebook.
    NOTE(review): the original helper's exact signature is not visible here;
    confirm this matches what was intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.
    n : int, default 5
        Number of leading rows to display.

    Returns
    -------
    tuple
        ``df.shape``.
    """
    display(df.head(n))
    return df.shape

# Load Data

In [None]:
# Deprecated - see: https://github.com/andrewtavis/causeinfer/tree/master/causeinfer/data/datasets
# cmf_micro.download_cmf_micro() 

## Exploration Data

In [None]:
# Load the full dataset with covariate formatting and normalization both
# switched off; this version is used for exploration only.
# The result is bound to `data_raw` (not `data_raw_`) so that the DataFrame
# construction below reads the same name and does not raise NameError.
data_raw = cmf_micro.load_cmf_micro(
    user_file_path="datasets/cmf_micro",
    format_covariates=False,
    normalize=False,
)

# Wrap the full data in a DataFrame labeled with the loader's column names.
df_full = pd.DataFrame(
    data_raw["dataset_full"],
    columns=data_raw["dataset_full_names"],
)

display(df_full.head())
df_full.shape

## Modeling Data

In [None]:
# Load the modeling version of the dataset: covariates formatted and
# normalization switched on.
data_cmf_micro = cmf_micro.load_cmf_micro(
    user_file_path="datasets/cmf_micro",
    format_covariates=True,
    normalize=True,
)

# Full modeling data as a DataFrame, labeled with the loader's column names.
df = pd.DataFrame(
    data=data_cmf_micro["dataset_full"],
    columns=data_cmf_micro["dataset_full_names"],
)

In [None]:
# Pull the covariates (X), the response (y) and the treatment assignment (w)
# out of the loader's dictionary in one pass.
# The response key can be "response_biz_index" or "response_women_emp".
X, y, w = (
    data_cmf_micro[key]
    for key in ("features", "response_biz_index", "treatment")
)

# Data Exploration

In [None]:
# Select seaborn's "whitegrid" style for the plots in this notebook.
sns.set(style="whitegrid")

## Full Data Visualization

In [None]:
# Two side-by-side panels sharing a y-axis: the overall distribution on the
# left and the same variable split by treatment on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(20, 5))

# NOTE(review): 'variable', 'x', 'y' and 'title' look like template
# placeholders - replace them with a real column of `df_full` and meaningful
# labels before relying on these figures.
# The stray trailing comma after this call was removed; it turned the
# statement into a discarded one-element tuple.
plot_unit_distributions(
    df=df_full, variable='variable', treatment=None,
    plot_x_label='x', plot_y_label='y', plot_title='title',
    bins=None, figsize=None, fontsize=25, axis=ax1,
)

plot_unit_distributions(
    df=df_full, variable='variable', treatment='treatment',
    plot_x_label='x', plot_y_label='y', plot_title='title',
    bins=None, figsize=None, fontsize=25, axis=ax2,
)

## Modeling Data Preparation

In [None]:
# Mean of each response per treatment group, with an "All" margin row.
# The aggregation is given as the string "mean" rather than `np.mean`:
# recent pandas versions emit a FutureWarning when a NumPy callable is passed
# and recommend the string to keep the current behavior. The resulting column
# level is labeled "mean" either way.
df.pivot_table(
    values=['response_biz_index', 'response_women_emp'],
    index='treatment',
    aggfunc=["mean"],
    margins=True,
)

In [None]:
# Positions of the units whose treatment flag is 0 (control) or 1 (treatment)
control_indexes = [pos for pos, flag in enumerate(w) if flag == 0]
treatment_indexes = [pos for pos, flag in enumerate(w) if flag == 1]

# Group sizes, one per line: control first, then treatment
print(len(control_indexes), len(treatment_indexes), sep="\n")

In [None]:
# Select the control rows and the treatment rows from each of X, y and w
X_control, y_control, w_control = (
    arr[control_indexes] for arr in (X, y, w)
)
X_treatment, y_treatment, w_treatment = (
    arr[treatment_indexes] for arr in (X, y, w)
)

In [None]:
# Over-sample the control units, passing the number of treatment units as
# the size of the second sample
X_os, y_os, w_os = over_sample(
    X_1=X_control,
    y_1=y_control,
    w_1=w_control,
    sample_2_size=len(X_treatment),
    shuffle=True,
)

In [None]:
# Stack the over-sampled control rows on top of the treatment rows
X_split = np.concatenate((X_os, X_treatment), axis=0)
y_split = np.concatenate((y_os, y_treatment), axis=0)
w_split = np.concatenate((w_os, w_treatment), axis=0)

# The first dimension of all three shapes should agree
tuple(arr.shape for arr in (X_split, y_split, w_split))

In [None]:
# 70/30 train/test split with a fixed seed, asking the splitter to maintain
# proportions
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X_split,
    y_split,
    w_split,
    percent_train=0.7,
    random_state=42,
    maintain_proportions=True,
)

In [None]:
# Shapes of every train/test piece, in the order the splitter returned them
tuple(
    piece.shape
    for piece in (X_train, X_test, y_train, y_test, w_train, w_test)
)

In [None]:
# One (value, count) row per distinct treatment flag: train first, then test
print(
    *(
        np.array(np.unique(flags, return_counts=True)).T
        for flags in (w_train, w_test)
    ),
    sep="\n",
)

In [None]:
# Signal-to-noise ratio computed from the responses and treatment flags of
# the combined (pre-split) sample
sn_ratio = signal_to_noise(
    y=y_split,
    w=w_split,
)
sn_ratio

# Standard Models

## Two Model

In [None]:
# Two-model approach: one estimator for the treatment group and one for the
# control group.
# This notebook describes the response as continuous, so regressors are used;
# scikit-learn classifiers raise on continuous targets.
# A fixed `random_state` keeps the forests reproducible across re-runs,
# matching the seed used for the train/test split.
tm = TwoModel(
    treatment_model=RandomForestRegressor(random_state=42),
    control_model=RandomForestRegressor(random_state=42),
)
tm.fit(X=X_train, y=y_train, w=w_train)

In [None]:
# Two-model predictions for the held-out covariates.
# NOTE(review): downstream code indexes each prediction as [0] and [1], so
# each row presumably holds a (treatment, control) pair - confirm.
tm_preds = tm.predict(X=X_test)
tm_preds

## Interaction Term

In [None]:
# Interaction-term approach fit with a single base estimator.
# This notebook describes the response as continuous, so a regressor is used;
# scikit-learn classifiers raise on continuous targets.
# A fixed `random_state` keeps the forest reproducible across re-runs,
# matching the seed used for the train/test split.
it = InteractionTerm(model=RandomForestRegressor(random_state=42))
it.fit(X=X_train, y=y_train, w=w_train)

In [None]:
# Interaction-term predictions for the held-out covariates.
# NOTE(review): downstream code indexes each prediction as [0] and [1], so
# each row presumably holds a (treatment, control) pair - confirm.
it_preds = it.predict(X=X_test)
it_preds

## Class Transformations

Class transformation approaches aren't available for continuous response values yet. Work will be done to modify those algorithms to make them applicable in these settings.

# Generalized Random Forest

# Evaluation

## Iterations

In [None]:
# Per-unit effect estimate: first predicted value minus the second
tm_effects = [pair[0] - pair[1] for pair in tm_preds]
it_effects = [pair[0] - pair[1] for pair in it_preds]

In [None]:
# Test responses and treatment flags alongside each model's effect estimates
eval_dict = dict(
    y_test=y_test,
    w_test=w_test,
    two_model=tm_effects,
    interaction_term=it_effects,
)

In [None]:
# Evaluation frame with one column per entry of `eval_dict`, in the same order
df_eval = pd.DataFrame(data=eval_dict, columns=list(eval_dict))

display(df_eval.head())
df_eval.shape

In [None]:
# Every key of `eval_dict` except the response and treatment columns
models = [name for name in eval_dict if name not in ('y_test', 'w_test')]

## Visual

In [None]:
# Left panel: cumulative effect; right panel: batch responses
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False, figsize=(20, 5))

plot_cum_effect(
    df=df_eval,
    n=100,
    models=models,
    percent_of_pop=False,
    outcome_col='y_test',
    treatment_col='w_test',
    random_seed=42,
    figsize=(10, 5),
    fontsize=20,
    axis=ax1,
    legend_metrics=False,
)

plot_batch_responses(
    df=df_eval,
    n=10,
    models=models,
    outcome_col='y_test',
    treatment_col='w_test',
    normalize=False,
    figsize=None,
    fontsize=15,
    axis=ax2,
)

In [None]:
# Left panel: cumulative gain; right panel: Qini plot. Both are normalized,
# plotted against the percent of the population, and show metrics in the legend.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(20, 5))

plot_cum_gain(
    df=df_eval,
    n=100,
    models=models,
    percent_of_pop=True,
    outcome_col='y_test',
    treatment_col='w_test',
    normalize=True,
    random_seed=42,
    figsize=None,
    fontsize=20,
    axis=ax1,
    legend_metrics=True,
)

plot_qini(
    df=df_eval,
    n=100,
    models=models,
    percent_of_pop=True,
    outcome_col='y_test',
    treatment_col='w_test',
    normalize=True,
    random_seed=42,
    figsize=None,
    fontsize=20,
    axis=ax2,
    legend_metrics=True,
)

## Variance

## Econometric

# Summary