# [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk)
## Preprocessing - Bureau Balance table
### Platform: Python 3, colab.research.google.com

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import seaborn as sns
import plotly
import plotly.graph_objs as go
from google.colab import drive

In [2]:
# Initialise Plotly's offline notebook mode and set the seaborn style/context used for plots.
plotly.offline.init_notebook_mode(connected=True)
sns.set(style='whitegrid', context='notebook')

In [0]:
# run in each cell where Plotly is used
def configure_plotly_browser_state():
  """Emit the RequireJS configuration that Plotly needs into the cell output.

  Displays an HTML snippet that loads ``require.js`` from the notebook's
  ``/static`` path and maps the ``plotly`` module name to the Plotly CDN
  bundle. It must be called in every cell that renders a Plotly figure.
  """
  # Import from the public IPython.display module rather than relying on the
  # implicit `display` global and the internal IPython.core.display path.
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

## Load data

In [4]:
# Mount Google Drive under /content/gdrive; the CSV paths used below live there.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [5]:
# Location of the raw bureau balance table on the mounted drive.
bureau_balance_csv = "/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk/data/bureau_balance.csv"
# The first row of the file holds the column names.
bureau_balance_orig = pd.read_csv(bureau_balance_csv, header=0)
bureau_balance_orig.shape

(27299925, 3)

In [6]:
# Peek at the first five rows of the raw table.
bureau_balance_orig.head(5)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [7]:
# Row and column counts of the loaded table.
bureau_balance_orig.shape

(27299925, 3)

## Exploration

In [8]:
# Print each column name next to its dtype. Iterating over dtypes.items()
# avoids the integer-position lookup dtypes[i] on a label-indexed Series,
# which pandas has deprecated.
for column_name, column_dtype in bureau_balance_orig.dtypes.items():
    print(column_name, column_dtype)

SK_ID_BUREAU int64
MONTHS_BALANCE int64
STATUS object


In [10]:
# Summary statistics of the numerical columns, printed one column at a time.
describe_numerical = bureau_balance_orig.describe()
for column_name, column_stats in describe_numerical.items():
    print(column_name)
    print(column_stats)
    print()

SK_ID_BUREAU
count    2.729992e+07
mean     6.036297e+06
std      4.923489e+05
min      5.001709e+06
25%      5.730933e+06
50%      6.070821e+06
75%      6.431951e+06
max      6.842888e+06
Name: SK_ID_BUREAU, dtype: float64

MONTHS_BALANCE
count    2.729992e+07
mean    -3.074169e+01
std      2.386451e+01
min     -9.600000e+01
25%     -4.600000e+01
50%     -2.500000e+01
75%     -1.100000e+01
max      0.000000e+00
Name: MONTHS_BALANCE, dtype: float64



Plot numerical features

In [16]:
def plot_numerical_feature(feature, min_x=None, max_x=None, size=None, sample_size=500000, random_state=None):
    """Plot a histogram of one numerical column of ``bureau_balance_orig``.

    Args:
        feature: name of the column to plot.
        min_x: lower edge of the bins; defaults to the column minimum taken
            from ``describe_numerical``.
        max_x: upper edge of the bins; defaults to the column maximum taken
            from ``describe_numerical``.
        size: bin width; defaults to about 1/30 of the plotted range.
        sample_size: maximum number of rows drawn for the plot (limits memory).
        random_state: optional seed passed to ``Series.sample`` so the drawn
            subset can be reproduced.
    """
    configure_plotly_browser_state()
    # Compare against None so that an explicit bound of 0 is honoured.
    if min_x is None:
        min_x = describe_numerical.loc["min", feature]
    if max_x is None:
        max_x = describe_numerical.loc["max", feature]
    if not size:
        span = max_x - min_x
        # Whole-number width when the range allows it; otherwise fall back to a
        # fractional width (or 1 for an empty range) so the bin size is never 0.
        size = int(span / 30) or span / 30 or 1
    # Series.sample draws distinct rows, can reach every row and does not
    # depend on the index being 0..n-1 (randint's `high` bound is exclusive,
    # so `len - 1` could never select the last row).
    values = bureau_balance_orig[feature]
    sampled_values = values.sample(n=min(sample_size, len(values)), random_state=random_state)
    plotly.offline.iplot({
        "data": [go.Histogram(
            x=sampled_values,
            xbins={
                "start": min_x,
                "end": max_x,
                "size": size
            }
        )],
        "layout": go.Layout(title="{} histogram".format(feature), height=300)
    })
plot_numerical_feature("MONTHS_BALANCE")

Output hidden; open in https://colab.research.google.com to view.

Plot categorical features

In [17]:
def plot_categorical_feature(feature):
    """Draw a bar chart of how often each value of ``feature`` occurs in ``bureau_balance_orig``."""
    configure_plotly_browser_state()
    counts = bureau_balance_orig[feature].value_counts()
    # One bar per distinct value, bar height = number of occurrences.
    bars = go.Bar(x=counts.index, y=counts.values)
    layout = go.Layout(title="{} bar".format(feature), height=300)
    plotly.offline.iplot({"data": [bars], "layout": layout})
plot_categorical_feature("STATUS")

In [0]:
# Work on an independent deep copy so bureau_balance_orig itself is not modified.
bureau_balance = bureau_balance_orig.copy(deep=True) # retain original features for later

## Duplicates

In [19]:
# Show every row that is an exact repeat of an earlier row.
bureau_balance[bureau_balance.duplicated()]

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS


## Missing data

In [20]:
def get_missing(df):
    """Return the null count of every column that has at least one null.

    The result is a Series indexed by column name, ordered from the most to
    the fewest missing values; columns without nulls are left out.
    """
    null_counts = df.isna().sum().sort_values(ascending=False)
    has_nulls = null_counts > 0
    return null_counts[has_nulls]
# Compute the per-column null counts once, print how many columns are affected, then display them.
missing_per_column = get_missing(bureau_balance)
print("# of columns with missing values", len(missing_per_column), sep="\n")
missing_per_column

# of columns with missing values
0


Series([], dtype: int64)

## Outliers

None are present. See descriptive statistics above.

## Correlated features

In [23]:
def show_feature_correlation(df):
    """Return the absolute pairwise correlations between the columns of ``df``.

    Only numeric and boolean columns take part; other columns (e.g. strings)
    are ignored. Each unordered pair of columns appears exactly once.

    Args:
        df: DataFrame whose columns are to be correlated.

    Returns:
        DataFrame with columns ``f1``, ``f2`` and ``corr`` (the absolute
        correlation of the pair), sorted by ``corr`` in descending order.
        It is empty when fewer than two columns can be correlated.
    """
    # Select the usable columns explicitly: DataFrame.corr() raises on object
    # columns in pandas >= 2.0 instead of silently dropping them.
    df_corr = df.select_dtypes(include=["number", "bool"]).corr()
    features = list(df_corr.columns)
    # Walk the upper triangle so every pair is visited once, and build the
    # frame in a single step instead of appending (and re-scanning) per pair.
    pairs = [
        (first, second, abs(df_corr.loc[first, second]))
        for position, first in enumerate(features)
        for second in features[position + 1:]
    ]
    high_correlations = pd.DataFrame(pairs, columns=["f1", "f2", "corr"])
    return high_correlations.sort_values(by="corr", ascending=False)
# Compute the pairwise correlations, then print the number of pairs and the ten strongest ones.
high_correlations = show_feature_correlation(bureau_balance)
print(len(high_correlations))
print(high_correlations.head(10))

1
             f1              f2       corr
0  SK_ID_BUREAU  MONTHS_BALANCE  0.0118726


In [26]:
feature1 = "MONTHS_BALANCE"
feature2 = "STATUS"
configure_plotly_browser_state()
# Plot only a subset to save memory. DataFrame.sample draws distinct rows, can
# reach every row (np.random.randint's `high` bound is exclusive, so `len - 1`
# never selected the last row) and does not need a 0..n-1 index.
scatter_sample = bureau_balance.sample(n=min(50000, len(bureau_balance)))
random_idx = scatter_sample.index  # index labels of the sampled rows
plotly.offline.iplot({
    "data": [go.Scatter(
        x=scatter_sample[feature1],
        y=scatter_sample[feature2],
        mode = "markers",
        marker = {
            "size": 2
        }
    )],
    "layout": go.Layout(title="{} - {} scatter".format(feature1, feature2), height=300)
})

## Skewed data

None. The only numerical variable, `MONTHS_BALANCE`, appears well balanced.

## Imbalanced classes

Imbalanced features that may benefit from additional treatment:

In [28]:
# Features flagged as imbalanced; plot the value counts of the first one.
imbalanced_classes = ["STATUS"]
plot_categorical_feature(imbalanced_classes[0])

## Save datasets

In [0]:
# Write the working table to bureau_balance_clean.csv on the mounted drive.
bureau_balance.to_csv("/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk/data/bureau_balance_clean.csv")

In [30]:
# Read the saved file back (first column as the index) and check that the shape survived the round trip.
bureau_balance_check = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk/data/bureau_balance_clean.csv", header=0, index_col=0)
assert bureau_balance.shape == bureau_balance_check.shape


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison

