#### Imports

In [1]:
import numpy as np
import pandas as pd
# Display configuration: allow up to 200 rows / 100 columns before pandas
# truncates a frame, and render every float with three decimal places.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.3f}'.format)
from math import floor
from sklearn.cluster import KMeans

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

#### Read Data

In [6]:
# Load the training CSV into `df`.
# Forward slashes keep the relative path portable: the original backslash
# form only resolves on Windows and depends on "\d" / "\c" being invalid
# escape sequences, which recent Python versions warn about.
# low_memory=False makes pandas read the file in one pass rather than in
# chunks, so column dtypes are inferred from the complete column.
df = pd.read_csv("../data/cs-training.csv", low_memory=False)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# drop(..., errors='ignore') keeps this cell safe to re-run: the original
# `del df[...]` raises KeyError on a second execution because the column is
# already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

#### Sneak peek at data

In [9]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766,45,2,0.803,9120.0,13,0,6,0,2.0
1,0,0.957,40,0,0.122,2600.0,4,0,0,0,1.0
2,0,0.658,38,1,0.085,3042.0,2,1,0,0,0.0
3,0,0.234,30,0,0.036,3300.0,5,0,0,0,0.0
4,0,0.907,49,1,0.025,63588.0,7,0,1,0,0.0


#### Target Variable

In [12]:
# Count how many rows fall into each value of the target column.
df['SeriousDlqin2yrs'].value_counts()

0    139974
1     10026
Name: SeriousDlqin2yrs, dtype: int64

#### Basic statistics of the data

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.067,6.048,52.295,0.421,353.005,6670.221,8.453,0.266,1.018,0.24,0.757
std,0.25,249.755,14.772,4.193,2037.819,14384.674,5.146,4.169,1.13,4.155,1.115
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.03,41.0,0.0,0.175,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154,52.0,0.0,0.367,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559,63.0,0.0,0.868,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


#### Columns containing nulls

In [14]:
# Number of missing values in each column.
df.isna().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [17]:
# Keep only the rows whose MonthlyIncome value is missing.
income_is_missing = df['MonthlyIncome'].isna()
with_null_income = df.loc[income_is_missing]

In [21]:
# Preview the first five rows of the missing-income subset.
with_null_income.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
6,0,0.306,57,0,5710.0,,8,0,3,0,0.0
8,0,0.117,27,0,46.0,,2,0,0,0,
16,0,0.061,78,0,2058.0,,10,0,2,0,0.0
32,0,0.083,62,0,977.0,,6,0,1,0,0.0
41,0,0.073,81,0,75.0,,7,0,0,0,0.0


In [19]:
# Distribution of the target column within the missing-income subset.
with_null_income['SeriousDlqin2yrs'].value_counts()

0    28062
1     1669
Name: SeriousDlqin2yrs, dtype: int64

In [22]:
# Summary statistics for the rows where MonthlyIncome is missing.
with_null_income.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,29731.0,29731.0,29731.0,29731.0,29731.0,0.0,29731.0,29731.0,29731.0,29731.0,25807.0
mean,0.056,6.649,56.362,0.58,1673.397,,7.216,0.485,0.871,0.453,0.316
std,0.23,217.815,15.439,6.255,4248.373,,4.843,6.25,1.034,6.242,0.81
min,0.0,0.0,21.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.016,46.0,0.0,123.0,,4.0,0.0,0.0,0.0,0.0
50%,0.0,0.082,57.0,0.0,1159.0,,6.0,0.0,1.0,0.0,0.0
75%,0.0,0.441,67.0,0.0,2382.0,,10.0,0.0,1.0,0.0,0.0
max,1.0,22198.0,109.0,98.0,329664.0,,45.0,98.0,23.0,98.0,9.0
