In [1]:
import pandas as pd

In [2]:
# Function for reading the data
def read_csv(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename
        Anything ``pd.read_csv`` accepts as its first argument; it is
        forwarded unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, read with the ``pd.read_csv`` defaults.
    """
    return pd.read_csv(filename)

In [3]:
# Load 'data/credit-data.csv' (path relative to the working directory) into `df`
df = read_csv('data/credit-data.csv')

In [4]:
# Preview the first rows to get a feel for the columns and their values
df.head()

Unnamed: 0,PersonID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,98976,0,1.0,55,60601,0,505.0,0.0,2,0,0,0,0.0
1,98991,0,0.547745,71,60601,0,0.459565,15666.0,7,0,2,0,0.0
2,99012,0,0.04428,51,60601,0,0.01452,4200.0,5,0,0,0,0.0
3,99023,0,0.914249,55,60601,4,0.794875,9052.0,12,0,3,0,0.0
4,99027,0,0.026599,45,60601,0,0.049966,10406.0,4,0,0,0,2.0


In [5]:
# Size of the dataset as (number of rows, number of columns)
df.shape

(41016, 13)

In [6]:
# Count the missing (NaN) values in each column
df.isna().sum()

PersonID                                   0
SeriousDlqin2yrs                           0
RevolvingUtilizationOfUnsecuredLines       0
age                                        0
zipcode                                    0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                           7974
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                      1037
dtype: int64

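- We can see that __MonthlyIncome__ and __NumberOfDependents__ have many missing values (7,974 and 1,037 of the 41,016 rows, respectively).
- Dropping every row with a missing value would therefore discard a large share of the data, so we impute the missing values instead.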

In [7]:
# Inspect the dtype of each column before deciding how to impute it
df.dtypes

PersonID                                  int64
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
zipcode                                   int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

We will fill the missing values of __MonthlyIncome__ with its average value (rounded to the nearest integer) and the missing values of __NumberOfDependents__ with its most common value.

In [8]:
def fill_continuous_na(df, columns):
    """Fill missing values in continuous columns with the rounded column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The listed columns are reassigned on this object,
        so the caller's frame is modified in place.
    columns : iterable of str
        Names of the numeric columns whose NaNs should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    The mean ignores NaNs and is rounded to the nearest integer before it is
    used as the fill value. A column with no non-missing values has no usable
    mean and is left untouched.
    """
    for column in columns:
        column_mean = df[column].mean()

        # An empty or all-NaN column yields a NaN mean, and round(NaN) raises
        # ValueError; there is nothing sensible to fill with, so skip it.
        if pd.isna(column_mean):
            continue

        df[column] = df[column].fillna(round(column_mean))

    return df

In [9]:
def fill_categorical_na(df, columns):
    """Fill missing values in categorical columns with the most common value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The listed columns are reassigned on this object,
        so the caller's frame is modified in place.
    columns : iterable of str
        Names of the columns whose NaNs should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    The fill value is taken from the non-missing entries only, and only
    missing entries are changed. No placeholder value is written into the
    column, so genuine values (including ``99``) are preserved, and the
    result is correct even when missing entries outnumber every real value.
    A column with no non-missing values is left untouched.
    """
    for column in columns:
        # value_counts() excludes NaN by default and sorts by frequency,
        # so the first index entry is the most common observed value.
        counts = df[column].value_counts()

        # Empty or all-NaN column: there is no observed value to fill with.
        if counts.empty:
            continue

        df[column] = df[column].fillna(counts.index[0])

    return df

In [10]:
# Impute the two columns that contain missing values:
# MonthlyIncome is treated as continuous, NumberOfDependents as categorical.
df = fill_continuous_na(df, ['MonthlyIncome'])
df = fill_categorical_na(df, ['NumberOfDependents'])

Now we verify that there is no missing data.

In [11]:
# Recount the missing values per column; every count should now be zero
df.isna().sum()

PersonID                                0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
zipcode                                 0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64