Description of data set found here: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

Questions we can ask:


In [45]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_excel("default of credit card clients.xls", skiprows = 1)

In [3]:
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
#relabeling data for clarity
data.EDUCATION = data.EDUCATION.map({1: 'graduate school', 2: 'university', 3: 'high school', 4: 'others'})
data.MARRIAGE = data.MARRIAGE.map({1: 'married', 2: 'single', 3: 'others'})
data.SEX = data.SEX.apply(lambda x: "MALE" if x == 1 else "FEMALE")

In [5]:
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,FEMALE,university,married,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,FEMALE,university,single,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,FEMALE,university,single,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,FEMALE,university,married,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,MALE,university,married,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [6]:
#Group by sex
sex_gp = data.groupby('SEX')
print "Median debt\n"
print sex_gp.LIMIT_BAL.median()
print "\nTotal number of default payments/Total number of loans"
print "\n", sex_gp['default payment next month'].mean()

Median debt

SEX
FEMALE    150000
MALE      130000
Name: LIMIT_BAL, dtype: int64

Total number of default payments/Total number of loans

SEX
FEMALE    0.207763
MALE      0.241672
Name: default payment next month, dtype: float64


In [7]:
#Group by education
sex_gp = data.groupby('EDUCATION')
print "Median debt\n"
print sex_gp.LIMIT_BAL.median()
print "\nTotal number of default payments/Total number of loans"
print "\n", sex_gp['default payment next month'].mean()

Median debt

EDUCATION
graduate school    200000
high school         80000
others             200000
university         110000
Name: LIMIT_BAL, dtype: int64

Total number of default payments/Total number of loans

EDUCATION
graduate school    0.192348
high school        0.251576
others             0.056911
university         0.237349
Name: default payment next month, dtype: float64


In [8]:
#group by age bracket
def agebracket(x):
    if x >= 18 and x <= 24:
        return "18-24"
    elif x >= 25 and x <= 34:
        return "25-34"
    elif x >= 35 and x <= 44:
        return "35-44"
    elif x >= 45 and x <= 54:
        return "45-54"
    elif x >= 55:
        return "55+"
    
data['Age Bracket'] = data.AGE.apply(lambda x: agebracket(x))
agebr_gp = data.groupby('Age Bracket')
print "Median debt\n"
print agebr_gp.LIMIT_BAL.median()
print "\nLikelihood of default"
print "\n", agebr_gp['default payment next month'].mean()

Median debt

Age Bracket
18-24     50000
25-34    150000
35-44    180000
45-54    130000
55+      120000
Name: LIMIT_BAL, dtype: int64

Likelihood of default

Age Bracket
18-24    0.271881
25-34    0.202982
35-44    0.218563
45-54    0.239310
55+      0.266857
Name: default payment next month, dtype: float64


In [9]:
#By all combinations of sex, education, and age bracket
grouped_data = data.groupby(['SEX', 'MARRIAGE', 'EDUCATION', 'Age Bracket'])
print "Median debt\n"
print grouped_data.LIMIT_BAL.median()
print "\nLikelihood of default"
print "\n", grouped_data['default payment next month'].mean()

Median debt

SEX     MARRIAGE  EDUCATION        Age Bracket
FEMALE  married   graduate school  18-24           50000
                                   25-34          210000
                                   35-44          230000
                                   45-54          235000
                                   55+            250000
                  high school      18-24           45000
                                   25-34           90000
                                   35-44          110000
                                   45-54           80000
                                   55+             80000
                  others           18-24           70000
                                   25-34          185000
                                   35-44          225000
                                   45-54          295000
                                   55+            240000
                  university       18-24           50000
                            

In [10]:
#Group with highest median credit
#Group with lowest median credit
#Group with highest percentage of defaults
#Group with lowest percentage of defaults

In [11]:
#derive som fields, e.g. amount owed, total amount paid

In [43]:
data.head()

Unnamed: 0,Age Bracket,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,18-24,20000,FEMALE,university,married,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,25-34,120000,FEMALE,university,single,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,25-34,90000,FEMALE,university,single,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,35-44,50000,FEMALE,university,married,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,55+,50000,MALE,university,married,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
