In [3]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np

# 시각화
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from wordcloud import WordCloud

# styling
%matplotlib inline
sns.set_style('darkgrid')
mpl.rcParams['font.size'] = 14
mpl.rcParams['figure.facecolor'] = '#00000000'
mpl.rcParams['font.size'] = 14
mpl.rcParams['figure.facecolor'] = '#00000000'

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the credit-risk dataset from its CSV file and preview the first rows.
DATA_PATH = '../project_1/datasets/credit_risk_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


#### feature description
- person_age: Age of the individual applying for the loan. 나이
- person_income: Annual income of the individual. 소득
- person_home_ownership: Type of home ownership of the individual. 소유주택형태
    - rent: The individual is currently renting a property.
    - mortgage: The individual has a mortgage on the property they own.
    - own: The individual owns their home outright.
    - other: Other categories of home ownership that may be specific to the dataset.
- person_emp_length: Employment length of the individual in years. 근무 기간
- loan_intent: The intent behind the loan application. 대출 목적
- loan_grade: The grade assigned to the loan based on the creditworthiness of the borrower. 대출 등급
    - A: The borrower has a high creditworthiness, indicating low risk.
    - B: The borrower is relatively low-risk, but not as creditworthy as Grade A.
    - C: The borrower's creditworthiness is moderate.
    - D: The borrower is considered to have higher risk compared to previous grades.
    - E: The borrower's creditworthiness is lower, indicating a higher risk.
    - F: The borrower poses a significant credit risk.
    - G: The borrower's creditworthiness is the lowest, signifying the highest risk.
- loan_amnt: The loan amount requested by the individual. 대출액
- loan_int_rate: The interest rate associated with the loan. 대출 이자율
- loan_status: Loan status, where 0 indicates non-default and 1 indicates default. 대출상태
    - 0: Non-default - The borrower successfully repaid the loan as agreed, and there was no default. 상환완료
    - 1: Default - The borrower failed to repay the loan according to the agreed-upon terms and defaulted on the loan. 채무불이행
- loan_percent_income: The loan amount as a fraction of the individual's annual income (e.g. 0.59 means the loan equals 59% of income). 소득 대비 대출액 비율
- cb_person_default_on_file: Historical default of the individual as per credit bureau records. 채무불이행이력
    - Y: The individual has a history of defaults on their credit file.
    - N: The individual does not have any history of defaults.
- cb_person_cred_hist_length: The length of credit history for the individual. 신용기록기간

- person_age (나이): 대출을 신청한 개인의 나이
- person_income (연간 소득): 해당 개인의 연간 소득
- person_home_ownership (주택 소유 형태): 해당 개인의 주택 소유 형태
    - rent (임차): 해당 개인이 현재 주택을 임차 중인 경우
    - mortgage (담보대출): 해당 개인이 소유한 주택에 대한 모기지(담보대출)가 있는 경우
    - own (자가 소유): 해당 개인이 주택을 완전 소유하고 있는 경우
    - other (기타): 데이터셋에 특정한 다른 주택 소유 형태가 있는 경우
- person_emp_length (근무 기간): 해당 개인의 근무 기간(연 단위)
- loan_intent (대출 목적): 대출 신청 목적
- loan_grade (대출 등급): 대출에 대한 신용 위험도를 나타내는 등급
    - A, B, C, D, E, F, G: A부터 G까지 대출 신청자의 신용 위험도에 따라 분류된 등급
- loan_amnt (대출 금액): 대출 신청자가 요청한 대출 금액
- loan_int_rate (대출 이자율): 대출에 연결된 이자율
- loan_status (대출 상태): 대출 상태를 나타내며, 0은 채무 불이행이 없음을, 1은 채무 불이행을 의미
- loan_percent_income (소득 대비 대출 금액 비율): 대출 금액이 연간 소득에서 차지하는 비율
- cb_person_default_on_file (신용 기록의 채무불이행 여부): 해당 개인의 신용 기록에서 채무불이행 이력
    - Y: 해당 개인의 신용 기록에 채무불이행 이력이 있는 경우
    - N: 해당 개인의 신용 기록에 채무불이행 이력이 없는 경우
- cb_person_cred_hist_length (신용 기록 기간): 해당 개인의 신용 기록 기간, 일반적으로 연도(year) 단위로 표시, 해당 개인이 금융 제품을 사용하거나 대출을 시작한 후부터 현재까지의 기간을 나타냄. 긴 신용 기록 기간은 대출 승인 및 신용평가에 긍정적인 영향을 줄 수 있음

#### EDA

In [5]:
# Preview the first rows of the loaded frame as the starting point of the EDA.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows max person_age = 144 and
# max person_emp_length = 123, which look like outliers worth checking
# before any modelling.
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [7]:
# Print each column's dtype and non-null count. In the recorded output
# person_emp_length and loan_int_rate have fewer non-null entries than the
# 32581 rows, i.e. they contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [8]:
# Count the missing values in every column.
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [9]:
# Count the distinct values in each column; the low counts in the recorded
# output (e.g. 4 home-ownership types, 7 loan grades) mark the categorical columns.
df.nunique()

person_age                      58
person_income                 4295
person_home_ownership            4
person_emp_length               36
loan_intent                      6
loan_grade                       7
loan_amnt                      753
loan_int_rate                  348
loan_status                      2
loan_percent_income             77
cb_person_default_on_file        2
cb_person_cred_hist_length      29
dtype: int64

In [10]:
# Inspect the target column (0 = non-default, 1 = default).
# Bracket indexing is used instead of attribute access so the lookup matches
# the other column selections in this notebook and cannot be shadowed by a
# DataFrame attribute or method of the same name.
df['loan_status']

0        1
1        0
2        1
3        1
4        1
        ..
32576    0
32577    0
32578    1
32579    0
32580    0
Name: loan_status, Length: 32581, dtype: int64

In [11]:
# Inspect the loan_intent column (loan purpose, stored as object dtype).
df['loan_intent']

0               PERSONAL
1              EDUCATION
2                MEDICAL
3                MEDICAL
4                MEDICAL
              ...       
32576           PERSONAL
32577           PERSONAL
32578    HOMEIMPROVEMENT
32579           PERSONAL
32580            MEDICAL
Name: loan_intent, Length: 32581, dtype: object