In [118]:
import numpy as np
import pandas as pd

import seaborn as sns

### Описание датасета

* **Home Ownership** - домовладение
* **Annual Income** - годовой доход
* **Years in current job** - количество лет на текущем месте работы
* **Tax Liens** - налоговые обременения
* **Number of Open Accounts** - количество открытых счетов
* **Years of Credit History** - количество лет кредитной истории
* **Maximum Open Credit** - наибольший открытый кредит
* **Number of Credit Problems** - количество проблем с кредитом
* **Months since last delinquent** - количество месяцев с последней просрочки платежа
* **Bankruptcies** - банкротства
* **Purpose** - цель кредита
* **Term** - срок кредита
* **Current Loan Amount** - текущая сумма кредита
* **Current Credit Balance** - текущий кредитный баланс
* **Monthly Debt** - ежемесячный долг
* **Credit Score** - баллы кредитного рейтинга
* **Credit Default** - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

**Пути к директориям и файлам**

In [119]:
# Locations of the train / test CSV files, relative to the notebook directory.
TRAIN_DATASET_PATH = "./course_project_train.csv"
TEST_DATASET_PATH = "./course_project_test.csv"

**Загрузка данных**

In [120]:
# Load the train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATASET_PATH, TEST_DATASET_PATH)
)

# Cell output: (rows, columns) of each frame.
df_train.shape, df_test.shape

((7500, 17), (2500, 16))

In [121]:
# Column dtypes and non-null counts of the training frame (reveals which columns have gaps).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [122]:
# Summary statistics (count, mean, std, quantiles, min/max) of the numeric columns.
df_train.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366392.0,0.030133,11.130933,18.317467,945153.7,0.17,34.6926,0.117152,11873180.0,289833.2,18314.454133,1151.087498,0.281733
std,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688806,0.347192,31926120.0,317871.4,11926.764673,1604.451418,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.8,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


In [123]:
# Show only the object-dtype (string) columns, i.e. the ones that still need encoding.
df_train.select_dtypes(include=['object'])

Unnamed: 0,Home Ownership,Years in current job,Purpose,Term
0,Own Home,,debt consolidation,Short Term
1,Own Home,10+ years,debt consolidation,Long Term
2,Home Mortgage,8 years,debt consolidation,Short Term
3,Own Home,6 years,debt consolidation,Short Term
4,Rent,8 years,debt consolidation,Short Term
...,...,...,...,...
7495,Rent,< 1 year,other,Short Term
7496,Home Mortgage,1 year,debt consolidation,Long Term
7497,Rent,6 years,buy a car,Short Term
7498,Home Mortgage,,debt consolidation,Short Term


### *Home Ownership*

In [124]:
# Frequency of each 'Home Ownership' category in the raw data.
df_train['Home Ownership'].value_counts()

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64

In [125]:
# Fold the 'Have Mortgage' label into 'Home Mortgage'; all other labels are kept as they are.
df_train['Home Ownership'] = df_train['Home Ownership'].replace(
    'Have Mortgage', 'Home Mortgage'
)

In [126]:
# Re-check the category frequencies after 'Have Mortgage' was relabelled.
df_train['Home Ownership'].value_counts()

Home Mortgage    3649
Rent             3204
Own Home          647
Name: Home Ownership, dtype: int64

In [127]:
# One-hot encode 'Home Ownership' (the source column is replaced by one indicator
# column per category). The dtype is pinned so the indicators are 0/1 integers on
# every pandas version: the default was uint8 before pandas 2.0 and is bool since.
# NOTE: this cell is not re-runnable on its own, because 'Home Ownership' no longer
# exists in df_train after the first run.
df_train = pd.get_dummies(df_train, columns=['Home Ownership'], dtype='uint8')

In [128]:
# Spot-check two random rows; no random_state is passed, so the rows shown differ between runs.
df_train.sample(2)

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent
409,952660.0,8 years,0.0,5.0,14.5,573210.0,0.0,,0.0,debt consolidation,Short Term,220616.0,165338.0,1675.0,749.0,0,1,0,0
1061,596999.0,3 years,0.0,7.0,13.3,440748.0,0.0,,0.0,debt consolidation,Short Term,324038.0,304893.0,7612.0,739.0,1,0,0,1


### *Annual Income*

In [129]:
# Record which rows had no income BEFORE imputing, so that "value was missing"
# survives the fill as a separate 0/1 feature.
df_train['Annual Income NaN'] = 0
df_train.loc[df_train['Annual Income'].isna(), 'Annual Income NaN'] = 1

# Impute the gaps with the median of the observed values. The result is assigned
# back to the frame: calling fillna(inplace=True) on a selected column is chained
# assignment, which warns on pandas 2.x and leaves df_train unchanged under
# Copy-on-Write.
df_train['Annual Income'] = df_train['Annual Income'].fillna(
    df_train['Annual Income'].median()
)

In [130]:
# Value frequencies after imputation; the median fill value is expected to be the most frequent one.
df_train['Annual Income'].value_counts()

1168386.0    1558
1161660.0       4
1058376.0       4
1043651.0       4
969475.0        4
             ... 
1272316.0       1
712082.0        1
695476.0        1
678889.0        1
402192.0        1
Name: Annual Income, Length: 5478, dtype: int64

In [131]:
# Spot-check two random rows (unseeded) to see the new 'Annual Income NaN' flag next to the data.
df_train.sample(2)

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent,Annual Income NaN
4927,1168386.0,5 years,0.0,8.0,11.7,99132.0,0.0,54.0,0.0,debt consolidation,Short Term,87406.0,73625.0,8728.0,,0,0,0,1,1
3936,735927.0,7 years,0.0,6.0,8.9,156948.0,0.0,,0.0,debt consolidation,Short Term,179388.0,106799.0,11898.0,7200.0,1,1,0,0,0


### *Years in current job*

In [132]:
# Flag the rows where 'Years in current job' is missing. The mask has to come from
# that same column, and must be taken before its gaps are filled. Building it from
# 'Annual Income' (already imputed at this point) would leave the flag at 0 everywhere.
df_train['Years in current job NaN'] = 0
df_train.loc[df_train['Years in current job'].isna(), 'Years in current job NaN'] = 1

# Most frequent category: shown as the cell output and used as the fill value.
years_mode = df_train['Years in current job'].mode()[0]
years_mode

'10+ years'

In [133]:
# Fill the gaps with the modal category. Assign the result back instead of calling
# fillna(inplace=True) on a column selection: that is chained assignment, which
# warns on pandas 2.x and has no effect on df_train under Copy-on-Write.
df_train['Years in current job'] = df_train['Years in current job'].fillna(years_mode)

In [134]:
# Category frequencies after the gaps were filled with the mode.
df_train['Years in current job'].value_counts()

10+ years    2703
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64

In [141]:
# Mapping from the 'Years in current job' labels to an ordinal number of years:
# '< 1 year' -> 0, '1 year' -> 1, 'N years' -> N for N in 2..9, '10+ years' -> 10.
years_to_numbers = {
    '< 1 year': 0,
    '1 year': 1,
    **{f'{n} years': n for n in range(2, 10)},
    '10+ years': 10,
}

In [142]:
# Convert the textual labels to integers using years_to_numbers. The result is
# assigned back (replace(inplace=True) on a column selection is chained assignment
# and does not update df_train under Copy-on-Write). The explicit cast makes the
# column int64 regardless of how replace() infers the dtype, and fails loudly if
# any unmapped label or NaN is still present. Re-running the cell is harmless:
# already-converted integers are left as they are.
df_train['Years in current job'] = (
    df_train['Years in current job']
    .replace(years_to_numbers)
    .astype('int64')
)

In [155]:
# Spot-check three random rows (unseeded) after encoding 'Years in current job'.
df_train.sample(3)

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,...,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent,Annual Income NaN,Years in current job NaN
2116,1133388.0,2,0.0,8.0,14.6,194964.0,1.0,,1.0,debt consolidation,...,99999999.0,49001.0,7329.0,746.0,0,1,0,0,0,0
2761,1168386.0,10,0.0,6.0,22.3,818664.0,1.0,,1.0,debt consolidation,...,490930.0,155553.0,14303.0,,0,1,0,0,1,0
1937,1360989.0,10,0.0,12.0,23.4,1111462.0,0.0,,0.0,debt consolidation,...,245960.0,289883.0,20075.0,750.0,0,1,0,0,0,0


### *Months since last delinquent*

In [90]:
# Value frequencies of 'Months since last delinquent'; value_counts() leaves NaN out by default.
df_train['Months since last delinquent'].value_counts()

14.0     76
29.0     71
33.0     68
8.0      68
12.0     65
         ..
86.0      1
118.0     1
84.0      1
91.0      1
92.0      1
Name: Months since last delinquent, Length: 89, dtype: int64