In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model, model_selection
from matplotlib import pyplot as plt
import imblearn
from imblearn import over_sampling

In [2]:
# Load the raw data set; the first CSV column is used as the row index.
data = pd.read_csv('./data/rankingcard.csv', index_col=0)
data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
# (rows, columns) of the raw data.
data.shape

(150000, 11)

In [4]:
# Column dtypes and non-null counts of the raw data.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB


In [5]:
# Fraction of missing values in each column.
data.isnull().mean()

SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198207
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.026160
dtype: float64

In [6]:
# Remove fully duplicated rows (rebinding `data` instead of mutating it in place).
data = data.drop_duplicates()

In [7]:
# (rows, columns) after removing duplicate rows.
data.shape

(149391, 11)

In [8]:
# Rows were removed above, so the index has gaps; renumber it from 0.
data = data.reset_index(drop=True)

In [9]:
# Re-check the fraction of missing values in each column.
data.isnull().mean()

SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.195601
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.025624
dtype: float64

In [10]:
# NumberOfDependents (number of family members): filled with the column mean.
# MonthlyIncome: about 20% of the values are missing, which is a lot. A high income
# generally helps a loan application, so applicants who left it blank are quite likely
# low-income; those gaps are imputed later with random forest regression.
#
# int() truncates the mean so that the fill value is a whole number of dependents.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# selection is a chained in-place update that pandas does not guarantee to propagate
# to `data` (it is a no-op under Copy-on-Write).
dependents_fill = int(data['NumberOfDependents'].mean())
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(dependents_fill)


In [11]:
# Fraction of missing values per column after filling NumberOfDependents.
data.isnull().mean()

SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.195601
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.000000
dtype: float64

In [12]:
# Random-forest-regression imputer for a single column.
def fill_missing_rf(x, y, to_fill, n_estimators=100, random_state=None):
    """
    Predict the missing values of one column with a random forest regressor.

    Rows where ``to_fill`` is present are used to train the model; rows where
    it is missing are predicted. The predictors are every other column of
    ``x`` plus ``y``. Only the arguments are used; no notebook-level variable
    is read, and ``x`` is not modified.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; must contain the column ``to_fill``.
    y : pandas.Series (or anything ``pd.DataFrame`` accepts)
        Label column. It is concatenated column-wise to the predictors, so it
        is aligned with ``x`` on the index.
    to_fill : str
        Name of the column of ``x`` whose missing values are predicted.
    n_estimators : int, default 100
        Number of trees passed to ``RandomForestRegressor``.
    random_state : int or None, default None
        Seed passed to ``RandomForestRegressor``; pass an int to make the
        imputation reproducible.

    Returns
    -------
    Array of predictions as returned by ``predict``, one per row of ``x``
    where ``to_fill`` is missing, in the row order of ``x``. An empty array
    is returned when nothing is missing.
    """
    fill = x.loc[:, to_fill]
    # Build the predictors from the arguments only: all other features plus the label.
    predictors = pd.concat(
        [x.loc[:, x.columns != to_fill], pd.DataFrame(y)],
        axis=1,
    )

    # Rows with a known value train the model; rows with a missing value are predicted.
    Ytrain = fill[fill.notna()]
    Ytest = fill[fill.isna()]
    if Ytest.empty:
        # Nothing to impute (e.g. the cell is re-run after the column was filled).
        return np.empty(0, dtype=float)
    Xtrain = predictors.loc[Ytrain.index, :]
    Xtest = predictors.loc[Ytest.index, :]

    # Imported locally, as in the original cell, so the function is self-contained.
    from sklearn import ensemble
    rfr = ensemble.RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
    ).fit(Xtrain, Ytrain)
    return rfr.predict(Xtest)
    

In [13]:
# Disabled alternative: select the feature columns by name instead of by position.
# X = data[data.columns[data.columns != 'SeriousDlqin2yrs']]
# # X.head()
# y = data['SeriousDlqin2yrs']
# y.head()

In [14]:
# The label is the first column; every remaining column is a feature.
y = data['SeriousDlqin2yrs']
X = data.iloc[:, 1:]
X.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [15]:
# Fill the missing MonthlyIncome values with the random forest predictions.
income_missing = data['MonthlyIncome'].isnull()
y_pred = fill_missing_rf(X, y, 'MonthlyIncome')
data.loc[income_missing, 'MonthlyIncome'] = y_pred

In [16]:
# Re-inspect the non-null counts after the MonthlyIncome imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149391 entries, 0 to 149390
Data columns (total 11 columns):
SeriousDlqin2yrs                        149391 non-null int64
RevolvingUtilizationOfUnsecuredLines    149391 non-null float64
age                                     149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    149391 non-null int64
DebtRatio                               149391 non-null float64
MonthlyIncome                           149391 non-null float64
NumberOfOpenCreditLinesAndLoans         149391 non-null int64
NumberOfTimes90DaysLate                 149391 non-null int64
NumberRealEstateLoansOrLines            149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    149391 non-null int64
NumberOfDependents                      149391 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.5 MB


In [17]:
# Use describe() with extra percentiles to look for outliers.
# Author's notes on the columns:
# - SeriousDlqin2yrs: the label is imbalanced (right-skewed); more than 90% of the values are 0.
# - RevolvingUtilizationOfUnsecuredLines: ratio of available credit (loans and credit cards)
#   to the total credit limit; considered normal by the author.
#   NOTE(review): the recorded output shows a max of 50708 against a 99th percentile of
#   about 1.09 - confirm that this column really needs no outlier handling.
# - age: expected to lie between 8 and 110 (a child account requires age >= 8);
#   the minimum of 0 should be an outlier.
# - NumberOfTime30-59DaysPastDueNotWorse: times 30-59 days past due but not worse;
#   a value of 98 looks abnormal.
# - NumberOfTime60-89DaysPastDueNotWorse: times 60-89 days past due but not worse;
#   a value of 98 looks abnormal.
# - NumberOfTimes90DaysLate: times 90 days or more past due; a value of 98 looks abnormal.
# - DebtRatio: ratio of monthly debt repayment.
# - MonthlyIncome: monthly income.
# - NumberOfOpenCreditLinesAndLoans: number of open loans and lines of credit.
# - NumberRealEstateLoansOrLines: number of mortgage / real-estate loans.
# - NumberOfDependents: number of family members; a value of 20 may be an outlier.
summary_percentiles = [0.01, 0.1, 0.25, 0.50, 0.75, 0.9, 0.99]
data.describe(percentiles=summary_percentiles).T

Unnamed: 0,count,mean,std,min,1%,10%,25%,50%,75%,90%,99%,max
SeriousDlqin2yrs,149391.0,0.066999,0.250021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
RevolvingUtilizationOfUnsecuredLines,149391.0,6.071087,250.263672,0.0,0.0,0.003199,0.030132,0.154235,0.556494,0.978007,1.093922,50708.0
age,149391.0,52.306237,14.725962,0.0,24.0,33.0,41.0,52.0,63.0,72.0,87.0,109.0
NumberOfTime30-59DaysPastDueNotWorse,149391.0,0.393886,3.852953,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,98.0
DebtRatio,149391.0,354.43674,2041.843455,0.0,0.0,0.034991,0.177441,0.368234,0.875279,1275.0,4985.1,329664.0
MonthlyIncome,149391.0,5424.568467,13232.400177,0.0,0.0,0.18,1800.0,4419.0,7416.0,10800.0,23250.0,3008750.0
NumberOfOpenCreditLinesAndLoans,149391.0,8.480892,5.136515,0.0,0.0,3.0,5.0,8.0,11.0,15.0,24.0,58.0
NumberOfTimes90DaysLate,149391.0,0.23812,3.826165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,98.0
NumberRealEstateLoansOrLines,149391.0,1.022391,1.130196,0.0,0.0,0.0,0.0,1.0,2.0,2.0,4.0,54.0
NumberOfTime60-89DaysPastDueNotWorse,149391.0,0.212503,3.810523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,98.0


In [18]:
# Drop the samples whose age is 0 (a single row, per the author's check); 0 is not a valid age.
# The former bare `data[data['age'] == 0]` statement was removed: it was not the last
# expression of the cell, so its result was computed and discarded without being displayed.
data = data[data['age'] != 0]

In [19]:
# Confirm that no sample with age 0 is left.
data['age'].eq(0).sum()

0

In [20]:
# Next, handle the outlying value 98 in NumberOfTime30-59DaysPastDueNotWorse,
# NumberOfTime60-89DaysPastDueNotWorse and NumberOfTimes90DaysLate.
# Count the rows whose 30-59-days-past-due counter exceeds 90.
over_90_mask = data['NumberOfTime30-59DaysPastDueNotWorse'].gt(90)
data.loc[over_90_mask].count()

SeriousDlqin2yrs                        225
RevolvingUtilizationOfUnsecuredLines    225
age                                     225
NumberOfTime30-59DaysPastDueNotWorse    225
DebtRatio                               225
MonthlyIncome                           225
NumberOfOpenCreditLinesAndLoans         225
NumberOfTimes90DaysLate                 225
NumberRealEstateLoansOrLines            225
NumberOfTime60-89DaysPastDueNotWorse    225
NumberOfDependents                      225
dtype: int64

In [21]:
# Keep only the rows where all three past-due counters are below 90.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
data = data[(data[past_due_columns] < 90).all(axis=1)]

In [22]:
# data[data['NumberOfTime30-59DaysPastDueNotWorse'] > 90].count()
# data[data['NumberOfTime60-89DaysPastDueNotWorse'] > 90].count()
# Count the rows that still have a 90-days-late counter above 90 (only this column is checked here).
still_over_90 = data['NumberOfTimes90DaysLate'].gt(90)
data.loc[still_over_90].count()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [23]:
# Rows were dropped again, so renumber the index from 0 once more.
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [24]:
# Next, look at class imbalance.
# The ratio of negative (0) to positive (1) samples printed below shows the data is imbalanced.
label_counts = data['SeriousDlqin2yrs'].value_counts()
label_counts[0] / label_counts[1]

14.108376380026334

In [25]:
# Class imbalance can be handled by over-sampling (adding minority-class samples) or
# under-sampling (removing majority-class samples); SMOTE over-sampling is used here.
# NOTE(review): the resampling is applied to the whole data set before the train/test
# split performed further down, so synthetic rows can land in the test set - consider
# splitting first and resampling only the training part.
X = data.iloc[:, 1:]
y = data['SeriousDlqin2yrs']
sm = over_sampling.SMOTE(random_state=66)
# fit_resample is the supported name; the older fit_sample alias is no longer
# available in current imbalanced-learn releases.
X, y = sm.fit_resample(X, y)

In [26]:
# Number of samples per class after resampling.
pd.Series(y).value_counts()

1    139292
0    139292
dtype: int64

In [27]:
# Ratio of positive (1) to negative (0) samples after resampling.
resampled_counts = pd.Series(y).value_counts()
resampled_counts[1] / resampled_counts[0]

1.0

In [28]:
# Split into a training set and a test set (30% of the rows go to the test set).
X = pd.DataFrame(X)
y = pd.DataFrame(y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=66,
)


In [31]:
# Save the training set and the test set.
def _assemble_split(labels, features):
    """Put the label in front of the features, renumber the rows and restore the column names of `data`."""
    frame = pd.concat([labels, features], axis=1)
    frame.index = range(frame.shape[0])
    frame.columns = data.columns
    return frame


model_data = _assemble_split(y_train, X_train)
model_data.to_csv('./data/model_data.csv', index=False)

test_data = _assemble_split(y_test, X_test)
test_data.to_csv('./data/test_data.csv', index=False)

In [32]:
# Reload the saved training set from disk and preview it.
model_data = pd.read_csv('./data/model_data.csv')
model_data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0.049337,53.0,0.0,805.0,0.5,12.0,0.0,1.0,0.0,0.0
1,1,0.491558,44.543111,1.0,0.327845,6045.913777,4.51437,0.0,1.51437,0.51437,1.456889
2,0,0.005643,69.0,0.0,0.315537,5000.0,9.0,0.0,1.0,0.0,1.0
3,1,0.475414,32.620861,1.137418,0.779157,3800.0,9.758279,0.0,1.758279,0.0,1.516558
4,1,0.257248,43.0,0.0,0.626258,10731.0,7.0,0.0,4.0,0.0,4.0
