In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, LabelEncoder
from sklearn import metrics
import os
from scipy import stats as spstats

In [2]:
#### load data files ###

# Change file name prefix here and you are good to go:D
# prefix = "train_"
# prefix = "A_"

# Generate data file name list
file_names = ["Application.csv","Personas.csv","History_Application.csv","History_Payment.csv"]
# for idx, val in enumerate(file_names):
#     file_names[idx] = prefix + val  

# load data files
prefix = "train_"
d01 = pd.read_csv(f'./data/{prefix}{file_names[0]}')
d02 = pd.read_csv(f'./data/{prefix}{file_names[1]}')
d03 = pd.read_csv(f'./data/{prefix}{file_names[2]}')
d04 = pd.read_csv(f'./data/{prefix}{file_names[3]}')
prefix = "A_"
# load data files
d05 = pd.read_csv(f'./data/{prefix}{file_names[0]}')
d06 = pd.read_csv(f'./data/{prefix}{file_names[1]}')
d07 = pd.read_csv(f'./data/{prefix}{file_names[2]}')
d08 = pd.read_csv(f'./data/{prefix}{file_names[3]}')

# Concat
d1 = pd.concat([d01, d05], axis=0)
d2 = pd.concat([d02, d06], axis=0)
d3 = pd.concat([d03, d07], axis=0)
d4 = pd.concat([d04, d08], axis=0)

In [3]:
### Functions

# 取一个列的众数
def get_mode(df, col):
    return pd.Series(df[col]).mode()[0]

# 置多个列的空值为所属列0
def fill_zero(df, cols):
    dict = {}
    for col in cols:
        dict[col] = 0
    return df.fillna(dict)

# 置多个列的空值为所属列的平均数
def fill_mean(df, cols):
    dict = {}
    for col in cols:
        dict[col] = df[col].mean()
    return df.fillna(dict)

# 置多个列的空值为所属列的众数
def fill_mode(df, cols):
    dict = {}
    for col in cols:
        dict[col] = get_mode(df, col)
    return df.fillna(dict)

# 把多个离散型的列转化为one-hot编码列
def dummy_catgorical(df, cols, keep_n_cats):
    print('initial num of cols: ' + str(len(df.columns.to_list())))
    for col in cols:
        # 跳过离散变量太多的
        idx_list = df[col].value_counts().index.to_list()
        print(col+'has'+str(len(idx_list)))
        if (len(idx_list) > keep_n_cats):
            print(' col含有过多值, skipping')
        else:
            # 生成one-hot编码列
            df = pd.get_dummies(df, columns=[col])
    return df

# 打印一个df的na值
def print_na(df):
    # 检查d12 nan值
    df_na = df.isna().sum()
    for idx in df_na.index.to_list():
        if (df_na[idx]!=0):
            print(str(idx)+' '+str(df_na[idx]))

def print_val(df, col):
    return df[col].value_counts(dropna=False)

In [4]:
### intersection
# ''' d01&d02 '''
# id_train = set(d01['申请编号'])
# len(id_train)
# id_test = set(d02['申请编号'])
# len(id_test)
# inter = id_train.intersection(id_test)
# len(inter)

In [5]:
### Merge d1 d2 ###
uid = '申请编号'
d12 = pd.merge(d1, d2, how='inner', on=uid)
d12.shape

(161511, 62)

In [6]:
### d12 异常值处理

# 日期类
d12.loc[(d12['孩子个数'] == -1), '孩子个数'] = get_mode(d12, '孩子个数')

d12.loc[(d12['最近一次换手机号码距申请日天数']>0), '最近一次换手机号码距申请日天数'] = 0

d12.loc[(d12['工作日期距申请日期天数']==292204), '工作日期距申请日期天数'] = np.NaN
d12 = fill_mean(d12, ['工作日期距申请日期天数'])
d12.loc[(d12['工作日期距申请日期天数']>0), '工作日期距申请日期天数'] = 0

d12.loc[(d12['身份认证日期距申请日期天数']>0), '身份认证日期距申请日期天数'] = 0

d12 = fill_zero(d12, ['有车时间'])

# d12['最近一次换手机号码距申请日天数'].value_counts(dropna=False)
# d12['工作日期距申请日期天数'].value_counts(dropna=False)
# d12['身份认证日期距申请日期天数'].value_counts(dropna=False)
# d12['有车时间'].value_counts(dropna=False)

In [7]:
### d12 自定义特征

# 外部评分
d12['有外部评分'] = d12['外部评分'].apply(lambda x: 0 if np.isnan(x) else 1 )
d12 = fill_mean(d12, ['外部评分'])

d12['外部评分'].value_counts(dropna=False)
d12['有外部评分'].value_counts(dropna=False)

# # 其他衍生特征
d12['工作日期距申请日期月数'] = round(d12['工作日期距申请日期天数']/30)
d12['身份认证日期距申请日期月数'] = round(d12['身份认证日期距申请日期天数']/30)
d12['注册日期距申请日期月数'] = round(d12['注册日期距申请日期天数']/30)
d12['最近一次换手机号码距申请日月数'] = round(d12['最近一次换手机号码距申请日天数']/30)
d12['有车时间月数'] = round(d12['有车时间']/30)


0.474482    90682
0.401226        5
0.507349        4
0.393062        4
0.601635        4
            ...  
0.468025        1
0.563485        1
0.399266        1
0.280054        1
0.340233        1
Name: 外部评分, Length: 65205, dtype: int64

0    90682
1    70829
Name: 有外部评分, dtype: int64

In [8]:
# d12.columns

d12['客户居住地评分mean'] = (d12['客户居住地评分1'] + d12['客户居住地评分2']) / 2
d12['客户居住地评分sum'] = (d12['客户居住地评分1'] + d12['客户居住地评分2'])

d12['地址是否一致标志sum'] = (d12['地址是否一致标志1'] + d12['地址是否一致标志2'] + d12['地址是否一致标志3'] +
    d12['地址是否一致标志4'] + d12['地址是否一致标志5'] + d12['地址是否一致标志6'])
d12['地址是否一致标志mean'] = d12['地址是否一致标志sum'] / 6

# print_val(d12,'居住信息1')
d12['有居住信息1'] = d12['居住信息1'].apply(lambda x: 0 if np.isnan(x) else 1 )
d12['有居住信息2'] = d12['居住信息2'].apply(lambda x: 0 if np.isnan(x) else 1 )
d12['有居住信息3'] = d12['居住信息3'].apply(lambda x: 0 if np.isnan(x) else 1 )
d12['有居住信息4'] = d12['居住信息4'].apply(lambda x: 0 if np.isnan(x) else 1 )

d12 = fill_zero(d12, ['居住信息1', '居住信息2', '居住信息3', '居住信息4'])
d12['居住信息sum'] = (d12['居住信息1'] + d12['居住信息2'] + d12['居住信息3'] + d12['居住信息4'])
d12['居住信息mean'] = d12['居住信息sum'] / 4

d12.loc[(d12['居住信息1']==0), '居住信息1'] = d12['居住信息1'].mean()
d12.loc[(d12['居住信息2']==0), '居住信息2'] = d12['居住信息2'].mean()
d12.loc[(d12['居住信息3']==0), '居住信息3'] = d12['居住信息3'].mean()
d12.loc[(d12['居住信息4']==0), '居住信息4'] = d12['居住信息4'].mean()

d12 = fill_mode(d12, ['社交圈违约信息2_2', '社交圈违约信息2_1', '社交圈违约信息1_2', '社交圈违约信息1_1'])
d12['社交圈违约信息2sum'] = (d12['社交圈违约信息2_2'] + d12['社交圈违约信息2_1'])
d12['社交圈违约信息2mean'] = d12['社交圈违约信息2sum'] / 2
d12['社交圈违约信息1sum'] = (d12['社交圈违约信息1_2'] + d12['社交圈违约信息1_1'])
d12['社交圈违约信息1mean'] = d12['社交圈违约信息1sum'] / 4

# print_val(d12,'居住信息1')
# print_val(d12,'居住信息2')
# print_val(d12,'居住信息3')
# print_val(d12,'居住信息4')
# print_val(d12,'有居住信息1')
# print_val(d12,'有居住信息2')
# print_val(d12,'有居住信息3')
# print_val(d12,'有居住信息4')
# print_val(d12,'居住信息sum')
# print_val(d12,'居住信息mean')


In [9]:
### Outlier：Get ###
low = .01
high = .99
quant_df = d12.drop([uid], axis=1).quantile([low, high]).T
quant_df.to_csv('./tmp/1_d12_outlier.csv')

### Outlier：Apply ###

outliers = pd.read_csv('./tmp/0_outlier1.csv', index_col=0)
outliers.shape
outliers.head()

def set_outlier(col):
    if col < col_min:
        col = col_min
    elif col > col_max:
        col = col_max
    return col

# def set_outlier(col):
#     if col < col_min or col > col_max:
#         col = np.NaN
#     return col
    
for col in outliers.index:
# for col in '出生日期距申请日期天数', '社交圈违约信息1_2':
    col_min = outliers.loc[col, 'min']
    col_max = outliers.loc[col, 'max']
    d12[col] = d12[col].apply(set_outlier)
    
# d12.to_csv(f'./tmp/2_eda_outlier.csv')

(61, 2)

Unnamed: 0,min,max
地址是否一致标志1,0,1
地址是否一致标志2,0,1
地址是否一致标志3,0,1
地址是否一致标志4,0,1
地址是否一致标志5,0,1


In [23]:
# 检查d12 nan值

# d12.shape

# '''d12 nan值'''
# print_na(d12)

# d12['收入类型'].value_counts(dropna=False)
# d12_dum.columns.to_list()

(161511, 82)

'd12 nan值'

In [10]:
### d12 数据空值处理 ###

# 插补平均值
nan_to_mean = [
    '贷款年金',
    '商品价格',
#     '外部评分',
#     '居住信息1',
#     '居住信息2',
#     '居住信息3',
#     '居住信息4',
#     '有车时间'
]
#TODO: 空值插补mean
d12 = fill_mean(d12, nan_to_mean)

nan_to_zero = [
    '贷款申请前1小时内征信查询次数',
    '贷款申请前1天内征信查询次数',
    '贷款申请前1周内征信查询次数',
    '贷款申请前1个月内征信查询次数',
    '贷款申请前1个季度内征信查询次数',
    '贷款申请前1年内征信查询次数'
]
#TODO: 空值插补0
d12 = fill_zero(d12, nan_to_zero)

#TODO: 空值插补mean
d12 = fill_mean(d12, nan_to_mean)

nan_to_mode = [
    '家庭成员数',
#     '社交圈违约信息2_2',
#     '社交圈违约信息2_1',
#     '社交圈违约信息1_2',
#     '社交圈违约信息1_1',
    '陪同申请人',
    '职业'
]
#TODO: 空值插补mode
d12 = fill_mode(d12, nan_to_mode)

# 检查d12 nan值
'''d12 nan值'''
print_na(d12)

'd12 nan值'

In [11]:
### d12 类型变量转dummy列 ###
cat_to_dummy = [
    '陪同申请人',
    '收入类型',
    '教育程度',
    '居住状态',
    '职业',
    '单位类型'
]

d12 = dummy_catgorical(d12, cat_to_dummy, 20)

d12.shape
d12.shape
'''d12 nan值'''
print_na(d12)

initial num of cols: 82
陪同申请人has7
收入类型has8
教育程度has5
居住状态has6
职业has18
单位类型has58
 col含有过多值, skipping


(161511, 121)

(161511, 121)

'd12_dum nan值'

In [13]:
### d12 处理col含有过多值
if ('单位类型' in d12.columns):
    d12.drop(columns='单位类型',inplace=True)

In [14]:
### d3过滤

# 删除特征'是否为最后一次申请记录'
if ('是否为最后一次申请记录' in d3.columns):
    d3.drop(columns='是否为最后一次申请记录',inplace=True)

In [28]:
# 检查d3 nan值
# d3.shape
# d3['首付时间'].value_counts()
# '''d3 nan值'''
# print_na(d3)
# d3['首付率'].describe()
# print_val(d3,'首付率')

In [15]:
### d3数据清洗 - 金额型 ###

# 异常值置NaN
d3.loc[(d3['首付时间'] == 292204.0), '首付时间'] = np.NaN
d3.loc[(d3['首付金额'] == 11), '首付金额'] = np.NaN
d3.loc[(d3['贷款年金'] == 11), '贷款年金'] = np.NaN
d3.loc[(d3['商品价格'] == 11), '商品价格'] = np.NaN
d3.loc[(d3['申请额度'] == 11), '申请额度'] = np.NaN
d3.loc[(d3['信用额度'] == 11), '信用额度'] = np.NaN

# d3['销售区域_count'] = d3.groupby('销售区域')['销售区域'].transform('count')
# d3.loc[d3['销售区域_count'] < 5000, '销售区域'] = np.NaN

# 插补0
# nan_to_zero = [
# ]
#TODO: 空值插补0
# d3 = fill_zero(d3, nan_to_zero)

# 插补mean
nan_to_mean = [
    '贷款年金',
    '首付金额',
    '首付时间',
    '商品价格',
    '申请额度',
    '信用额度',
    '首付率',
    '利率1',
    '利率2'
]
#TODO: 空值插补mean
d3 = fill_mean(d3, nan_to_mean)

### d3数据清洗 - 布尔型 ###
# d3_cols['bool'] = [
#     '是否为当天最后一次申请记录']

### d3数据清洗 - 计数型 ###
# d3_cols['cnt'] = [
#     '贷款期数']

d3.shape
'''d3 nan值'''
print_na(d3)

(744984, 31)

'd3 nan值'

贷款类型 181
陪同申请人 368318
贷款期数 163119
产品组合 181


In [16]:
### d3 自定义特征

# print_val(d3,'历史贷款授信距本次申请时间')
d3.loc[(d3['历史贷款授信距本次申请时间']>0), '历史贷款授信距本次申请时间']=0
d3['历史贷款授信距本次申请月数'] = round(d3['历史贷款授信距本次申请时间']/30)

d3.loc[(d3['首付时间']>0), '首付时间']=0
d3['首付月数'] = round(d3['首付时间']/30)

# '拟实际应还贷款金额'
d3.eval('拟实际应还贷款金额 = 贷款年金 * 贷款期数', inplace=True)
d3['拟实际应还贷款金额'].where(d3['贷款期数']>0, other=d3['贷款年金'], inplace=True)
d3 = fill_zero(d3, ['贷款期数'])


# '信用额度 / 申请额度'
d3.eval('信用额度比 = 信用额度 / 申请额度', inplace=True)

d3.shape
'''d3 nan值'''
print_na(d3)
# d3['信用额度比'].describe()
# print_val(d3,'信用额度比')

# d3['拟实际应还贷款金额'].describe()
# print_val(d3,'拟实际应还贷款金额')

(744984, 35)

'd3 nan值'

贷款类型 181
陪同申请人 368318
产品组合 181


In [17]:
### d3 类型变量转dummy列 ###

cat_to_dummy = [
    '贷款类型',
    '贷款用途',
    '合同状态',
    '付款方式',
    '拒绝原因',
    '陪同申请人',
    '客户类型', #!
    '商品类别',
    '组合类型',
    '产品类型',
    '获客渠道',
    '销售区域',
    '行业',
    '收益级别', #!
    '产品组合',
    '申请时点',
    '申请周内日'
]

d3 = dummy_catgorical(d3, cat_to_dummy, 27)

# 过滤合同状态为'非拒绝'的样本
d3a = d3[d3['合同状态_2']==0]

# 取合同状态为'拒绝'的样本
d3d = d3[d3['合同状态_2']==1]

'''d3a'''
d3a.shape
'''d3d'''
d3d.shape

initial num of cols: 35
贷款类型has3
贷款用途has25
合同状态has4
付款方式has4
拒绝原因has9
陪同申请人has7
客户类型has4
商品类别has27
组合类型has5
产品类型has3
获客渠道has8
销售区域has1665
 col含有过多值, skipping
行业has11
收益级别has5
产品组合has17
申请时点has12
申请周内日has7


'd3a'

(612366, 170)

'd3d'

(132618, 170)

In [18]:
### d3 处理col含有过多值

if ('销售区域' in d3.columns):
    d3.drop(columns='销售区域',inplace=True)
    d3a.drop(columns='销售区域',inplace=True)
    d3d.drop(columns='销售区域',inplace=True)

In [21]:
### d3 group by ###

# 离散特征
d3_cat_cols = [
    '贷款类型',
    '贷款用途',
    '合同状态',
    '付款方式',
#     '拒绝原因',
    '陪同申请人',
    '客户类型',
    '商品类别',
    '组合类型',
    '产品类型',
    '获客渠道',
#     '销售区域',
    '行业',
    '收益级别',
    '产品组合',
    # 时点型
    '申请周内日',
    '申请时点']

d3_agg_dict = {
    '历史申请编号':['count'],
    # 数值型 - 金额
    '贷款年金':['mean', 'max', 'min', 'sum'],
    '申请额度':['mean', 'max', 'min', 'sum'],
    '信用额度':['mean', 'max', 'min', 'sum'],
    '首付金额':['mean', 'max', 'min'],
    '商品价格':['mean', 'max', 'min'],
    '拟实际应还贷款金额':['mean','max', 'min', 'sum'],
    '信用额度比':['mean', 'max', 'min'],
    # 数值型 - 时长 
    '历史贷款授信距本次申请时间':['mean', 'max', 'min'],
    '历史贷款授信距本次申请月数':['mean', 'max', 'min'],
    '首付时间':['mean', 'max', 'min'],
    '首付月数':['mean', 'max', 'min'],
    # 概率型
    '首付率':['mean', 'max', 'min'],
    '利率1':['mean', 'max', 'min'],
    '利率2':['mean', 'max', 'min']
}
    
# 离散型
cat_agg = ['sum', 'mean']
for col1 in d3.columns:
    for col2 in d3_cat_cols:
        if (col2 in col1):
            d3_agg_dict[col1] = cat_agg

# 非拒合同状态group
d3a = d3a.groupby('申请编号').agg(d3_agg_dict)
# 还原列名
d3a.columns = pd.Series(d3a.columns.tolist()).apply(pd.Series).sum(axis=1)

# 拒绝合同状态group
d3d = d3d.groupby('申请编号').agg(d3_agg_dict)
# 还原列名
d3d.columns = pd.Series(d3d.columns.tolist()).apply(pd.Series).sum(axis=1)

# d3all
d3all_agg_dict = {
    '历史申请编号':['count'],
    '贷款年金':['sum'],
    '申请额度':['sum'],
    '信用额度':['sum']
}

# groupby全部
d3all = d3.groupby('申请编号').agg(d3all_agg_dict)
# 还原列名
d3all.columns = pd.Series(d3all.columns.tolist()).apply(pd.Series).sum(axis=1)

d3a.shape
# d3a.columns.to_list()
d3d.shape
# d3d.columns.to_list()
d3all.shape
# d3all.columns.to_list()

(152768, 331)

(53598, 331)

(153006, 4)

In [31]:
### d3all merge d3a
d3all_d3a = pd.merge(d3all, d3a, how='left', on='申请编号')

# d3a_all自定义特征
d3all_d3a.eval('拟申请核准率_a = 历史申请编号count_y / 历史申请编号count_x', inplace=True)
d3all_d3a.eval('贷款年金sum比_a = 贷款年金sum_y / 贷款年金sum_x', inplace=True)
d3all_d3a.eval('申请额度sum比_a = 申请额度sum_y / 申请额度sum_x', inplace=True)
d3all_d3a.eval('信用额度sum比_a = 信用额度sum_y / 信用额度sum_x', inplace=True)
d3all_d3a.eval('信用比申请额度_a = 信用额度sum_y / 申请额度sum_y', inplace=True)

d3all_d3a.shape
d3all_d3a.columns.to_list()

(153006, 340)

['历史申请编号count_x',
 '贷款年金sum_x',
 '申请额度sum_x',
 '信用额度sum_x',
 '历史申请编号count_y',
 '贷款年金mean',
 '贷款年金max',
 '贷款年金min',
 '贷款年金sum_y',
 '申请额度mean',
 '申请额度max',
 '申请额度min',
 '申请额度sum_y',
 '信用额度mean',
 '信用额度max',
 '信用额度min',
 '信用额度sum_y',
 '首付金额mean',
 '首付金额max',
 '首付金额min',
 '商品价格mean',
 '商品价格max',
 '商品价格min',
 '拟实际应还贷款金额mean',
 '拟实际应还贷款金额max',
 '拟实际应还贷款金额min',
 '拟实际应还贷款金额sum',
 '信用额度比mean',
 '信用额度比max',
 '信用额度比min',
 '历史贷款授信距本次申请时间mean',
 '历史贷款授信距本次申请时间max',
 '历史贷款授信距本次申请时间min',
 '历史贷款授信距本次申请月数mean',
 '历史贷款授信距本次申请月数max',
 '历史贷款授信距本次申请月数min',
 '首付时间mean',
 '首付时间max',
 '首付时间min',
 '首付月数mean',
 '首付月数max',
 '首付月数min',
 '首付率mean',
 '首付率max',
 '首付率min',
 '利率1mean',
 '利率1max',
 '利率1min',
 '利率2mean',
 '利率2max',
 '利率2min',
 '贷款类型_0sum',
 '贷款类型_0mean',
 '贷款类型_1sum',
 '贷款类型_1mean',
 '贷款类型_Asum',
 '贷款类型_Amean',
 '贷款用途_0sum',
 '贷款用途_0mean',
 '贷款用途_1sum',
 '贷款用途_1mean',
 '贷款用途_2sum',
 '贷款用途_2mean',
 '贷款用途_3sum',
 '贷款用途_3mean',
 '贷款用途_4sum',
 '贷款用途_4mean',
 '贷款用途_5sum',
 '贷款用途_5mean',
 '贷款用途_6sum',
 '贷款用途_

In [32]:
drop_list = ['贷款年金sum_y', '申请额度sum_y', '信用额度sum_y', '历史申请编号count_y']
d3all_d3a.drop(columns=drop_list,inplace=True)

In [40]:
### d3all_d3a merge d3d
d3all_a_d = pd.merge(d3all_d3a, d3d, how='left', on='申请编号')

# d3all_a_d
d3all_a_d.eval('拟申请拒绝率_d = 历史申请编号count / 历史申请编号count_x', inplace=True)
d3all_a_d.eval('贷款年金sum比_d = 贷款年金sum / 贷款年金sum_x', inplace=True)
d3all_a_d.eval('申请额度sum比_d = 申请额度sum / 申请额度sum_x', inplace=True)
d3all_a_d.eval('信用额度sum比_d = 信用额度sum / 信用额度sum_x', inplace=True)
d3all_a_d.eval('信用比申请额度_d = 信用额度sum / 申请额度sum', inplace=True)

d3all_a_d.shape
d3all_a_d.columns.to_list()

(153006, 672)

['历史申请编号count_x',
 '贷款年金sum_x',
 '申请额度sum_x',
 '信用额度sum_x',
 '贷款年金mean_x',
 '贷款年金max_x',
 '贷款年金min_x',
 '申请额度mean_x',
 '申请额度max_x',
 '申请额度min_x',
 '信用额度mean_x',
 '信用额度max_x',
 '信用额度min_x',
 '首付金额mean_x',
 '首付金额max_x',
 '首付金额min_x',
 '商品价格mean_x',
 '商品价格max_x',
 '商品价格min_x',
 '拟实际应还贷款金额mean_x',
 '拟实际应还贷款金额max_x',
 '拟实际应还贷款金额min_x',
 '拟实际应还贷款金额sum_x',
 '信用额度比mean_x',
 '信用额度比max_x',
 '信用额度比min_x',
 '历史贷款授信距本次申请时间mean_x',
 '历史贷款授信距本次申请时间max_x',
 '历史贷款授信距本次申请时间min_x',
 '历史贷款授信距本次申请月数mean_x',
 '历史贷款授信距本次申请月数max_x',
 '历史贷款授信距本次申请月数min_x',
 '首付时间mean_x',
 '首付时间max_x',
 '首付时间min_x',
 '首付月数mean_x',
 '首付月数max_x',
 '首付月数min_x',
 '首付率mean_x',
 '首付率max_x',
 '首付率min_x',
 '利率1mean_x',
 '利率1max_x',
 '利率1min_x',
 '利率2mean_x',
 '利率2max_x',
 '利率2min_x',
 '贷款类型_0sum_x',
 '贷款类型_0mean_x',
 '贷款类型_1sum_x',
 '贷款类型_1mean_x',
 '贷款类型_Asum_x',
 '贷款类型_Amean_x',
 '贷款用途_0sum_x',
 '贷款用途_0mean_x',
 '贷款用途_1sum_x',
 '贷款用途_1mean_x',
 '贷款用途_2sum_x',
 '贷款用途_2mean_x',
 '贷款用途_3sum_x',
 '贷款用途_3mean_x',
 '贷款用途_4sum_x',
 '贷款用途_4m

In [42]:

drop_list = ['历史申请编号count_x', '贷款年金sum_x', '申请额度sum_x', '信用额度sum_x',
             '历史申请编号count', '贷款年金sum', '申请额度sum', '信用额度sum']
d3all_a_d.drop(columns=drop_list,inplace=True)

In [45]:
### merge d12 and d3a_all
d12.shape
d3all_a_d.shape
d12_d3all_a_d = pd.merge(d12, d3all_a_d, how='left', on='申请编号')
d12_d3all_a_d.shape
print_na(d12_d3all_a_d)

#TODO: 空值插补0
d12_d3all_a_d = d12_d3all_a_d.fillna(0)

(161511, 120)

(153006, 664)

(161511, 784)

贷款年金mean_x 8743
贷款年金max_x 8743
贷款年金min_x 8743
申请额度mean_x 8743
申请额度max_x 8743
申请额度min_x 8743
信用额度mean_x 8743
信用额度max_x 8743
信用额度min_x 8743
首付金额mean_x 8743
首付金额max_x 8743
首付金额min_x 8743
商品价格mean_x 8743
商品价格max_x 8743
商品价格min_x 8743
拟实际应还贷款金额mean_x 8743
拟实际应还贷款金额max_x 8743
拟实际应还贷款金额min_x 8743
拟实际应还贷款金额sum_x 8743
信用额度比mean_x 8743
信用额度比max_x 8743
信用额度比min_x 8743
历史贷款授信距本次申请时间mean_x 8743
历史贷款授信距本次申请时间max_x 8743
历史贷款授信距本次申请时间min_x 8743
历史贷款授信距本次申请月数mean_x 8743
历史贷款授信距本次申请月数max_x 8743
历史贷款授信距本次申请月数min_x 8743
首付时间mean_x 8743
首付时间max_x 8743
首付时间min_x 8743
首付月数mean_x 8743
首付月数max_x 8743
首付月数min_x 8743
首付率mean_x 8743
首付率max_x 8743
首付率min_x 8743
利率1mean_x 8743
利率1max_x 8743
利率1min_x 8743
利率2mean_x 8743
利率2max_x 8743
利率2min_x 8743
贷款类型_0sum_x 8743
贷款类型_0mean_x 8743
贷款类型_1sum_x 8743
贷款类型_1mean_x 8743
贷款类型_Asum_x 8743
贷款类型_Amean_x 8743
贷款用途_0sum_x 8743
贷款用途_0mean_x 8743
贷款用途_1sum_x 8743
贷款用途_1mean_x 8743
贷款用途_2sum_x 8743
贷款用途_2mean_x 8743
贷款用途_3sum_x 8743
贷款用途_3mean_x 8743
贷款用途_4sum_x 8743
贷款用途_4mean_

In [21]:
# # 检查d4 nan值
# '''d4 nan值'''
# print_na(d4)

# # d4.head()
# d0 = pd.read_csv('./data/train_label.csv')
# # d0.shape
# # d0.head()

# d40 = pd.merge(d4, d0, how='left', on='申请编号')

# d40['分期付款日期'].value_counts()
# pd.DataFrame(d40.loc[d40['标签']==0,'分期付款日期'].value_counts().sort_index())

# pd.DataFrame(d40.loc[d40['分期付款日期']==np.NaN,'标签'].value_counts().sort_index())

# d4.groupby(['申请编号','历史申请编号','有支付行为的分期付款']).agg({'本期贷款金额':'sum'}).reset_index()


In [46]:
### d4 缺失值插补

# for col in d4.columns:
#     d4[col].value_counts()

# 距申请日期>0, 置平均值
# d4_cols = {}
# d4_cols['_dt'] = [
#     '分期付款实际支付时间']
    
nan_to_mean = [
    '分期付款实际支付时间',
    '本期还款金额'
]

#TODO 空值插补0
d4 = d4.fillna(0)

In [47]:
### d4 自定义特征

d4.eval('分期付款支付时间差 = 分期付款实际支付时间 - 分期付款应支付时间', inplace=True)
# d4['分期付款应支付时间'].value_counts()
# d4['分期付款实际支付时间'].value_counts()

In [48]:
### d4 group by phase 1 ###

d41_agg_dict = {
    '本期贷款金额':['sum'],
    '本期还款金额':['mean'],
    '分期付款支付时间差':['mean']
}
d41 = d4.groupby(['申请编号','历史申请编号','有支付行为的分期付款']).agg(d41_agg_dict)
d41.columns = pd.Series(d41.columns.tolist()).apply(pd.Series).sum(axis=1)
d41.shape
d41.head()


(5719219, 3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,本期贷款金额sum,本期还款金额mean,分期付款支付时间差mean
申请编号,历史申请编号,有支付行为的分期付款,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,779169,2,1100.0,1100.0,-14.0
0,779169,3,1100.0,1100.0,-13.0
0,779169,4,1100.0,1100.0,1.0
0,779169,5,1100.0,1100.0,-23.0
0,779169,7,1100.0,1100.0,-1.0


In [49]:
### d4 自定义特征
# d41['分期付款支付时间差mean'].value_counts()
# d41.loc[d4['分期付款实际支付时间']>0, '分期付款实际支付时间'].value_counts()

# '是否逾期'
d41['是否逾期'] = d41['分期付款支付时间差mean'].apply(lambda x: 1 if x > 0 else 0 )
d41['是否逾期'].value_counts()

0    5444576
1     274643
Name: 是否逾期, dtype: int64

In [50]:
### d4 group by phase 2 ###

d42_agg_dict = {
    '本期还款金额mean':['sum'],
    '本期贷款金额sum':['sum'],
    '分期付款支付时间差mean':['mean'],
    '是否逾期':['mean','sum'] # mean-逾期占比；sum-累计逾期
}
d42 = d41.groupby(['申请编号','历史申请编号']).agg(d42_agg_dict)
d42.columns = pd.Series(d42.columns.tolist()).apply(pd.Series).sum(axis=1)
d42.shape
d42.head()


(445770, 5)

Unnamed: 0_level_0,Unnamed: 1_level_0,本期还款金额meansum,本期贷款金额sumsum,分期付款支付时间差meanmean,是否逾期mean,是否逾期sum
申请编号,历史申请编号,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,779169,11538.0,16472.0,-11.0,0.083333,1
1,313225,7428.5,8376.0,-6.25,0.0,0
1,753009,11712.0,11712.0,-3.083333,0.0,0
2,702654,57934.0,65493.0,-2.458333,0.0,0
2,710924,38610.0,38610.0,-3.466667,0.066667,1


In [51]:
### d4 自定义特征

d42.eval('本期还款金额ms比贷款金额ss = 本期还款金额meansum / 本期贷款金额sumsum', inplace=True)
d42.columns.to_list()

['本期还款金额meansum',
 '本期贷款金额sumsum',
 '分期付款支付时间差meanmean',
 '是否逾期mean',
 '是否逾期sum',
 '本期还款金额ms比贷款金额ss']

In [52]:
### d4 group by phase 3 ###

d43 = d42.groupby('申请编号').agg({
    '本期还款金额ms比贷款金额ss':['mean'],
    '本期还款金额meansum':['sum'],
    '本期贷款金额sumsum':['sum'],
    '是否逾期sum':['sum','mean'], # 是否逾期sumsum:历史累计逾期数; 是否逾期summean:历史累计逾期占比
    '是否逾期mean':['mean'], # 逾期meanmean:逾期占比均值
})
d43.columns = pd.Series(d43.columns.tolist()).apply(pd.Series).sum(axis=1)
'''d43'''
d43.shape
d43.head()


'd43'

(153309, 6)

Unnamed: 0_level_0,本期还款金额ms比贷款金额ssmean,本期还款金额meansumsum,本期贷款金额sumsumsum,是否逾期sumsum,是否逾期summean,是否逾期meanmean
申请编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.700461,11538.0,16472.0,1,1.0,0.083333
1,0.94344,19140.5,20088.0,0,0.0,0.0
2,0.971146,221145.0,228704.0,1,0.25,0.016667
3,1.0,58938.0,58938.0,0,0.0,0.0
5,0.92534,51451.5,52856.0,0,0.0,0.0


In [53]:
### d4 group by phase 4 ###

d4_agg_dict = {}
d4_cols = {}
# 金额型
d4_cols['curr'] = [
    '本期贷款金额',
    '本期还款金额']


# 时长型
d4_cols['time_span'] = [
    '分期付款应支付时间',
    '分期付款实际支付时间']

# 计数型
d4_cols['cnt'] = [
    '有支付行为的分期付款']

# 数值型 - 金额
curr_agg = ['mean', 'max', 'min']
for col in d4_cols['curr']:
    d4_agg_dict[col] = curr_agg

# 数值型 - 时长 
time_span_agg = ['mean', 'max', 'min']
for col in d4_cols['time_span']:
    d4_agg_dict[col] = time_span_agg

d4 = d4.groupby('申请编号').agg(d4_agg_dict)
d4.columns = pd.Series(d4.columns.tolist()).apply(pd.Series).sum(axis=1)
'''d4'''
d4.shape

d4 = pd.merge(d4, d43, how='inner', on='申请编号')
'''d4'''
d4.shape
d4.head()

'd4'

(153309, 12)

'd4'

(153309, 18)

Unnamed: 0_level_0,本期贷款金额mean,本期贷款金额max,本期贷款金额min,本期还款金额mean,本期还款金额max,本期还款金额min,分期付款应支付时间mean,分期付款应支付时间max,分期付款应支付时间min,分期付款实际支付时间mean,分期付款实际支付时间max,分期付款实际支付时间min,本期还款金额ms比贷款金额ssmean,本期还款金额meansumsum,本期贷款金额sumsumsum,是否逾期sumsum,是否逾期summean,是否逾期meanmean
申请编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1098.133333,1100.0,1072.0,880.266667,1100.0,12.0,-1600.0,-1480,-1744,-1610.333333,-1490.0,-1758.0,0.700461,11538.0,16472.0,1,1.0,0.083333
1,803.52,976.0,635.0,778.52,976.0,242.0,-267.92,-19,-518,-272.48,-23.0,-527.0,0.94344,19140.5,20088.0,0,0.0,0.0
2,5578.146341,39611.0,2574.0,5455.414634,39611.0,607.0,-331.829268,-4,-641,-334.439024,-28.0,-648.0,0.971146,221145.0,228704.0,1,0.25,0.016667
3,4911.5,4912.0,4906.0,4911.5,4912.0,4906.0,-252.0,-120,-384,-267.0,-134.0,-396.0,1.0,58938.0,58938.0,0,0.0,0.0
5,1957.62963,17350.0,33.0,1923.222222,17350.0,33.0,-1107.925926,-222,-2306,-1111.407407,-222.0,-2306.0,0.92534,51451.5,52856.0,0,0.0,0.0


In [54]:
### Merge d12 and d4 ###
'''d12'''
d12.shape
'''d4'''
d4.shape

d12_d4 = pd.merge(d12['申请编号'], d4, how='left', on='申请编号')

'''d12_d4'''
d12_d4.shape
# print_na(d12_d4)

#TODO 空值插补0
d12_d4 = d12_d4.fillna(0)

'd12'

(161511, 120)

'd4'

(153309, 18)

'd12_d4'

(161511, 19)

In [55]:
### Merge d12_d3a_all and d12_d4 ###

d12_d3all_a_d.shape
d12_d4.shape
d1234 = pd.merge(d12_d3all_a_d, d12_d4, how='left', on='申请编号')
d1234.shape
print_na(d1234)

(161511, 784)

(161511, 19)

(161511, 802)

In [52]:
## Deal with Inf

rate_features = d1234.columns[d1234.columns.str.contains('比')]
rate_features

for f in rate_features:
    d1234.loc[d1234[f]==np.inf, f] = 0

Index(['信用额度比mean', '信用额度比max', '信用额度比min', '贷款年金sum比', '申请额度sum比', '信用额度sum比',
       '信用比申请额度', '本期还款金额ms比贷款金额ssmean'],
      dtype='object')

In [54]:
d1234.shape

(161511, 409)

In [47]:
### BoxCox

def get_boxcox(x):
    inp = x.apply('abs') + 1
    l, opt_lmbda = spstats.boxcox(inp)
    opt_lmbda
    lmbda = 0
    if opt_lmbda > 0:
        lmbda = opt_lmbda
    ret = spstats.boxcox(inp, lmbda=lmbda)
    return lmbda, ret

# Features
large_feats_drop = ['客户收入']
large_feats = d1234.max()[d1234.max()>100].index.drop(large_feats_drop)

lmbda = {}
for f in large_feats:
    lmbda[f], d1234[f] = get_boxcox(d1234[f])

In [None]:
prefix = 'train'
data_train = pd.merge(d01['申请编号'], d1234, on='申请编号', how='left')
data_train.shape
data_train.to_csv(f'./tmp/{prefix}_d1234.csv')

prefix = 'A'
data_test = pd.merge(d05['申请编号'], d1234, on='申请编号', how='left')
data_test.shape
data_test.to_csv(f'./tmp/{prefix}_d1234.csv')

In [49]:
data_train.head()

Unnamed: 0,申请编号,贷款类型,信用额度,贷款年金,商品价格,陪同申请人,出生日期距申请日期天数,工作日期距申请日期天数,注册日期距申请日期天数,身份认证日期距申请日期天数,...,分期付款应支付时间min,分期付款实际支付时间mean,分期付款实际支付时间max,分期付款实际支付时间min,本期还款金额ms比贷款金额ssmean,本期还款金额meansumsum,本期贷款金额sumsumsum,是否逾期sumsum,是否逾期summean,是否逾期meanmean
0,0,0.0,13.039399,9.767899,12.947897,7.0,-10107.0,-342.0,-5421.0,-3292.0,...,-1744.0,-1610.333333,-1490.0,-1758.0,0.543973,39.477802,43.994965,1.0,1.0,0.083333
1,1,,,,,,,,,,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,5,,,,,,,,,,...,,,,,,,,,,


In [42]:
'''All Done'''
data_train.columns.to_list()

'All Done'

['申请编号',
 '贷款类型',
 '信用额度',
 '贷款年金',
 '商品价格',
 '陪同申请人',
 '出生日期距申请日期天数',
 '工作日期距申请日期天数',
 '注册日期距申请日期天数',
 '身份认证日期距申请日期天数',
 '是否提供手机号',
 '是否提供电话',
 '手机号是否有效',
 '是否提供email',
 '申请周内日',
 '申请时点',
 '外部评分',
 '申请人是否额外提供了文件2',
 '申请人是否额外提供了文件3',
 '申请人是否额外提供了文件4',
 '申请人是否额外提供了文件5',
 '申请人是否额外提供了文件6',
 '申请人是否额外提供了文件7',
 '申请人是否额外提供了文件8',
 '申请人是否额外提供了文件9',
 '贷款申请前1小时内征信查询次数',
 '贷款申请前1天内征信查询次数',
 '贷款申请前1周内征信查询次数',
 '贷款申请前1个月内征信查询次数',
 '贷款申请前1个季度内征信查询次数',
 '贷款申请前1年内征信查询次数',
 '性别',
 '是否有车',
 '是否有房',
 '孩子个数',
 '客户收入',
 '收入类型',
 '教育程度',
 '婚姻状态',
 '居住状态',
 '居住地人口密集度',
 '有车时间',
 '职业',
 '家庭成员数',
 '居住信息1',
 '居住信息2',
 '居住信息3',
 '居住信息4',
 '客户居住地评分1',
 '客户居住地评分2',
 '地址是否一致标志1',
 '地址是否一致标志2',
 '地址是否一致标志3',
 '地址是否一致标志4',
 '地址是否一致标志5',
 '地址是否一致标志6',
 '单位类型',
 '社交圈违约信息2_2',
 '社交圈违约信息2_1',
 '社交圈违约信息1_2',
 '社交圈违约信息1_1',
 '最近一次换手机号码距申请日天数',
 '历史申请编号count_x',
 '贷款年金mean',
 '贷款年金max',
 '贷款年金min',
 '申请额度mean',
 '申请额度max',
 '申请额度min',
 '信用额度mean',
 '信用额度max',
 '信用额度min',
 '首付金额mean',
 '首付金额max',
 '首付金额min',
 '商品价格mean',
 '商品