In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# 读取稀疏行为特征数据
**说明**：先处理稀疏的基于时间的行为数据  
- 清理方差为0的特征
- 求取测试集与训练集特征属性的交集（intersection）

In [9]:
# Load the raw time-based behaviour features for train and test, indexed by the
# "个人编码" column, and replace every missing value with 0.
df_feat_behavior_train = (
    pd.read_csv(
        "data/feature_time_based_behavior_train_raw_latest.csv",
        index_col="个人编码",
    )
    .fillna(0)
)
df_feat_behavior_test = (
    pd.read_csv(
        "data/feature_time_based_behavior_test_raw_latest.csv",
        index_col="个人编码",
    )
    .fillna(0)
)

In [10]:
# Report (rows, columns) of both behaviour-feature frames.
# The call form of print with a single argument prints the same text on
# Python 2 and Python 3, whereas the statement form is a SyntaxError on Python 3.
print(df_feat_behavior_train.shape)
print(df_feat_behavior_test.shape)

(20000, 1208)
(4000, 944)


In [11]:
# Predictor names for each dataset: every column except the 'target' label.
label_columns = ['target']
predictors_train = df_feat_behavior_train.columns.difference(label_columns)
predictors_test = df_feat_behavior_test.columns.difference(label_columns)

In [12]:
# Number of predictor columns in train vs. test.
# The call form of print with a single argument prints the same text on
# Python 2 and Python 3, whereas the statement form is a SyntaxError on Python 3.
print(len(predictors_train))
print(len(predictors_test))

1208
944


In [13]:
# Training predictors whose names start with "every_".
is_every_train = predictors_train.str.startswith('every_')
predictors_train[is_every_train]

Index([u'every_ave_period', u'every_max_period', u'every_min_period',
       u'every_period_0', u'every_period_0.1', u'every_period_1',
       u'every_period_1.1', u'every_period_10', u'every_period_10.1',
       u'every_period_100',
       ...
       u'every_period_96.1', u'every_period_97', u'every_period_97.1',
       u'every_period_98', u'every_period_98.1', u'every_period_99',
       u'every_period_99.1', u'every_q1_period', u'every_q2_period',
       u'every_q3_period'],
      dtype='object', length=272)

In [14]:
# Test predictors whose names start with "every_".
is_every_test = predictors_test.str.startswith('every_')
predictors_test[is_every_test]

Index([u'every_ave_period', u'every_max_period', u'every_min_period',
       u'every_period_0', u'every_period_0.1', u'every_period_1',
       u'every_period_1.1', u'every_period_10', u'every_period_10.1',
       u'every_period_102',
       ...
       u'every_period_91.1', u'every_period_92', u'every_period_92.1',
       u'every_period_94', u'every_period_94.1', u'every_period_98',
       u'every_period_98.1', u'every_q1_period', u'every_q2_period',
       u'every_q3_period'],
      dtype='object', length=216)

# 获取共同特征属性

In [15]:
# Predictor names present in the training set but absent from the test set.
predictors_train.difference(predictors_test)

Index([u'every_period_100', u'every_period_100.1', u'every_period_103',
       u'every_period_103.1', u'every_period_104', u'every_period_104.1',
       u'every_period_105', u'every_period_105.1', u'every_period_106',
       u'every_period_106.1',
       ...
       u'freq_trans_hospital_846', u'freq_trans_hospital_87',
       u'freq_trans_hospital_88', u'freq_trans_hospital_897',
       u'freq_trans_hospital_898', u'freq_trans_hospital_905',
       u'freq_trans_hospital_939', u'freq_trans_hospital_94',
       u'freq_trans_hospital_96', u'freq_trans_hospital_99'],
      dtype='object', length=288)

In [16]:
# Predictor names present in the test set but absent from the training set.
predictors_test.difference(predictors_train)

Index([u'every_period_109', u'every_period_109.1', u'every_period_118',
       u'every_period_118.1', u'every_period_123', u'every_period_123.1',
       u'every_period_131', u'every_period_131.1', u'every_period_135',
       u'every_period_135.1', u'every_period_147', u'every_period_147.1',
       u'every_period_153', u'every_period_153.1', u'freq_hospital_1304',
       u'freq_hospital_1437', u'freq_hospital_485', u'freq_hospital_821',
       u'freq_hospital_91', u'freq_trans_hospital_1304',
       u'freq_trans_hospital_1437', u'freq_trans_hospital_485',
       u'freq_trans_hospital_821', u'freq_trans_hospital_91'],
      dtype='object')

In [17]:
# Keep only the predictor names that occur in both the training and the test set.
common_predictors = predictors_train.intersection(predictors_test)

In [18]:
# Restrict both behaviour frames to the shared predictor columns so that train and
# test end up with the same feature set.
df_feat_behavior_train = df_feat_behavior_train.loc[:, common_predictors]
df_feat_behavior_test = df_feat_behavior_test.loc[:, common_predictors]

# 检视

In [26]:
# Preview the first rows of the test behaviour features after column alignment.
df_feat_behavior_test.head()

Unnamed: 0_level_0,day_to_nhospitals,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,every_ave_period,every_max_period,...,total_rate,total_trans_rate,trans_stat_max,trans_stat_mean,trans_stat_median,trans_stat_min,trans_stat_q1,trans_stat_q3,trans_stat_std,trans_stat_sum
个人编码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
352120000001598,1,3.0,5.0,5.0,4.0,3.0,2.0,1.0,6.73913,16.0,...,6.458333,0.974843,13,6.913043,7.0,2,5.0,9.0,2.937424,159
352120000004806,0,11.0,9.0,7.0,9.0,5.0,1.0,0.0,4.317073,18.0,...,4.214286,0.7375,20,5.714286,4.0,2,4.0,5.0,4.163053,240
352120000005123,0,6.0,0.0,1.0,1.0,14.0,1.0,0.0,8.136364,14.0,...,7.782609,0.824885,22,9.434783,7.0,2,6.0,9.0,6.243732,217
352120000006659,0,4.0,5.0,2.0,2.0,1.0,0.0,0.0,13.230769,41.0,...,12.285714,0.741379,25,16.571429,20.0,3,7.0,23.75,9.095344,232
352120000010106,0,6.0,0.0,3.0,3.0,4.0,2.0,4.0,8.047619,23.0,...,7.681818,1.083333,20,7.090909,6.0,3,5.0,7.75,3.624114,156


In [23]:
# Columns of the aligned test frame whose names start with "every_period".
test_columns = df_feat_behavior_test.columns
test_columns[test_columns.str.startswith('every_period')]

Index([u'every_period_0', u'every_period_0.1', u'every_period_1',
       u'every_period_1.1', u'every_period_10', u'every_period_10.1',
       u'every_period_102', u'every_period_102.1', u'every_period_107',
       u'every_period_107.1',
       ...
       u'every_period_9', u'every_period_9.1', u'every_period_91',
       u'every_period_91.1', u'every_period_92', u'every_period_92.1',
       u'every_period_94', u'every_period_94.1', u'every_period_98',
       u'every_period_98.1'],
      dtype='object', length=196)

In [25]:
# Preview the first rows of the training behaviour features after column alignment.
df_feat_behavior_train.head()

Unnamed: 0_level_0,day_to_nhospitals,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,every_ave_period,every_max_period,...,total_rate,total_trans_rate,trans_stat_max,trans_stat_mean,trans_stat_median,trans_stat_min,trans_stat_q1,trans_stat_q3,trans_stat_std,trans_stat_sum
个人编码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
352120000000231,0,1.0,14.0,4.0,4.0,6.0,2.0,3.0,5.484848,14.0,...,5.323529,0.891626,12,5.970588,6.0,3,5.0,6.0,1.660327,203
352120000000386,0,1.0,11.0,3.0,0.0,0.0,0.0,0.0,12.071429,15.0,...,11.266667,1.3,14,8.666667,8.0,3,7.5,10.0,2.894987,130
352120000000408,8,17.0,13.0,4.0,2.0,1.0,0.0,0.0,4.0,18.0,...,3.911111,0.77193,14,6.162162,5.0,1,4.0,8.0,3.304152,228
352120000000409,1,2.0,2.0,1.0,2.0,14.0,1.0,0.0,8.272727,15.0,...,7.913043,1.700935,9,4.863636,5.5,2,3.0,6.0,1.753784,107
352120000000511,0,1.0,6.0,7.0,6.0,4.0,1.0,1.0,7.04,19.0,...,6.769231,1.128205,8,6.0,5.5,3,5.0,8.0,1.624808,156


In [22]:
# Columns of the aligned training frame whose names start with "every_period".
train_columns = df_feat_behavior_train.columns
train_columns[train_columns.str.startswith('every_period')]

Index([u'every_period_0', u'every_period_0.1', u'every_period_1',
       u'every_period_1.1', u'every_period_10', u'every_period_10.1',
       u'every_period_102', u'every_period_102.1', u'every_period_107',
       u'every_period_107.1',
       ...
       u'every_period_9', u'every_period_9.1', u'every_period_91',
       u'every_period_91.1', u'every_period_92', u'every_period_92.1',
       u'every_period_94', u'every_period_94.1', u'every_period_98',
       u'every_period_98.1'],
      dtype='object', length=196)

# 读取item特征进行合并

In [27]:
# Load the per-person item statistics for train and test, indexed by "个人编码".
df_feat_item_train = pd.read_csv(
    'data/feature_items_stats_train.csv',
    index_col='个人编码',
)
df_feat_item_test = pd.read_csv(
    'data/feature_items_stats_test.csv',
    index_col='个人编码',
)

In [28]:
# The training id file has no header row: name positional column 0 'Id' and
# positional column 1 'target'.
df_id_train = (
    pd.read_csv('data/df_id_train.csv', header=None)
    .rename(columns={0: 'Id', 1: 'target'})
)

In [22]:
# Preview the first 10 (Id, target) rows rather than rendering the whole frame.
df_id_train.head(10)

Unnamed: 0,Id,target
0,352120001523108,1
1,352120001475556,0
2,352120003484886,0
3,352120002750505,0
4,352120001556755,0
5,352120001889611,0
6,352120002491088,0
7,352120010179934,1
8,352120000748407,0
9,352120002574575,1


In [29]:
# Place the item statistics and the behaviour features side by side (column-wise
# concatenation, rows aligned on the index).
train_feature_blocks = [df_feat_item_train, df_feat_behavior_train]
df_feat_train = pd.concat(train_feature_blocks, axis=1)

In [30]:
# Attach the 'target' column to the training features by joining on the person id.
train_targets_by_id = df_id_train.set_index('Id')
df_data_train = df_feat_train.join(train_targets_by_id)

测试数据

In [31]:
# The test id file has no header row: name positional column 0 'Id', then add a
# constant placeholder 'target' of 0.
# Only column 0 is renamed, and the rename happens before 'target' is added. The
# previous mapping also contained `1: 'target'`, applied after 'target' already
# existed: it renamed nothing for a single-column file and would have produced two
# 'target' columns if the file ever contained a second column.
df_id_test = pd.read_csv('data/df_id_test.csv', header=None).rename(columns={0: 'Id'})
df_id_test['target'] = 0

In [30]:
# Preview the first 10 (Id, target) rows rather than rendering the whole frame.
df_id_test.head(10)

Unnamed: 0,Id,target
0,352120001207726,0
1,352120000183777,0
2,352120002857717,0
3,352120001492069,0
4,352120001574938,0
5,352120001114553,0
6,352121000142166,0
7,352120000447733,0
8,352120000515129,0
9,352121000391830,0


In [32]:
# Place the test item statistics and behaviour features side by side (column-wise
# concatenation, rows aligned on the index).
test_feature_blocks = [df_feat_item_test, df_feat_behavior_test]
df_feat_test = pd.concat(test_feature_blocks, axis=1)

In [33]:
# Attach the placeholder 'target' column to the test features by joining on the person id.
test_targets_by_id = df_id_test.set_index('Id')
df_data_test = df_feat_test.join(test_targets_by_id)

# 构造部分新特征

In [34]:
# All feature names of the merged training frame, i.e. every column except 'target'.
train_data_columns = df_data_train.columns
predictors = train_data_columns.difference(['target'])

In [35]:
# Shared filter: predictor names that contain "金额" and end with "_sum".
name_has_amount = predictors.str.find("金额") != -1
name_is_sum = predictors.str.endswith("_sum")
amount_sum_mask = name_has_amount & name_is_sum

# Split those columns by the keyword found in the name.
feat_fee_fasheng = predictors[amount_sum_mask & (predictors.str.find("发生") != -1)]
feat_fee_shenbao = predictors[amount_sum_mask & (predictors.str.find("申报") != -1)]
feat_fee_zifei = predictors[amount_sum_mask & (predictors.str.find("自费") != -1)]
feat_fee_buzhu = predictors[amount_sum_mask & (predictors.str.find("补助") != -1)]
feat_fee_zhifu = predictors[amount_sum_mask & (predictors.str.find("支付") != -1)]

In [36]:
# Training data: row-wise total and mean over each group of fee columns.
# Each entry is (sum column, mean column, source columns); the order of the list is
# the order in which the new columns are appended to the frame.
train_fee_specs = [
    ('fee_fasheng_sum', 'fee_fasheng_mean', feat_fee_fasheng),  # incurred ("发生")
    ('fee_shenbao_sum', 'fee_shenbao_mean', feat_fee_shenbao),  # declared ("申报")
    ('fee_zifei_sum', 'fee_zifei_mean', feat_fee_zifei),        # self-paid ("自费")
    ('fee_buzhu_sum', 'fee_buzhu_mean', feat_fee_buzhu),        # subsidy ("补助")
    ('fee_zhifu_sum', 'fee_zhifu_mean', feat_fee_zhifu),        # paid ("支付")
]
for sum_name, mean_name, source_cols in train_fee_specs:
    df_data_train[sum_name] = df_data_train[source_cols].sum(axis=1)
    df_data_train[mean_name] = df_data_train[source_cols].mean(axis=1)

# Incurred total minus declared total.
df_data_train['fee_fasheng_shenbao'] = df_data_train['fee_fasheng_sum'].sub(
    df_data_train['fee_shenbao_sum']
)

In [37]:
# Test data: row-wise total and mean over each group of fee columns.
# Each entry is (sum column, mean column, source columns); the order of the list is
# the order in which the new columns are appended to the frame.
test_fee_specs = [
    ('fee_fasheng_sum', 'fee_fasheng_mean', feat_fee_fasheng),  # incurred ("发生")
    ('fee_shenbao_sum', 'fee_shenbao_mean', feat_fee_shenbao),  # declared ("申报")
    ('fee_zifei_sum', 'fee_zifei_mean', feat_fee_zifei),        # self-paid ("自费")
    ('fee_buzhu_sum', 'fee_buzhu_mean', feat_fee_buzhu),        # subsidy ("补助")
    ('fee_zhifu_sum', 'fee_zhifu_mean', feat_fee_zhifu),        # paid ("支付")
]
for sum_name, mean_name, source_cols in test_fee_specs:
    df_data_test[sum_name] = df_data_test[source_cols].sum(axis=1)
    df_data_test[mean_name] = df_data_test[source_cols].mean(axis=1)

# Incurred total minus declared total.
df_data_test['fee_fasheng_shenbao'] = df_data_test['fee_fasheng_sum'].sub(
    df_data_test['fee_shenbao_sum']
)

# 总特征清理

In [38]:
# Refresh the predictor list so it also covers the newly derived fee columns;
# 'target' stays excluded.
train_data_columns = df_data_train.columns
predictors = train_data_columns.difference(['target'])

In [39]:
# Standard deviation of every predictor column, computed on the training data.
train_predictor_frame = df_data_train.loc[:, predictors]
predictors_std = train_predictor_frame.std()

In [40]:
# Predictors whose training standard deviation is exactly 0 (to be removed).
has_zero_std = predictors_std.eq(0)
zero_std_predictors = predictors_std.loc[has_zero_std].index

In [41]:
# Show which predictors were flagged as having zero standard deviation.
zero_std_predictors

Index([u'every_period_365', u'every_period_365.1', u'其它申报金额_q1', u'其它申报金额_q2',
       u'其它申报金额_q3', u'手术费自费金额_q1', u'手术费自费金额_q2', u'手术费自费金额_q3',
       u'最高限额以上金额_q1', u'最高限额以上金额_q2', u'最高限额以上金额_q3', u'药品费拒付金额_max',
       u'药品费拒付金额_mean', u'药品费拒付金额_q1', u'药品费拒付金额_q2', u'药品费拒付金额_q3',
       u'药品费拒付金额_std', u'药品费拒付金额_sum', u'起付线标准金额_q1', u'起付线标准金额_q2',
       u'起付线标准金额_q3', u'输全血按比例自负金额_max', u'输全血按比例自负金额_mean', u'输全血按比例自负金额_q1',
       u'输全血按比例自负金额_q2', u'输全血按比例自负金额_q3', u'输全血按比例自负金额_std',
       u'输全血按比例自负金额_sum', u'高价材料发生金额_q1', u'高价材料发生金额_q2'],
      dtype='object')

In [42]:
# Drop the zero-std predictors from each frame and write the result to CSV.
# The training frame is processed first, then the test frame.
clean_outputs = [
    (df_data_train, 'df_data_train_clean_sparse_latest.csv'),
    (df_data_test, 'df_data_test_clean_sparse_latest.csv'),
]
for frame, out_path in clean_outputs:
    frame.drop(zero_std_predictors, axis=1).to_csv(out_path)

In [40]:
# Preview the first 10 rows of the cleaned test frame rather than rendering the
# whole frame.
df_data_test.drop(zero_std_predictors, axis=1).head(10)

Unnamed: 0_level_0,药品费发生金额_sum,贵重药品发生金额_sum,中成药费发生金额_sum,中草药费发生金额_sum,药品费自费金额_sum,药品费申报金额_sum,检查费发生金额_sum,贵重检查费金额_sum,检查费自费金额_sum,检查费申报金额_sum,...,fee_fasheng_mean,fee_shenbao_sum,fee_shenbao_mean,fee_zifei_sum,fee_zifei_mean,fee_buzhu_sum,fee_buzhu_mean,fee_zhifu_sum,fee_zhifu_mean,fee_fasheng_shenbao
个人编码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
352120000001598,47509.56,0.00,9229.62,5557.44,967.99,46378.20,1470.0,0.0,0.0,1470.0,...,6019.447273,50295.50,6286.93750,967.99,193.598,0.00,0.000000,51426.86,10285.372,15918.42
352120000004806,73139.96,0.00,3853.14,56048.02,222.92,72857.30,0.0,0.0,0.0,0.0,...,12922.283636,81961.30,10245.16250,222.92,44.584,0.00,0.000000,82243.96,16448.792,60183.82
352120000005123,76780.33,20004.24,1014.68,31853.70,0.00,76780.33,400.0,0.0,0.0,400.0,...,11907.268182,78067.33,9758.41625,0.00,0.000,0.00,0.000000,78107.33,15621.466,52912.62
352120000006659,158424.90,6365.16,6365.16,152059.74,0.00,158424.90,0.0,0.0,0.0,0.0,...,29397.450909,158581.90,19822.73750,0.00,0.000,0.00,0.000000,158581.90,31716.380,164790.06
352120000010106,45043.14,0.00,13450.42,4629.78,201.76,44763.98,1116.0,0.0,0.0,1116.0,...,6102.283636,48765.76,6095.72000,201.76,40.352,0.00,0.000000,49044.92,9808.984,18359.36
352120000011024,57664.42,0.00,20975.20,0.00,118.00,56801.10,670.0,0.0,0.0,670.0,...,7496.229091,60616.00,7577.00000,118.00,23.600,0.00,0.000000,61483.32,12296.664,21842.52
352120000012921,295173.54,83222.37,0.00,0.00,0.00,289161.09,12540.0,0.0,350.0,12190.0,...,65172.373636,620963.29,77620.41125,350.00,70.000,0.00,0.000000,633673.74,126734.748,95932.82
352120000014708,104321.44,6549.18,3935.31,75329.38,8.00,104255.85,6669.0,0.0,0.0,6669.0,...,17923.437273,111278.35,13909.79375,8.00,1.600,0.00,0.000000,111343.94,22268.788,85879.46
352120000015145,152473.71,0.00,3847.00,136080.00,1028.74,151275.46,0.0,0.0,0.0,0.0,...,34860.164545,242290.96,30286.37000,1028.74,205.748,0.00,0.000000,243534.81,48706.962,141170.85
352120000015378,21257.54,3753.84,2065.40,0.00,15.20,21242.34,1563.0,0.0,0.0,1563.0,...,2736.970909,24272.24,3034.03000,15.20,3.040,0.00,0.000000,24287.44,4857.488,5834.44


# 新特征：总费用和平均费用

In [22]:
# "*_sum" amount columns ("金额") whose name contains "发生" (incurred).
# NOTE(review): this repeats a selector from an earlier cell and its execution
# count is out of order - confirm whether this section is still needed.
name_has_amount = predictors.str.find("金额") != -1
name_is_sum = predictors.str.endswith("_sum")
name_has_fasheng = predictors.str.find("发生") != -1
feat_fee_fasheng = predictors[name_has_amount & name_is_sum & name_has_fasheng]

In [23]:
# "*_sum" amount columns ("金额") whose name contains "申报" (declared).
name_has_amount = predictors.str.find("金额") != -1
name_is_sum = predictors.str.endswith("_sum")
name_has_shenbao = predictors.str.find("申报") != -1
feat_fee_shenbao = predictors[name_has_amount & name_is_sum & name_has_shenbao]

In [24]:
# "*_sum" amount columns ("金额") whose name contains "自费" (self-paid).
name_has_amount = predictors.str.find("金额") != -1
name_is_sum = predictors.str.endswith("_sum")
name_has_zifei = predictors.str.find("自费") != -1
feat_fee_zifei = predictors[name_has_amount & name_is_sum & name_has_zifei]

In [25]:
# "*_sum" amount columns ("金额") whose name contains "补助" (subsidy).
name_has_amount = predictors.str.find("金额") != -1
name_is_sum = predictors.str.endswith("_sum")
name_has_buzhu = predictors.str.find("补助") != -1
feat_fee_buzhu = predictors[name_has_amount & name_is_sum & name_has_buzhu]

In [31]:
# "*_sum" amount columns ("金额") whose name contains "支付" (paid).
name_has_amount = predictors.str.find("金额") != -1
name_is_sum = predictors.str.endswith("_sum")
name_has_zhifu = predictors.str.find("支付") != -1
feat_fee_zhifu = predictors[name_has_amount & name_is_sum & name_has_zhifu]

In [17]:
# Every predictor whose name contains "金额" and ends with "_sum".
name_has_amount = predictors.str.find("金额") != -1
name_is_sum = predictors.str.endswith("_sum")
predictors[name_has_amount & name_is_sum]

Index([u'一次性医用材料申报金额_sum', u'中成药费发生金额_sum', u'中草药费发生金额_sum',
       u'公务员医疗补助基金支付金额_sum', u'其它发生金额_sum', u'其它申报金额_sum', u'医用材料发生金额_sum',
       u'医用材料费自费金额_sum', u'医疗救助个人按比例负担金额_sum', u'可用账户报销金额_sum',
       u'城乡优抚补助金额_sum', u'城乡救助补助金额_sum', u'基本医疗保险个人账户支付金额_sum',
       u'基本医疗保险统筹基金支付金额_sum', u'床位费发生金额_sum', u'床位费申报金额_sum', u'成分输血申报金额_sum',
       u'手术费发生金额_sum', u'手术费申报金额_sum', u'手术费自费金额_sum', u'最高限额以上金额_sum',
       u'本次审批金额_sum', u'检查费发生金额_sum', u'检查费申报金额_sum', u'检查费自费金额_sum',
       u'残疾军人医疗补助基金支付金额_sum', u'民政救助补助金额_sum', u'治疗费发生金额_sum', u'治疗费申报金额_sum',
       u'治疗费自费金额_sum', u'药品费发生金额_sum', u'药品费拒付金额_sum', u'药品费申报金额_sum',
       u'药品费自费金额_sum', u'补助审批金额_sum', u'贵重检查费金额_sum', u'贵重药品发生金额_sum',
       u'起付标准以上自负比例金额_sum', u'起付线标准金额_sum', u'输全血按比例自负金额_sum', u'非账户支付金额_sum',
       u'高价材料发生金额_sum'],
      dtype='object')

In [42]:
# Training data: row-wise total and mean over each group of fee columns.
# NOTE(review): this repeats the training aggregation from an earlier cell of this
# notebook - confirm whether this section is still needed.
# Each entry is (sum column, mean column, source columns); the order of the list is
# the order in which the columns are written.
train_fee_specs = [
    ('fee_fasheng_sum', 'fee_fasheng_mean', feat_fee_fasheng),  # incurred ("发生")
    ('fee_shenbao_sum', 'fee_shenbao_mean', feat_fee_shenbao),  # declared ("申报")
    ('fee_zifei_sum', 'fee_zifei_mean', feat_fee_zifei),        # self-paid ("自费")
    ('fee_buzhu_sum', 'fee_buzhu_mean', feat_fee_buzhu),        # subsidy ("补助")
    ('fee_zhifu_sum', 'fee_zhifu_mean', feat_fee_zhifu),        # paid ("支付")
]
for sum_name, mean_name, source_cols in train_fee_specs:
    df_data_train[sum_name] = df_data_train[source_cols].sum(axis=1)
    df_data_train[mean_name] = df_data_train[source_cols].mean(axis=1)

# Incurred total minus declared total.
df_data_train['fee_fasheng_shenbao'] = df_data_train['fee_fasheng_sum'].sub(
    df_data_train['fee_shenbao_sum']
)

In [33]:
# Preview the first rows of the training frame, including the derived fee columns.
df_data_train.head()

Unnamed: 0_level_0,药品费发生金额_sum,贵重药品发生金额_sum,中成药费发生金额_sum,中草药费发生金额_sum,药品费自费金额_sum,药品费拒付金额_sum,药品费申报金额_sum,检查费发生金额_sum,贵重检查费金额_sum,检查费自费金额_sum,...,fee_fasheng_sum,fee_fasheng_mean,fee_shenbao_sum,fee_shenbao_mean,fee_zifei_sum,fee_zifei_mean,fee_buzhu_sum,fee_buzhu_mean,fee_zhifu_sum,fee_zhifu_mean
个人编码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
352120000000231,70372.25,9724.54,19721.85,0.0,226.33,0.0,69334.81,7010.0,0.0,0.0,...,107309.14,9755.376364,76757.31,9594.66375,226.33,45.266,0.0,0.0,77862.75,15572.55
352120000000386,33943.06,5617.3,2105.74,0.0,71.73,0.0,33086.74,570.0,0.0,0.0,...,42395.3,3854.118182,33815.94,4226.9925,71.73,14.346,0.0,0.0,34672.26,6934.452
352120000000408,50264.76,0.0,18577.27,0.0,11.08,0.0,49456.12,0.0,0.0,0.0,...,77549.36,7049.941818,58141.05,7267.63125,11.08,2.216,0.0,0.0,58972.09,11794.418
352120000000409,17915.9,0.0,15227.55,0.0,1.16,0.0,17914.74,0.0,0.0,0.0,...,40634.95,3694.086364,25406.24,3175.78,1.16,0.232,0.0,0.0,25407.4,5081.48
352120000000511,41459.28,6706.12,10400.12,0.0,13.41,0.0,41165.99,0.0,0.0,0.0,...,59045.52,5367.774545,41645.99,5205.74875,13.41,2.682,0.0,0.0,41939.28,8387.856


In [43]:
# Test data: row-wise total and mean over each group of fee columns.
# NOTE(review): this repeats the test aggregation from an earlier cell of this
# notebook - confirm whether this section is still needed.
# Each entry is (sum column, mean column, source columns); the order of the list is
# the order in which the columns are written.
test_fee_specs = [
    ('fee_fasheng_sum', 'fee_fasheng_mean', feat_fee_fasheng),  # incurred ("发生")
    ('fee_shenbao_sum', 'fee_shenbao_mean', feat_fee_shenbao),  # declared ("申报")
    ('fee_zifei_sum', 'fee_zifei_mean', feat_fee_zifei),        # self-paid ("自费")
    ('fee_buzhu_sum', 'fee_buzhu_mean', feat_fee_buzhu),        # subsidy ("补助")
    ('fee_zhifu_sum', 'fee_zhifu_mean', feat_fee_zhifu),        # paid ("支付")
]
for sum_name, mean_name, source_cols in test_fee_specs:
    df_data_test[sum_name] = df_data_test[source_cols].sum(axis=1)
    df_data_test[mean_name] = df_data_test[source_cols].mean(axis=1)

# Incurred total minus declared total.
df_data_test['fee_fasheng_shenbao'] = df_data_test['fee_fasheng_sum'].sub(
    df_data_test['fee_shenbao_sum']
)

# 总特征数据清理

In [44]:
# All feature names of the training frame, i.e. every column except 'target'.
train_data_columns = df_data_train.columns
predictors = train_data_columns.difference(['target'])

In [45]:
# Standard deviation of every predictor column, computed on the training data.
train_predictor_frame = df_data_train.loc[:, predictors]
predictors_std = train_predictor_frame.std()

In [46]:
# Predictors whose training standard deviation is exactly 0 (to be removed).
has_zero_std = predictors_std.eq(0)
zero_std_predictors = predictors_std.loc[has_zero_std].index

In [47]:
# Show which predictors were flagged as having zero standard deviation.
zero_std_predictors

Index([u'其它申报金额_q1', u'其它申报金额_q2', u'其它申报金额_q3', u'手术费自费金额_q1', u'手术费自费金额_q2',
       u'手术费自费金额_q3', u'最高限额以上金额_q1', u'最高限额以上金额_q2', u'最高限额以上金额_q3',
       u'药品费拒付金额_max', u'药品费拒付金额_mean', u'药品费拒付金额_q1', u'药品费拒付金额_q2',
       u'药品费拒付金额_q3', u'药品费拒付金额_std', u'药品费拒付金额_sum', u'起付线标准金额_q1',
       u'起付线标准金额_q2', u'起付线标准金额_q3', u'输全血按比例自负金额_max', u'输全血按比例自负金额_mean',
       u'输全血按比例自负金额_q1', u'输全血按比例自负金额_q2', u'输全血按比例自负金额_q3', u'输全血按比例自负金额_std',
       u'输全血按比例自负金额_sum', u'高价材料发生金额_q1', u'高价材料发生金额_q2'],
      dtype='object')