In [1]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore')
import logging
import datetime

import csv, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from dayu.hooks.oss_hook import OSSHook
from dayu.hooks.hive_server_hook import HiveServerHook
from dayu.hooks.hive_cli_hook import HiveCliHook

def split_table_name(datain):
    """Strip the table prefix from dotted column names of ``datain``.

    Every column named like ``table.column`` is renamed to its last
    dot-separated component. If any column has no dot at all, the frame is
    returned untouched (no column is renamed), which is what callers of the
    original implementation already observe.

    Args:
        datain: pandas DataFrame whose column labels are strings.

    Returns:
        The renamed DataFrame, or ``datain`` itself when a column without a
        dot is encountered.
    """
    renames = {}
    for column in datain.columns:
        parts = column.rsplit('.', 1)
        if len(parts) < 2:
            # A header without a prefix: leave the whole frame as it is.
            return datain
        # rsplit keeps working for names with several dots (e.g. "db.tbl.col"),
        # where unpacking a plain split('.') into two names raised ValueError.
        renames[column] = parts[1]
    return datain.rename(columns=renames)

def read_from_hive2(output_file_name,insql,dtype):
    """Run ``insql`` on Hive, dump the result to a CSV file and load it back.

    Args:
        output_file_name: file name appended to the module-level ``curr_dir``.
        insql: query text handed to ``HiveServerHook.to_csv``.
        dtype: passed through to ``pandas.read_csv``.

    Returns:
        The DataFrame read from the CSV that was just written. Column names
        are returned exactly as they appear in the CSV header; no table
        prefix is stripped here.
    """
    csv_path = curr_dir + output_file_name
    HiveServerHook("warehouse_hive").to_csv(
        insql,
        csv_path,
        delimiter=',',
        lineterminator='\n',
        output_header=True,
    )
    return pd.read_csv(csv_path, header=0, dtype=dtype)

## Compute the difference between two dates
def date_time_sub(startTime,endTime,date_format):
    """Return the number of whole days from ``startTime`` to ``endTime``.

    Args:
        startTime: start date as a string in ``date_format``.
        endTime: end date as a string in ``date_format``.
        date_format: ``strptime`` format shared by both arguments.

    Returns:
        ``(endTime - startTime).days`` as an int (negative when the end is
        before the start), or ``None`` when either value cannot be parsed.
    """
    try:
        start = datetime.datetime.strptime(startTime, date_format)
        end = datetime.datetime.strptime(endTime, date_format)
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: not a string
        # (e.g. None/NaN). Anything else is a real error and must surface.
        return None
    return (end - start).days

    # 计算车辆当前保值率
def computer_with_license_month(tar):
    """Interpolate the value-retention rate for a car of a given age.

    Args:
        tar: mapping with ``license_month`` (car age in months) and the
            yearly rates ``year_1`` ... ``year_16``.

    Returns:
        ``tar['year_1']`` for ages up to 12 months; otherwise the rate of the
        completed year reduced linearly, month by month, towards the next
        year's rate and rounded to 4 decimals. When the age is outside the
        table or unusable (missing key, NaN, non-numeric), ``tar['year_16']``
        is returned.
    """
    try:
        license_month = tar['license_month']
        if license_month <= 12:
            return tar['year_1']

        year = license_month // 12
        # Rate at the start of the current year of age ...
        keep_max = tar["year_" + str(int(year))]
        # ... and at the start of the following one.
        keep_min = tar["year_" + str(int(year + 1))]

        # Months elapsed since the last full year, and the per-month drop.
        mon = license_month - 12 * year
        monthly_drop = (keep_max - keep_min) / 12
        return round(keep_max - monthly_drop * mon, 4)
    except (KeyError, ValueError, TypeError, OverflowError):
        # KeyError: year beyond the table; ValueError/OverflowError: NaN or
        # infinite age; TypeError: non-numeric values. Fall back to the last rate.
        return tar["year_16"]

class Logger:
    """Thin wrapper that writes INFO messages of a named logger to a file."""

    def __init__(self, logName, logFile):
        """Bind to the logger ``logName`` and make sure it writes to ``logFile``.

        ``logging.getLogger`` returns the same object for the same name, so
        constructing this class repeatedly (e.g. re-running a notebook cell)
        must not attach a second handler for the same file, otherwise every
        message is written several times.
        """
        import os  # local import: only needed to normalise the path below

        self._logger = logging.getLogger(logName)
        target = os.path.abspath(logFile)
        already_attached = any(
            isinstance(existing, logging.FileHandler) and existing.baseFilename == target
            for existing in self._logger.handlers
        )
        if not already_attached:
            handler = logging.FileHandler(logFile)
            handler.setFormatter(logging.Formatter('%(asctime)s ********* %(message)s'))
            self._logger.addHandler(handler)
        self._logger.setLevel(logging.INFO)

    def log(self, msg):
        """Write ``msg`` at INFO level."""
        if self._logger is not None:
            self._logger.info(msg)


# Notebook-wide settings: display width, working directory, run date,
# Hive command-line client and the run log.
pd.set_option('display.max_columns', 500)
curr_dir = '/home/souche/qiongjiu/hgc/'

# Today's date as 'YYYY-MM-DD'; it is spliced into the Hive queries of later cells.
curr_date = datetime.datetime.now().strftime('%Y-%m-%d')
hive_cli = HiveCliHook("warehouse_hive")

logger = Logger('model_service','./log/accurate_valuation_cyp_run_log.log')
logger.log("程序启动.............")


DAYU_HOME : /home/souche/projects/datacenter-etl-v2
[2021-02-19 15:01:02,164] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt
[2021-02-19 15:01:02,192] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt
[2021-02-19 15:01:02,953] {<ipython-input-1-5aad934e0497>:84} INFO - 程序启动.............


In [2]:
# =========================================
###            训练模型
### =========================================

In [3]:
# Load the three pre-exported order tables from the working directory.
(dl_site_ts_order_clean_dfc_sales,
 dl_site_ts_order_clean_dfc_purchase,
 dl_site_ts_order_clean_quan) = (
    pd.read_csv(curr_dir + csv_name)
    for csv_name in (
        'dl_site_ts_order_clean_dfc_sales_0.csv',
        'dl_site_ts_order_clean_dfc_purchase_0.csv',
        'dl_site_ts_order_clean_yh01.csv',
    )
)


In [4]:
# Row/column counts of the three loaded tables, printed on one line.
print(*(frame.shape for frame in (dl_site_ts_order_clean_dfc_sales,
                                  dl_site_ts_order_clean_dfc_purchase,
                                  dl_site_ts_order_clean_quan)))


(689715, 29) (921654, 32) (7996680, 30)


# 增加车易拍数据测试-----

In [5]:
# Query the model -> series/brand lookup for the partition of the day before curr_date.
sql_info = (
    "\n"
    "SELECT model_code,series_code,brand_code,series_name,brand_name  \n"
    "FROM db_data.ods_car_model_model  \n"
    "WHERE ds = date_sub('" + curr_date + "',1)\n"
)
dtype = {'city_code': str}

# NOTE(review): the output file name ends with a space -- looks accidental; confirm before changing.
model_info = read_from_hive2('model_info ', sql_info, dtype)

# Headers shaped like "table.column" are reduced to their second dot-separated part.
db_columns = [col.split('.')[1] if len(col.split('.')) > 1 else col
              for col in model_info.columns]
model_info.columns = db_columns
logger.log("读取数据完成.............")


[2021-02-19 15:02:24,655] {hiveserver2:138} INFO - Using database default as default
[2021-02-19 15:02:24,773] {hive_server_hook:112} INFO - Running query: 
SELECT model_code,series_code,brand_code,series_name,brand_name  
FROM db_data.ods_car_model_model  
WHERE ds = date_sub('2021-02-19',1)

[2021-02-19 15:02:25,146] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-19 15:02:25,398] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-02-19 15:02:25,648] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-02-19 15:02:25,893] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-02-19 15:02:26,127] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-02-19 15:02:26,373] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-02-19 15:02:26,623] {hive_server_hook:162} INFO - Written 70000 rows so far.
[2021-02-19 15:02:26,711] {hive_server_hook:162} INFO - Written 73572 rows so far.
[2021-02-19 15:02:26,715] {hiveserver2:26

In [6]:
# Load the labelled sample workbook and rename its Chinese headers to the
# column names shared by the other order sources.
cyp_column_names = {
    '订单号': 'car_id',
    '交易时间': 'publish_time',
    '车型code': 'model_code',
    '品牌': 'brand_name',
    '车型': 'model_name',
    '车系': 'series_name',
    '省份': 'a_province_name',
    '城市': 'a_city_name',
    '上牌时间': 'license_time',
    '公里数万': 'mileage',
    '颜色': 'color',
    '过户次数': 'transfer_times',
    '营运性质': 'use_property',
    '订单金额（万元）': 'real_pay_amount',
}
test_df = pd.read_excel("../标注数据818.xlsx").rename(columns=cyp_column_names)

# The year is the first four characters of each timestamp string.
for time_col in ('license_time', 'publish_time'):
    test_df[time_col + '_year'] = test_df[time_col].map(lambda x: x[:4])

# Remove series/brand fields so that the merge below supplies them from model_info.
stale_cols = [name for name in ['series_code', 'brand_code', 'series_name', 'brand_name']
              if name in test_df.columns]
test_df = test_df.drop(columns=stale_cols)
test_df = test_df.merge(model_info, on='model_code', how='left')

# Columns this source does not provide are created and filled with 0.
expected_cols = [
    'car_id', 'area', 'source_model_name', 'brand_name',
    'brand_code', 'series_name', 'series_code', 'model_name', 'model_code',
    'model_year', 'real_pay_amount', 'emission', 'color', 'mileage',
    'use_property', 'license_time', 'publish_time', 'a_province_name',
    'a_city_name', 'source_publish_time', 'sale_time', 'license_time_year',
    'publish_time_year', 'site', 'transfer_times', 'real_pay_amount_max',
    'real_pay_amount_min', 'count',
]
for col in expected_cols:
    if col not in test_df.columns:
        test_df[col] = 0

# The renamed source header is '公里数万'; scale the value by 10000.
test_df['mileage'] *= 10000

test_df.shape

(100, 44)

# 增加车牛数据

In [7]:

# Deal records joined with car records, both from the partition of the day
# before curr_date. Fields this source lacks are selected as empty strings.
sql_info = "\n".join([
    "",
    "select t1.car_id,",
    "       '' area,",
    "       '' source_model_name,",
    "       brand_name,",
    "       brand_code,",
    "       series_name,",
    "       series_code,  ",
    "       model_name,",
    "       model_code,",
    "       '' model_year,",
    "       amount/100 real_pay_amount,",
    "       '' emission,",
    "       car_body_color_name color,",
    "       display_mileage/10000 mileage,",
    "       '' use_property,",
    "       first_license_plate_date license_time,",
    "       deal_time publish_time,",
    "       license_plate_province_name a_province_name,",
    "       license_plate_city_name a_city_name,",
    "       '' source_publish_time,",
    "       '' sale_time,",
    "       substr(first_license_plate_date,0,4) license_time_year,",
    "       substr(deal_time,0,4) publish_time_year,",
    "       '' site,",
    "       0 transfer_times ",
    "from (select * from dl_cheniu.dl_cheniu_ipmd_car_deal_dd",
    "      where ds = date_sub('" + curr_date + "',1)) t1 ",
    "left join (select * from dl_cheniu.dl_cheniu_ipmd_car_dd ",
    "           where ds = date_sub('" + curr_date + "',1)) t2 ",
    "on t1.car_id = t2.car_id ",
    "where t1.order_status != 4 and t2.test_type = 0 ",
    "and substr(t2.vin,0,4) != 'TEST'",
    "",
])
dtype = {'city_code': str}

cheniu_ipmd_car_deal = read_from_hive2('dl_cheniu_ipmd_car_deal_dd', sql_info, dtype)

# Headers shaped like "table.column" are reduced to their second dot-separated part.
db_columns = [col.split('.')[1] if len(col.split('.')) > 1 else col
              for col in cheniu_ipmd_car_deal.columns]
cheniu_ipmd_car_deal.columns = db_columns
logger.log("读取数据完成.............")

# Remove administrative suffixes/ethnic qualifiers from the province name
# (the replacements run in this exact order).
cheniu_ipmd_car_deal['a_province_name'] = cheniu_ipmd_car_deal['a_province_name'].map(
    lambda x: (str(x)
               .replace('省', '')
               .replace('壮族', '')
               .replace('回族', '')
               .replace('自治区', '')
               .replace('维吾尔', '')
               .replace('市', '')))

# Year = first four characters of the stringified timestamp.
for time_col in ('license_time', 'publish_time'):
    cheniu_ipmd_car_deal[time_col + '_year'] = cheniu_ipmd_car_deal[time_col].map(lambda x: str(x)[:4])

# Remove series/brand fields so that the merge below supplies them from model_info.
stale_cols = [name for name in ['series_code', 'brand_code', 'series_name', 'brand_name']
              if name in cheniu_ipmd_car_deal.columns]
cheniu_ipmd_car_deal = cheniu_ipmd_car_deal.drop(columns=stale_cols)
cheniu_ipmd_car_deal = cheniu_ipmd_car_deal.merge(model_info, on='model_code', how='left')

# Columns this source does not provide are created and filled with 0.
expected_cols = [
    'car_id', 'area', 'source_model_name', 'brand_name',
    'brand_code', 'series_name', 'series_code', 'model_name', 'model_code',
    'model_year', 'real_pay_amount', 'emission', 'color', 'mileage',
    'use_property', 'license_time', 'publish_time', 'a_province_name',
    'a_city_name', 'source_publish_time', 'sale_time', 'license_time_year',
    'publish_time_year', 'site', 'transfer_times', 'real_pay_amount_max',
    'real_pay_amount_min', 'count',
]
for col in expected_cols:
    if col not in cheniu_ipmd_car_deal.columns:
        cheniu_ipmd_car_deal[col] = 0

# The query already divides display_mileage by 10000; this multiplies it back.
cheniu_ipmd_car_deal['mileage'] *= 10000
# The query yields amount/100; dividing by a further 10000 here.
cheniu_ipmd_car_deal['real_pay_amount'] /= 10000



[2021-02-19 15:02:27,013] {hiveserver2:138} INFO - Using database default as default
[2021-02-19 15:02:27,180] {hive_server_hook:112} INFO - Running query: 
select t1.car_id,
       '' area,
       '' source_model_name,
       brand_name,
       brand_code,
       series_name,
       series_code,  
       model_name,
       model_code,
       '' model_year,
       amount/100 real_pay_amount,
       '' emission,
       car_body_color_name color,
       display_mileage/10000 mileage,
       '' use_property,
       first_license_plate_date license_time,
       deal_time publish_time,
       license_plate_province_name a_province_name,
       license_plate_city_name a_city_name,
       '' source_publish_time,
       '' sale_time,
       substr(first_license_plate_date,0,4) license_time_year,
       substr(deal_time,0,4) publish_time_year,
       '' site,
       0 transfer_times 
from (select * from dl_cheniu.dl_cheniu_ipmd_car_deal_dd
      where ds = date_sub('2021-02-19',1)) t1 
left joi

# 合并各数据源的数据

In [8]:

# Tag every source with its origin so rows stay traceable after stacking.
dl_site_ts_order_clean_dfc_sales['data_type'] = 'dfc_sales'
dl_site_ts_order_clean_dfc_purchase['data_type'] = 'dfc_purchase'
dl_site_ts_order_clean_quan['data_type'] = 'quan'
test_df['data_type'] = 'cyp'
cheniu_ipmd_car_deal['data_type'] = 'cheniu'

# Columns every source must expose before the frames are stacked.
get_columns = ['car_id', 'area', 'source_model_name', 'brand_name',
       'brand_code', 'series_name', 'series_code', 'model_name', 'model_code',
       'model_year', 'real_pay_amount', 'emission', 'color', 'mileage',
       'use_property', 'license_time', 'publish_time', 'a_province_name',
       'a_city_name', 'source_publish_time', 'sale_time', 'license_time_year',
       'publish_time_year', 'site', 'transfer_times', 'data_type','delete_flag',
        'delete_flag_1', 'delete_flag_2']

source_frames = [dl_site_ts_order_clean_dfc_sales,
                 dl_site_ts_order_clean_dfc_purchase,
                 dl_site_ts_order_clean_quan,
                 test_df,
                 cheniu_ipmd_car_deal]

# Back-fill missing delete flags on EVERY source. Previously the "quan" frame
# was left out of this step although it is indexed with get_columns below,
# which raises KeyError as soon as that file lacks one of the flags.
for source_frame in source_frames:
    for col in ['delete_flag', 'delete_flag_1', 'delete_flag_2']:
        if col not in source_frame.columns:
            source_frame[col] = None

# Stack in the same order as before: sales, purchase, quan, cyp, cheniu.
dl_site_ts_order_clean = pd.concat(
    [source_frame[get_columns] for source_frame in source_frames], axis=0)


# 选择2019年至今的数据

In [9]:

# Truncate publish_time to its first 10 characters and keep rows from 2019-01-01 on.
# The comparison is a plain string comparison, so a missing value that became
# the text 'nan' also passes this filter.
dl_site_ts_order_clean['publish_time'] = dl_site_ts_order_clean['publish_time'].map(lambda x: str(x)[:10])
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['publish_time'] >= '2019-01-01'
]

# The per-source frames are no longer needed once stacked; release them.
del (dl_site_ts_order_clean_dfc_sales,
     dl_site_ts_order_clean_dfc_purchase,
     dl_site_ts_order_clean_quan,
     cheniu_ipmd_car_deal,
     test_df)


In [10]:
# Narrow the stacked table to the columns used from here on.
kept_columns = [
    'car_id', 'brand_name', 'brand_code', 'series_name', 'series_code',
    'model_name', 'model_code', 'real_pay_amount', 'color', 'mileage',
    'license_time', 'publish_time', 'a_province_name', 'a_city_name',
    'license_time_year', 'publish_time_year', 'site', 'transfer_times',
    'data_type', 'delete_flag', 'delete_flag_1', 'delete_flag_2',
]
dl_site_ts_order_clean = dl_site_ts_order_clean[kept_columns]


# 匹配指导价计算残值率

In [11]:
# Query the guide price per model for the partition of the day before curr_date.
sql_info = (
    "\n"
    "SELECT model_code,guide_price \n"
    "FROM db_data.ods_car_model_model_price \n"
    "WHERE ds = date_sub('" + curr_date + "',1)\n"
)
dtype = {'city_code': str}

# NOTE(review): the output file name ends with a space -- looks accidental; confirm before changing.
model_price = read_from_hive2('model_price ', sql_info, dtype)

# Headers shaped like "table.column" are reduced to their second dot-separated part.
db_columns = [col.split('.')[1] if len(col.split('.')) > 1 else col
              for col in model_price.columns]
model_price.columns = db_columns
logger.log("读取数据完成.............")


[2021-02-19 15:03:20,272] {hiveserver2:138} INFO - Using database default as default
[2021-02-19 15:03:20,411] {hive_server_hook:112} INFO - Running query: 
SELECT model_code,guide_price 
FROM db_data.ods_car_model_model_price 
WHERE ds = date_sub('2021-02-19',1)

[2021-02-19 15:03:20,722] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-19 15:03:20,848] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-02-19 15:03:20,970] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-02-19 15:03:21,090] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-02-19 15:03:21,209] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-02-19 15:03:21,331] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-02-19 15:03:21,438] {hive_server_hook:162} INFO - Written 68853 rows so far.
[2021-02-19 15:03:21,442] {hiveserver2:265} INFO - Closing active operation
[2021-02-19 15:03:21,459] {hive_server_hook:163} INFO - Done. Loaded a total o

In [12]:
# Attach the guide price, rescale it by 1/10000 and derive the residual ratio
# (transaction price divided by the rescaled guide price).
dl_site_ts_order_clean = dl_site_ts_order_clean.merge(model_price, on='model_code', how='left')
dl_site_ts_order_clean['guide_price'] /= 10000
dl_site_ts_order_clean['residual'] = dl_site_ts_order_clean['real_pay_amount'].div(
    dl_site_ts_order_clean['guide_price'])


# 处理残值率大于1 的数据

In [13]:
# Models that have at least one order priced at or above the guide price.
at_or_above_guide = dl_site_ts_order_clean['residual'] >= 1
residual_ex = list(dl_site_ts_order_clean.loc[at_or_above_guide]['model_code'].unique())
in_flagged_models = dl_site_ts_order_clean['model_code'].isin(residual_ex)

# Per flagged model: number of such orders (count_ex) and number of all orders (count).
residual_ex_count = (
    dl_site_ts_order_clean.loc[in_flagged_models & at_or_above_guide][['model_code', 'car_id']]
    .groupby(['model_code']).count().reset_index()
    .rename(columns={'car_id': 'count_ex'})
)
residual_count = (
    dl_site_ts_order_clean.loc[in_flagged_models][['model_code', 'car_id']]
    .groupby(['model_code']).count().reset_index()
    .rename(columns={'car_id': 'count'})
)

residual_count = residual_count.merge(residual_ex_count, on='model_code', how='left')
residual_count['rate'] = residual_count['count_ex'] / residual_count['count']

## Handle rows whose residual ratio is >= 1
# A model keeps its high-priced orders only when more than 40% of at least
# three orders are at or above the guide price.
frequent_enough = (residual_count['rate'] > 0.4) & (residual_count['count'] >= 3)
model_ex_list_gt_4 = list(residual_count.loc[frequent_enough]['model_code'].unique())

# Of those models, retain orders with 1 <= residual < 1.5 ...
dl_site_ts_order_clean_gt1 = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['model_code'].isin(model_ex_list_gt_4)
    & at_or_above_guide
    & (dl_site_ts_order_clean['residual'] < 1.5)
]

# ... and append them to every order with residual < 1.
dl_site_ts_order_clean = pd.concat(
    [dl_site_ts_order_clean.loc[dl_site_ts_order_clean['residual'] < 1],
     dl_site_ts_order_clean_gt1],
    axis=0)


In [50]:
## 计算平均值、中位数

In [62]:
# Price statistics per (model, license year, publish year).
stat_keys = ['model_code', 'license_time_year', 'publish_time_year']
price_groups = dl_site_ts_order_clean[stat_keys + ['real_pay_amount']].groupby(stat_keys)

dl_site_ts_order_clean_mean = price_groups.mean().reset_index().rename(columns={"real_pay_amount": "price_mean"})
dl_site_ts_order_clean_std = price_groups.std().reset_index().rename(columns={"real_pay_amount": "price_std"})
dl_site_ts_order_clean_median = price_groups.median().reset_index().rename(columns={"real_pay_amount": "price_median"})
dl_site_ts_order_clean_count = price_groups.count().reset_index().rename(columns={"real_pay_amount": "price_count"})

# Join std, median and count onto the mean table, in that order.
for stat_frame in (dl_site_ts_order_clean_std,
                   dl_site_ts_order_clean_median,
                   dl_site_ts_order_clean_count):
    dl_site_ts_order_clean_mean = pd.merge(dl_site_ts_order_clean_mean,
                                           stat_frame,
                                           on=stat_keys,
                                           how='left')

# Groups without a defined standard deviation are dropped.
dl_site_ts_order_clean_mean = dl_site_ts_order_clean_mean.loc[
    dl_site_ts_order_clean_mean['price_std'].notnull()]

# NOTE(review): residual is computed elsewhere as real_pay_amount / (guide_price/10000),
# which suggests real_pay_amount is already in units of 10k -- confirm this extra /10000 is intended.
for price_col in ('price_mean', 'price_median'):
    dl_site_ts_order_clean_mean[price_col] = dl_site_ts_order_clean_mean[price_col].map(
        lambda x: round(x / 10000, 2))


In [14]:
#### ===========训练模型构建特征===============#####

In [15]:
## Exclude rows that come from the taoche, yxpai and baixing sites
excluded_sites = ['taoche', 'yxpai', 'baixing']
from_excluded_site = dl_site_ts_order_clean['site'].isin(excluded_sites)
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[~from_excluded_site]


In [16]:
# Drop any existing model_year so the one in the parameter file is used, then
# attach the model parameter / one-hot columns by model_code.
dl_site_ts_order_clean = dl_site_ts_order_clean.drop(columns=['model_year'], errors='ignore')
car = pd.read_csv(curr_dir + "2021-02-02版车型参数及独热编码.csv", header=0, low_memory=False)

dl_site_ts_order_clean = dl_site_ts_order_clean.merge(car, on='model_code', how='left')


In [17]:
## Train per series level: keep only the levels handled by this run
kept_levels = ["缺失", "A00", "A0", "A"]
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['level'].isin(kept_levels)]


In [18]:
# Rescale mileage by 1/10000, rounded to two decimals.
dl_site_ts_order_clean['mileage'] = dl_site_ts_order_clean['mileage'].map(lambda x: round(x / 10000, 2))

# Keep only rows where both dates are present.
has_both_dates = (dl_site_ts_order_clean['license_time'].notnull()
                  & dl_site_ts_order_clean['publish_time'].notnull())
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[has_both_dates]
dl_site_ts_order_clean.shape

(4492388, 203)

In [19]:
## Derived feature: car age at publish time
# Day difference between license and publish date; rows where either date
# fails to parse get None and are removed before converting days to months.
license_dates = dl_site_ts_order_clean['license_time'].map(str)
publish_dates = dl_site_ts_order_clean['publish_time'].map(str)
dl_site_ts_order_clean['license_month'] = [
    date_time_sub(licensed, published, "%Y-%m-%d")
    for licensed, published in zip(license_dates, publish_dates)
]
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['license_month'].notnull()]
dl_site_ts_order_clean['license_month'] = dl_site_ts_order_clean['license_month'].map(lambda x: round(x / 30))


In [20]:
## Car age in months, from calendar year/month (replaces the day-based value)
dl_site_ts_order_clean['license_time_month'] = dl_site_ts_order_clean['license_time'].map(lambda x: int(x.split('-')[1]))
dl_site_ts_order_clean['publish_time_month'] = dl_site_ts_order_clean['publish_time'].map(lambda x: int(x.split('-')[1]))

# Absolute month index (year * 12 + month) of each date; the age counts both end months.
publish_month_index = (dl_site_ts_order_clean['publish_time_year'].map(lambda x: int(x)) * 12
                       + dl_site_ts_order_clean['publish_time_month'])
license_month_index = (dl_site_ts_order_clean['license_time_year'].map(lambda x: int(x)) * 12
                       + dl_site_ts_order_clean['license_time_month'])
dl_site_ts_order_clean['license_month'] = publish_month_index - license_month_index + 1


In [21]:
# Mileage per year of age, rounded to two decimals.
car_age_years = dl_site_ts_order_clean['license_month'] / 12
dl_site_ts_order_clean['per_mile'] = (dl_site_ts_order_clean['mileage'] / car_age_years).round(2)

# Integer years, and the gap between license year and model year.
for year_col in ('license_time_year', 'model_year'):
    dl_site_ts_order_clean[year_col] = dl_site_ts_order_clean[year_col].map(lambda x: int(x))
dl_site_ts_order_clean['year_err'] = (dl_site_ts_order_clean['license_time_year']
                                      - dl_site_ts_order_clean['model_year'])


In [22]:
# Display (rows, columns) of the table at this point of the pipeline.
dl_site_ts_order_clean.shape

(4492381, 208)

In [23]:
# Compute the car's current value-retention rate
def computer_with_license_month(tar):
    """Interpolate the value-retention rate for a car of a given age.

    Args:
        tar: mapping with ``license_month`` (car age in months) and the
            yearly rates ``year_1`` ... ``year_16``.

    Returns:
        ``tar['year_1']`` for ages up to 12 months; otherwise the rate of the
        completed year reduced linearly, month by month, towards the next
        year's rate and rounded to 4 decimals. When the age is outside the
        table or unusable (missing key, NaN, non-numeric), ``tar['year_16']``
        is returned.
    """
    try:
        license_month = tar['license_month']
        if license_month <= 12:
            return tar['year_1']

        year = license_month // 12
        # Rate at the start of the current year of age ...
        keep_max = tar["year_" + str(int(year))]
        # ... and at the start of the following one.
        keep_min = tar["year_" + str(int(year + 1))]

        # Months elapsed since the last full year, and the per-month drop.
        mon = license_month - 12 * year
        monthly_drop = (keep_max - keep_min) / 12
        return round(keep_max - monthly_drop * mon, 4)
    except (KeyError, ValueError, TypeError, OverflowError):
        # KeyError: year beyond the table; ValueError/OverflowError: NaN or
        # infinite age; TypeError: non-numeric values. Fall back to the last rate.
        return tar["year_16"]

# Pack license_month and the sixteen yearly rates into one dict per row, then
# reduce each dict to a single keep_value.
keep_value_inputs = ['license_month'] + ['year_%d' % n for n in range(1, 17)]
dl_site_ts_order_clean['keep_value'] = dl_site_ts_order_clean[keep_value_inputs].to_dict(orient='records')
print('keep_value 计算开始。。')
dl_site_ts_order_clean['keep_value'] = dl_site_ts_order_clean['keep_value'].map(computer_with_license_month)


keep_value 计算开始。。


In [24]:
## Remove the yearly rate columns now that keep_value has been derived from them
drop_columns = ['year_%d' % n for n in range(1, 17)]
dl_site_ts_order_clean = dl_site_ts_order_clean.drop(drop_columns, axis=1)


In [25]:
## Flag whether the car is still inside its warranty limits
def get_quality(license_month,mileage_std,quality_mile,quality_year):
    """Return 1 when both mileage and age are below the warranty limits, else 0.

    Args:
        license_month: car age in months.
        mileage_std: mileage, compared directly against ``quality_mile``.
        quality_mile: warranty mileage limit.
        quality_year: warranty duration in years.
    """
    age_years = round(license_month/12, 2)
    within_limits = mileage_std < quality_mile and age_years < quality_year
    return 1 if within_limits else 0

# Evaluate the warranty flag row by row, then drop the two limit columns.
dl_site_ts_order_clean['quality'] = [
    get_quality(age_months, mileage, mile_limit, year_limit)
    for age_months, mileage, mile_limit, year_limit in zip(
        dl_site_ts_order_clean['license_month'],
        dl_site_ts_order_clean['mileage'],
        dl_site_ts_order_clean['quality_mile'],
        dl_site_ts_order_clean['quality_year'])
]
dl_site_ts_order_clean = dl_site_ts_order_clean.drop(columns=['quality_mile', 'quality_year'])


In [26]:
## Residual ratio: transaction price divided by new_car_price
# real_pay_amount is used as stored; no rescaling is applied in this cell.
dl_site_ts_order_clean['residual'] = dl_site_ts_order_clean['real_pay_amount'].div(
    dl_site_ts_order_clean['new_car_price'])


# 把类别特征转化为编码

In [27]:
# Rows must carry both a brand code and a series code.
has_brand_and_series = (dl_site_ts_order_clean['brand_code'].notnull()
                        & dl_site_ts_order_clean['series_code'].notnull())
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[has_brand_and_series]


In [28]:
# Discard rows whose province is one of these fragment values or the text 'nan'.
unusable_provinces = ['州', '江', '庆', '苏', 'nan']
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    ~dl_site_ts_order_clean['a_province_name'].isin(unusable_provinces)]

# Every row of the 'cheniu' source gets the colour '白色'.
is_cheniu = dl_site_ts_order_clean['data_type'] == 'cheniu'
dl_site_ts_order_clean.loc[is_cheniu, 'color'] = '白色'

In [29]:
# Re-map two brand codes onto other codes (applied in this order).
for old_brand, new_brand in (('brand-889', 'brand-54'), ('brand-895', 'brand-526')):
    dl_site_ts_order_clean.loc[dl_site_ts_order_clean['brand_code'] == old_brand, 'brand_code'] = new_brand

# The workbook stores several key -> value tables, distinguished by its 'name' column.
transfer_times_df = pd.read_excel('../transfer_times_df_yh01.xlsx')
encoding_tables = {
    table_name: {
        key: value
        for key, value in np.array(
            transfer_times_df.loc[transfer_times_df['name'] == table_name][['key', 'value']])
    }
    for table_name in ('transfer_times', 'province_name', 'color', 'brand_code', 'series_code')
}
transfer_times_dict = encoding_tables['transfer_times']
a_province_name_dict = encoding_tables['province_name']
color_dict = encoding_tables['color']
brand_code_dict = encoding_tables['brand_code']
series_code_dict = encoding_tables['series_code']

## Drop rows whose series or brand code is absent from the encoding tables
known_series = dl_site_ts_order_clean['series_code'].isin(list(series_code_dict))
known_brand = dl_site_ts_order_clean['brand_code'].isin(list(brand_code_dict))
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[known_series & known_brand]

## Province: require a value, then strip the '省' / '市' suffix characters
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['a_province_name'].notnull()]
dl_site_ts_order_clean['a_province_name'] = dl_site_ts_order_clean['a_province_name'].map(
    lambda x: str(x).replace('省', '').replace('市', ''))

# Look up the encoded value for each category; a value missing from its table
# raises KeyError because the dictionaries are indexed directly.
## Province
dl_site_ts_order_clean['province_name'] = dl_site_ts_order_clean['a_province_name'].map(
    lambda x: a_province_name_dict[x])
## Brand
dl_site_ts_order_clean['brand_code_index'] = dl_site_ts_order_clean['brand_code'].map(
    lambda x: brand_code_dict[x])
## Series
dl_site_ts_order_clean['series_code_index'] = dl_site_ts_order_clean['series_code'].map(
    lambda x: series_code_dict[x])

## Colour and transfer-count handling
# Colours accepted downstream.
color_list = ['多彩色','粉红色','冰川白','金色','香槟金','银色','橙色','绿色','咖啡色','黄色','紫色','灰色','棕色','深灰色','香槟色','蓝色',
'银灰色','红色','其他','黑色','白色']

# Several spellings of "unknown/other" collapse into '其他'.
other_aliases = ['不详', '其他色', '其它色', '其它', '其他', '—']
is_other_alias = dl_site_ts_order_clean['color'].isin(other_aliases)
dl_site_ts_order_clean.loc[is_other_alias, 'color'] = '其他'

# Rows with any colour outside color_list are dropped.
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['color'].isin(color_list)]

# Missing transfer counts are marked with -1.
dl_site_ts_order_clean['transfer_times'] = dl_site_ts_order_clean['transfer_times'].fillna(-1)
def transfer_times_fun(x):
    """Convert ``x`` to ``int``; return ``None`` when it cannot be converted.

    Only conversion failures are absorbed (unparseable text, ``None``, NaN,
    infinity); any other exception propagates instead of being silently
    swallowed.
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        return None

# Integer transfer counts; values of 5 or more are capped at 5 and anything
# outside -1..5 is dropped.
dl_site_ts_order_clean['transfer_times'] = dl_site_ts_order_clean['transfer_times'].map(transfer_times_fun)
five_or_more = dl_site_ts_order_clean['transfer_times'] >= 5
dl_site_ts_order_clean.loc[five_or_more, 'transfer_times'] = 5
allowed_transfer_times = [-1, 0, 1, 2, 3, 4, 5]
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['transfer_times'].isin(allowed_transfer_times)]

## Transfer count -> encoded value
dl_site_ts_order_clean['transfer_times_index'] = dl_site_ts_order_clean['transfer_times'].map(
    lambda x: transfer_times_dict[x])
## Colour -> encoded value
dl_site_ts_order_clean['color_index'] = dl_site_ts_order_clean['color'].map(
    lambda x: color_dict[x])


In [30]:
## Row filters: brand present, age of at least one month, 0.01 <= per_mile <= 10
passes_filters = (
    dl_site_ts_order_clean['brand_code'].notnull()
    & (dl_site_ts_order_clean['license_month'] >= 1)
    & (dl_site_ts_order_clean['per_mile'] >= 0.01)
    & (dl_site_ts_order_clean['per_mile'] <= 10)
)
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[passes_filters]


In [31]:
## Smallest possible age for a given license year and publish year:
## December of the license year to January of the publish year.
latest_license = dl_site_ts_order_clean['license_time_year'].map(lambda x: str(int(x)) + "-12")
earliest_publish = dl_site_ts_order_clean['publish_time_year'].map(lambda x: str(x) + '-01')
dl_site_ts_order_clean['license_month_min'] = [
    date_time_sub(licensed, published, "%Y-%m")
    for licensed, published in zip(latest_license, earliest_publish)
]
dl_site_ts_order_clean['license_month_min'] = dl_site_ts_order_clean['license_month_min'].map(lambda x: round(x / 30))

# How far the actual age exceeds that minimum.
dl_site_ts_order_clean['license_month_cha'] = (dl_site_ts_order_clean['license_month']
                                               - dl_site_ts_order_clean['license_month_min'])

# Difference in calendar years between publish and license.
dl_site_ts_order_clean['publish_time_year'] = dl_site_ts_order_clean['publish_time_year'].map(lambda x: int(x))
dl_site_ts_order_clean['car_years'] = (dl_site_ts_order_clean['publish_time_year']
                                       - dl_site_ts_order_clean['license_time_year'])

# Difference in calendar months (publish month minus license month).
publish_month_no = dl_site_ts_order_clean['publish_time'].map(lambda x: int(x[5:7]))
license_month_no = dl_site_ts_order_clean['license_time'].map(lambda x: int(x.split('-')[1]))
dl_site_ts_order_clean['months_cha'] = publish_month_no - license_month_no


In [33]:
## Natural-log transforms of age (months) and mileage
for raw_col in ('license_month', 'mileage'):
    dl_site_ts_order_clean[raw_col + '_log'] = dl_site_ts_order_clean[raw_col].map(lambda x: math.log(x))


In [34]:
## Distance from publish date to a fixed reference date, in months (days / 30, rounded)
months_reference_date = '2021-12-31'
dl_site_ts_order_clean['months'] = dl_site_ts_order_clean['publish_time'].map(
    lambda x: round(date_time_sub(str(x)[:10], months_reference_date, "%Y-%m-%d") / 30))


In [35]:
## 定义特征列 
fearture_columns = ['months','mileage_log', 'car_years','months_cha',#'license_month', 
                    'new_car_price','model_year', 'rate', 
                    'rate_count', 'per_mile', 'year_err','province_name','license_time_year', 'publish_time_year',
                    'brand_code_index','series_code_index',
                    'transfer_times_index','color_index',
#                     'province_name0', 'province_name1', 'province_name2', 'province_name3', 'province_name4',
#                     'province_name5', 'province_name6', 'province_name7', 'province_name8', 'province_name9', 
#                     'province_name10', 'province_name11', 'province_name12', 'province_name13', 'province_name14',
#                     'province_name15', 'province_name16', 'province_name17', 'province_name18', 'province_name19', 
#                     'province_name20', 'province_name21', 'province_name22', 'province_name23', 'province_name24', 
#                     'province_name25', 'province_name26', 'province_name27', 'province_name28', 'province_name29', 
#                     'province_name30',
                    'keep_value', 'quality',
                    'wheel_base', 'length', 'height', 'width', 'max_torque', 'max_power', 'engine_volume_l', 'cylinder_number', 
                    'seat_number_top', 'driving_mode0', 'driving_mode1', 'driving_mode2', 'driving_mode3', 'driving_mode4', 
                    'driving_mode5', 'driving_mode6', 'driving_mode7', 'driving_mode8', 'driving_mode9', 'gear_box_type0', 
                    'gear_box_type1', 'gear_box_type2', 'gear_box_type3', 'gear_box_type4', 'gear_box_type5', 'gear_box_type6', 
                    'gear_box_type7', 'gear_box_type8', 'gear_box_type9', 'gear_box_type10', 'country_id0', 'country_id1', 
                    'country_id2', 'country_id3', 'country_id4', 'country_id5', 'country_id6', 'country_id7', 'country_id8', 
                    'country_id9', 'country_id10', 'country_id11', 'country_id12', 'country_id13', 'country_id14', 'import_type0', 
                    'import_type1', 'intake_type0', 'intake_type1', 'intake_type2', 'intake_type3', 'intake_type4', 'intake_type5', 
                    'intake_type6', 'intake_type7', 'fuel_form0', 'fuel_form1', 'fuel_form2', 'fuel_form3', 'fuel_form4', 
                    'fuel_form5', 'fuel_form6', 'fuel_form7', 'fuel_form8', 'car_body0', 'car_body1', 'car_body2', 'car_body3',
                    'car_body4', 'car_body5', 'car_body6', 'car_body7', 'car_body8', 'car_body9', 'car_body10', 'car_body11', 
                    'series_level0', 'series_level1', 'series_level2', 'series_level3', 'series_level4', 'series_level5', 
                    'series_level6', 'series_level7', 'series_level8', 'series_level9', 'series_level10', 'series_level11', 
                    'series_level12', 'series_level13', 'series_level14', 'series_level15', 'series_level16', 'series_level17', 
                    'series_level18', 'series_level19', 'series_level20', 'series_level21', 'series_level22', 'series_level23', 
                    'series_level24', 'series_level25', 'series_level26', 'series_level27', 'series_level28', 'series_level29',
                    'series_level30', 'series_level31', 'series_level32', 'series_level33', 'series_level34', 'series_level35',
                    'series_level36', 'series_level37', 'series_level38', 'series_level39', 'series_level40', 'series_level41', 
                    'series_level42', 'series_level43', 'series_level44', 'series_level45', 'series_level46', 'series_level47', 
                    'series_level48', 'series_level49', 'series_level50', 'series_level51', 'series_level52', 'series_level53', 
                    'series_level54', 'series_level55', 'series_level56', 'series_level57', 'series_level58', 'series_level59', 
                    'series_level60', 'series_level61', 'wordvec0', 'wordvec1', 'wordvec2', 'wordvec3', 'wordvec4', 'wordvec5', 
                    'wordvec6', 'wordvec7', 'wordvec8', 'wordvec9', 'wordvec10', 'wordvec11', 'wordvec12', 'wordvec13', 'wordvec14',
                    'wordvec15', 'wordvec16', 'wordvec17', 'wordvec18', 'wordvec19', 'wordvec20', 'wordvec21', 'wordvec22', 'wordvec23',
                    'wordvec24', 'wordvec25', 'wordvec26', 'wordvec27', 'wordvec28', 'wordvec29', 'wordvec30', 'wordvec31', 'wordvec32', 
                    'wordvec33', 'wordvec34', 'wordvec35', 'wordvec36', 'wordvec37', 'wordvec38', 'wordvec39', 'wordvec40', 'wordvec41',
                    'wordvec42', 'wordvec43', 'wordvec44', 'wordvec45', 'wordvec46', 'wordvec47', 'wordvec48', 'wordvec49', 'wordvec50',
                    'wordvec51', 'wordvec52', 'wordvec53', 'wordvec54', 'wordvec55', 'wordvec56', 'wordvec57', 'wordvec58', 'wordvec59', 
                    'wordvec60', 'wordvec61', 'wordvec62', 'wordvec63', 'wordvec64', 'wordvec65', 'wordvec66', 'wordvec67', 'wordvec68', 
                    'wordvec69', 'wordvec70', 'wordvec71', 'wordvec72', 'wordvec73', 'wordvec74', 'wordvec75', 'wordvec76', 'wordvec77',
                    'wordvec78', 'wordvec79', 'wordvec80', 'wordvec81', 'wordvec82', 'wordvec83', 'wordvec84', 'wordvec85', 'wordvec86',
                    'wordvec87', 'wordvec88', 'wordvec89', 'wordvec90', 'wordvec91', 'wordvec92', 'wordvec93', 'wordvec94', 'wordvec95', 
                    'wordvec96', 'wordvec97', 'wordvec98', 'wordvec99']

# Name of the target column; the regressors below are fitted on train_data[label].
label = 'residual'


In [36]:
## Add the model-name embedding features (TF-IDF weighted word vectors)
vec = pd.read_csv(curr_dir+"2021-02-02版-TF-IDF权重的词向量.csv", dtype={'model_code': str}, header = 0)
vec = vec.drop(['model_name'], axis=1)

# A left join emits one output row per matching right-hand row, so a repeated
# model_code in the vector file would silently duplicate order rows.
# Check before the merge so a bad file cannot overwrite the working frame.
duplicated_codes = vec.loc[vec['model_code'].duplicated(), 'model_code'].unique()
if len(duplicated_codes) > 0:
    raise ValueError("duplicate model_code in word-vector file: %s"
                     % list(duplicated_codes[:10]))

dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,vec,on='model_code',how='left')


In [37]:
## Select the rows used to train and to evaluate the model
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['per_mile'] <= 5]

# Build one boolean mask per split so the wide frame is filtered once,
# instead of filtering by date and then filtering the result by data_type.
order_publish_time = dl_site_ts_order_clean['publish_time']
is_quan_row = dl_site_ts_order_clean['data_type'].isin(['quan'])

# NOTE(review): both bounds of the training window are strict, so a
# publish_time equal to '2019-01-01' is left out of training - confirm intended.
train_mask = (order_publish_time > '2019-01-01') & (order_publish_time < '2020-12-01') & is_quan_row
test_mask = (order_publish_time >= '2020-12-01') & (order_publish_time < '2021-01-01') & is_quan_row

# Later cells add columns (ypred, err, ...) to both frames; .copy() makes them
# independent objects so those assignments are not writes to a slice of the
# source frame.
train_data = dl_site_ts_order_clean.loc[train_mask].copy()
test_data = dl_site_ts_order_clean.loc[test_mask].copy()


In [38]:
# Sanity check: sizes of the two splits, then of the frame they were cut from.
train_shape, test_shape = train_data.shape, test_data.shape
print(train_shape, test_shape)
print(dl_site_ts_order_clean.shape)


(4026802, 304) (64689, 304)
(4392825, 304)


In [39]:
# del dl_site_ts_order_clean

In [40]:
import lightgbm as lgb

# Gradient-boosted regressor fitted on train_data[label].
gbm = lgb.LGBMRegressor(boosting_type = 'gbdt',
                        objective = 'regression',
                        learning_rate = 0.03,
                        n_estimators = 3000,
                        max_depth = 12,
                        num_leaves = 100,
                        subsample = 1,
                        colsample_bytree = 0.34,
                        min_child_samples = 110,
                        n_jobs = 8,
                        verbose = 1)

# Columns handed to LightGBM as categorical features.
categorical_columns = ['province_name','model_year', 'publish_time_year','quality',
                       'brand_code_index','series_code_index','transfer_times_index','color_index']

# NOTE(review): early stopping is validated on test_data, the same split the
# later cells report accuracy on, so those figures are not from data unseen
# by the stopping rule - consider a separate validation window.
#
# eval_set / eval_metric are passed as lists (the documented forms); a set
# has no defined iteration order and is not a documented eval_metric type.
gbm.fit(train_data[fearture_columns],
        train_data[label],
        eval_set = [(test_data[fearture_columns],
                     test_data[label])],
        eval_metric = ['l1', 'l2'],
        early_stopping_rounds = 50,
        verbose=True,
        categorical_feature=categorical_columns
       )


[1]	valid_0's l1: 0.186623	valid_0's l2: 0.0489713
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l1: 0.181488	valid_0's l2: 0.046428
[3]	valid_0's l1: 0.176531	valid_0's l2: 0.0440643
[4]	valid_0's l1: 0.171621	valid_0's l2: 0.0417666
[5]	valid_0's l1: 0.168526	valid_0's l2: 0.0404044
[6]	valid_0's l1: 0.163998	valid_0's l2: 0.0384006
[7]	valid_0's l1: 0.160882	valid_0's l2: 0.0370762
[8]	valid_0's l1: 0.156741	valid_0's l2: 0.035254
[9]	valid_0's l1: 0.152731	valid_0's l2: 0.0335333
[10]	valid_0's l1: 0.149828	valid_0's l2: 0.0324007
[11]	valid_0's l1: 0.14593	valid_0's l2: 0.0308061
[12]	valid_0's l1: 0.142185	valid_0's l2: 0.0293265
[13]	valid_0's l1: 0.138703	valid_0's l2: 0.0279888
[14]	valid_0's l1: 0.135131	valid_0's l2: 0.0266293
[15]	valid_0's l1: 0.131743	valid_0's l2: 0.0253695
[16]	valid_0's l1: 0.128242	valid_0's l2: 0.0241102
[17]	valid_0's l1: 0.125052	valid_0's l2: 0.0229933
[18]	valid_0's l1: 0.121998	valid_0's l2: 0.021953
[19]	valid_0's 

[153]	valid_0's l1: 0.0382401	valid_0's l2: 0.00282342
[154]	valid_0's l1: 0.0381972	valid_0's l2: 0.00281798
[155]	valid_0's l1: 0.0381613	valid_0's l2: 0.00281346
[156]	valid_0's l1: 0.0381183	valid_0's l2: 0.00280847
[157]	valid_0's l1: 0.0380497	valid_0's l2: 0.00279953
[158]	valid_0's l1: 0.038012	valid_0's l2: 0.0027953
[159]	valid_0's l1: 0.0379482	valid_0's l2: 0.00278667
[160]	valid_0's l1: 0.0379189	valid_0's l2: 0.00278331
[161]	valid_0's l1: 0.0378676	valid_0's l2: 0.00277646
[162]	valid_0's l1: 0.0378286	valid_0's l2: 0.00277161
[163]	valid_0's l1: 0.0377886	valid_0's l2: 0.0027656
[164]	valid_0's l1: 0.0376821	valid_0's l2: 0.00275244
[165]	valid_0's l1: 0.037623	valid_0's l2: 0.00274487
[166]	valid_0's l1: 0.0375593	valid_0's l2: 0.00273657
[167]	valid_0's l1: 0.037493	valid_0's l2: 0.00272876
[168]	valid_0's l1: 0.0374653	valid_0's l2: 0.00272534
[169]	valid_0's l1: 0.0374218	valid_0's l2: 0.00272007
[170]	valid_0's l1: 0.0373895	valid_0's l2: 0.00271665
[171]	valid_0's

[303]	valid_0's l1: 0.033967	valid_0's l2: 0.00228752
[304]	valid_0's l1: 0.0339585	valid_0's l2: 0.00228636
[305]	valid_0's l1: 0.033942	valid_0's l2: 0.00228462
[306]	valid_0's l1: 0.0339055	valid_0's l2: 0.00228024
[307]	valid_0's l1: 0.0338685	valid_0's l2: 0.0022759
[308]	valid_0's l1: 0.0338539	valid_0's l2: 0.0022743
[309]	valid_0's l1: 0.0338338	valid_0's l2: 0.0022721
[310]	valid_0's l1: 0.0338161	valid_0's l2: 0.00226992
[311]	valid_0's l1: 0.0337903	valid_0's l2: 0.00226691
[312]	valid_0's l1: 0.0337718	valid_0's l2: 0.0022644
[313]	valid_0's l1: 0.0337494	valid_0's l2: 0.00226168
[314]	valid_0's l1: 0.0337222	valid_0's l2: 0.00225844
[315]	valid_0's l1: 0.0337162	valid_0's l2: 0.00225773
[316]	valid_0's l1: 0.0337073	valid_0's l2: 0.00225667
[317]	valid_0's l1: 0.0336941	valid_0's l2: 0.00225478
[318]	valid_0's l1: 0.0336874	valid_0's l2: 0.00225366
[319]	valid_0's l1: 0.0336756	valid_0's l2: 0.00225226
[320]	valid_0's l1: 0.0336549	valid_0's l2: 0.00224915
[321]	valid_0's 

[453]	valid_0's l1: 0.0322113	valid_0's l2: 0.00206811
[454]	valid_0's l1: 0.0322018	valid_0's l2: 0.00206687
[455]	valid_0's l1: 0.0321945	valid_0's l2: 0.00206593
[456]	valid_0's l1: 0.0321851	valid_0's l2: 0.00206487
[457]	valid_0's l1: 0.0321774	valid_0's l2: 0.00206403
[458]	valid_0's l1: 0.0321708	valid_0's l2: 0.00206337
[459]	valid_0's l1: 0.0321649	valid_0's l2: 0.0020626
[460]	valid_0's l1: 0.0321532	valid_0's l2: 0.00206101
[461]	valid_0's l1: 0.0321451	valid_0's l2: 0.00205998
[462]	valid_0's l1: 0.0321334	valid_0's l2: 0.00205858
[463]	valid_0's l1: 0.0321317	valid_0's l2: 0.00205852
[464]	valid_0's l1: 0.03212	valid_0's l2: 0.00205697
[465]	valid_0's l1: 0.0321157	valid_0's l2: 0.00205653
[466]	valid_0's l1: 0.0321122	valid_0's l2: 0.00205604
[467]	valid_0's l1: 0.0321092	valid_0's l2: 0.00205547
[468]	valid_0's l1: 0.032102	valid_0's l2: 0.00205468
[469]	valid_0's l1: 0.0320939	valid_0's l2: 0.00205387
[470]	valid_0's l1: 0.0320821	valid_0's l2: 0.00205242
[471]	valid_0'

[603]	valid_0's l1: 0.031183	valid_0's l2: 0.00193818
[604]	valid_0's l1: 0.031177	valid_0's l2: 0.00193753
[605]	valid_0's l1: 0.0311701	valid_0's l2: 0.00193683
[606]	valid_0's l1: 0.0311649	valid_0's l2: 0.00193616
[607]	valid_0's l1: 0.0311636	valid_0's l2: 0.001936
[608]	valid_0's l1: 0.0311545	valid_0's l2: 0.00193502
[609]	valid_0's l1: 0.031152	valid_0's l2: 0.00193467
[610]	valid_0's l1: 0.0311447	valid_0's l2: 0.00193382
[611]	valid_0's l1: 0.0311427	valid_0's l2: 0.00193355
[612]	valid_0's l1: 0.0311403	valid_0's l2: 0.00193321
[613]	valid_0's l1: 0.031134	valid_0's l2: 0.0019324
[614]	valid_0's l1: 0.0311271	valid_0's l2: 0.00193137
[615]	valid_0's l1: 0.0311159	valid_0's l2: 0.00192998
[616]	valid_0's l1: 0.0311154	valid_0's l2: 0.00192985
[617]	valid_0's l1: 0.0311088	valid_0's l2: 0.00192896
[618]	valid_0's l1: 0.0311008	valid_0's l2: 0.00192791
[619]	valid_0's l1: 0.0310936	valid_0's l2: 0.00192691
[620]	valid_0's l1: 0.0310893	valid_0's l2: 0.00192647
[621]	valid_0's l

[753]	valid_0's l1: 0.0304863	valid_0's l2: 0.00185532
[754]	valid_0's l1: 0.0304856	valid_0's l2: 0.0018552
[755]	valid_0's l1: 0.0304818	valid_0's l2: 0.00185482
[756]	valid_0's l1: 0.0304814	valid_0's l2: 0.00185475
[757]	valid_0's l1: 0.0304791	valid_0's l2: 0.00185453
[758]	valid_0's l1: 0.0304779	valid_0's l2: 0.00185432
[759]	valid_0's l1: 0.0304766	valid_0's l2: 0.00185417
[760]	valid_0's l1: 0.0304718	valid_0's l2: 0.00185352
[761]	valid_0's l1: 0.0304664	valid_0's l2: 0.00185301
[762]	valid_0's l1: 0.0304595	valid_0's l2: 0.00185216
[763]	valid_0's l1: 0.0304557	valid_0's l2: 0.0018517
[764]	valid_0's l1: 0.0304509	valid_0's l2: 0.00185094
[765]	valid_0's l1: 0.0304492	valid_0's l2: 0.00185071
[766]	valid_0's l1: 0.0304447	valid_0's l2: 0.00185009
[767]	valid_0's l1: 0.0304422	valid_0's l2: 0.00184972
[768]	valid_0's l1: 0.03044	valid_0's l2: 0.00184927
[769]	valid_0's l1: 0.0304359	valid_0's l2: 0.00184873
[770]	valid_0's l1: 0.0304289	valid_0's l2: 0.00184786
[771]	valid_0'

[903]	valid_0's l1: 0.030048	valid_0's l2: 0.00180222
[904]	valid_0's l1: 0.0300454	valid_0's l2: 0.00180191
[905]	valid_0's l1: 0.030042	valid_0's l2: 0.00180135
[906]	valid_0's l1: 0.0300401	valid_0's l2: 0.00180116
[907]	valid_0's l1: 0.030031	valid_0's l2: 0.00180038
[908]	valid_0's l1: 0.0300284	valid_0's l2: 0.00180015
[909]	valid_0's l1: 0.0300251	valid_0's l2: 0.0017999
[910]	valid_0's l1: 0.0300243	valid_0's l2: 0.00179977
[911]	valid_0's l1: 0.0300242	valid_0's l2: 0.00179973
[912]	valid_0's l1: 0.0300221	valid_0's l2: 0.00179945
[913]	valid_0's l1: 0.030022	valid_0's l2: 0.00179944
[914]	valid_0's l1: 0.0300208	valid_0's l2: 0.00179925
[915]	valid_0's l1: 0.0300202	valid_0's l2: 0.00179905
[916]	valid_0's l1: 0.0300184	valid_0's l2: 0.00179881
[917]	valid_0's l1: 0.0300176	valid_0's l2: 0.00179869
[918]	valid_0's l1: 0.0300101	valid_0's l2: 0.00179788
[919]	valid_0's l1: 0.0300095	valid_0's l2: 0.00179783
[920]	valid_0's l1: 0.0300081	valid_0's l2: 0.00179767
[921]	valid_0's

[1052]	valid_0's l1: 0.0297285	valid_0's l2: 0.00176409
[1053]	valid_0's l1: 0.0297277	valid_0's l2: 0.00176402
[1054]	valid_0's l1: 0.0297246	valid_0's l2: 0.00176375
[1055]	valid_0's l1: 0.0297242	valid_0's l2: 0.00176361
[1056]	valid_0's l1: 0.0297237	valid_0's l2: 0.00176352
[1057]	valid_0's l1: 0.0297216	valid_0's l2: 0.00176328
[1058]	valid_0's l1: 0.0297206	valid_0's l2: 0.00176304
[1059]	valid_0's l1: 0.029717	valid_0's l2: 0.00176262
[1060]	valid_0's l1: 0.0297159	valid_0's l2: 0.0017625
[1061]	valid_0's l1: 0.029712	valid_0's l2: 0.00176213
[1062]	valid_0's l1: 0.0297118	valid_0's l2: 0.00176213
[1063]	valid_0's l1: 0.0297111	valid_0's l2: 0.00176205
[1064]	valid_0's l1: 0.0297096	valid_0's l2: 0.00176184
[1065]	valid_0's l1: 0.0297079	valid_0's l2: 0.0017616
[1066]	valid_0's l1: 0.0297048	valid_0's l2: 0.00176123
[1067]	valid_0's l1: 0.0297002	valid_0's l2: 0.00176096
[1068]	valid_0's l1: 0.0296991	valid_0's l2: 0.00176085
[1069]	valid_0's l1: 0.0296966	valid_0's l2: 0.00176

[1199]	valid_0's l1: 0.0295028	valid_0's l2: 0.00173791
[1200]	valid_0's l1: 0.0295017	valid_0's l2: 0.00173782
[1201]	valid_0's l1: 0.0295015	valid_0's l2: 0.00173777
[1202]	valid_0's l1: 0.0295001	valid_0's l2: 0.00173766
[1203]	valid_0's l1: 0.029497	valid_0's l2: 0.00173721
[1204]	valid_0's l1: 0.0294947	valid_0's l2: 0.00173684
[1205]	valid_0's l1: 0.0294939	valid_0's l2: 0.00173672
[1206]	valid_0's l1: 0.0294929	valid_0's l2: 0.00173663
[1207]	valid_0's l1: 0.0294921	valid_0's l2: 0.00173654
[1208]	valid_0's l1: 0.029492	valid_0's l2: 0.0017365
[1209]	valid_0's l1: 0.0294917	valid_0's l2: 0.00173648
[1210]	valid_0's l1: 0.029489	valid_0's l2: 0.00173629
[1211]	valid_0's l1: 0.0294887	valid_0's l2: 0.00173627
[1212]	valid_0's l1: 0.0294883	valid_0's l2: 0.00173625
[1213]	valid_0's l1: 0.0294836	valid_0's l2: 0.00173571
[1214]	valid_0's l1: 0.029482	valid_0's l2: 0.00173554
[1215]	valid_0's l1: 0.0294776	valid_0's l2: 0.00173504
[1216]	valid_0's l1: 0.0294767	valid_0's l2: 0.001734

[1347]	valid_0's l1: 0.0293164	valid_0's l2: 0.0017165
[1348]	valid_0's l1: 0.029316	valid_0's l2: 0.00171643
[1349]	valid_0's l1: 0.0293139	valid_0's l2: 0.00171621
[1350]	valid_0's l1: 0.0293135	valid_0's l2: 0.00171618
[1351]	valid_0's l1: 0.0293128	valid_0's l2: 0.00171604
[1352]	valid_0's l1: 0.0293114	valid_0's l2: 0.0017159
[1353]	valid_0's l1: 0.029311	valid_0's l2: 0.00171586
[1354]	valid_0's l1: 0.0293091	valid_0's l2: 0.00171563
[1355]	valid_0's l1: 0.0293088	valid_0's l2: 0.0017156
[1356]	valid_0's l1: 0.0293079	valid_0's l2: 0.00171548
[1357]	valid_0's l1: 0.0293062	valid_0's l2: 0.00171532
[1358]	valid_0's l1: 0.029305	valid_0's l2: 0.00171525
[1359]	valid_0's l1: 0.0293048	valid_0's l2: 0.00171523
[1360]	valid_0's l1: 0.0293047	valid_0's l2: 0.00171526
[1361]	valid_0's l1: 0.0293038	valid_0's l2: 0.00171516
[1362]	valid_0's l1: 0.0293048	valid_0's l2: 0.00171526
[1363]	valid_0's l1: 0.0293043	valid_0's l2: 0.0017152
[1364]	valid_0's l1: 0.0293037	valid_0's l2: 0.00171515

[1494]	valid_0's l1: 0.02918	valid_0's l2: 0.00170029
[1495]	valid_0's l1: 0.0291777	valid_0's l2: 0.00170008
[1496]	valid_0's l1: 0.0291773	valid_0's l2: 0.00170005
[1497]	valid_0's l1: 0.0291771	valid_0's l2: 0.00170001
[1498]	valid_0's l1: 0.0291767	valid_0's l2: 0.00169998
[1499]	valid_0's l1: 0.0291755	valid_0's l2: 0.0016998
[1500]	valid_0's l1: 0.0291746	valid_0's l2: 0.00169971
[1501]	valid_0's l1: 0.0291735	valid_0's l2: 0.00169961
[1502]	valid_0's l1: 0.0291729	valid_0's l2: 0.0016995
[1503]	valid_0's l1: 0.0291721	valid_0's l2: 0.00169946
[1504]	valid_0's l1: 0.0291713	valid_0's l2: 0.0016994
[1505]	valid_0's l1: 0.0291706	valid_0's l2: 0.00169927
[1506]	valid_0's l1: 0.0291693	valid_0's l2: 0.00169915
[1507]	valid_0's l1: 0.0291679	valid_0's l2: 0.0016991
[1508]	valid_0's l1: 0.0291634	valid_0's l2: 0.00169867
[1509]	valid_0's l1: 0.0291632	valid_0's l2: 0.00169857
[1510]	valid_0's l1: 0.0291622	valid_0's l2: 0.00169845
[1511]	valid_0's l1: 0.029161	valid_0's l2: 0.0016983


[1642]	valid_0's l1: 0.0290467	valid_0's l2: 0.00168458
[1643]	valid_0's l1: 0.0290465	valid_0's l2: 0.00168455
[1644]	valid_0's l1: 0.0290453	valid_0's l2: 0.00168445
[1645]	valid_0's l1: 0.0290453	valid_0's l2: 0.00168445
[1646]	valid_0's l1: 0.0290448	valid_0's l2: 0.00168436
[1647]	valid_0's l1: 0.0290447	valid_0's l2: 0.00168435
[1648]	valid_0's l1: 0.0290447	valid_0's l2: 0.00168436
[1649]	valid_0's l1: 0.0290446	valid_0's l2: 0.00168433
[1650]	valid_0's l1: 0.0290442	valid_0's l2: 0.0016843
[1651]	valid_0's l1: 0.0290442	valid_0's l2: 0.00168431
[1652]	valid_0's l1: 0.0290434	valid_0's l2: 0.00168427
[1653]	valid_0's l1: 0.0290428	valid_0's l2: 0.0016842
[1654]	valid_0's l1: 0.0290423	valid_0's l2: 0.0016841
[1655]	valid_0's l1: 0.0290421	valid_0's l2: 0.00168408
[1656]	valid_0's l1: 0.0290412	valid_0's l2: 0.00168399
[1657]	valid_0's l1: 0.029041	valid_0's l2: 0.00168394
[1658]	valid_0's l1: 0.0290403	valid_0's l2: 0.00168386
[1659]	valid_0's l1: 0.0290404	valid_0's l2: 0.00168

[1789]	valid_0's l1: 0.0289522	valid_0's l2: 0.0016733
[1790]	valid_0's l1: 0.0289521	valid_0's l2: 0.00167327
[1791]	valid_0's l1: 0.028952	valid_0's l2: 0.00167326
[1792]	valid_0's l1: 0.0289513	valid_0's l2: 0.00167316
[1793]	valid_0's l1: 0.0289498	valid_0's l2: 0.00167303
[1794]	valid_0's l1: 0.0289496	valid_0's l2: 0.00167296
[1795]	valid_0's l1: 0.0289478	valid_0's l2: 0.00167278
[1796]	valid_0's l1: 0.0289475	valid_0's l2: 0.00167274
[1797]	valid_0's l1: 0.0289468	valid_0's l2: 0.00167263
[1798]	valid_0's l1: 0.0289428	valid_0's l2: 0.00167224
[1799]	valid_0's l1: 0.0289429	valid_0's l2: 0.00167224
[1800]	valid_0's l1: 0.0289404	valid_0's l2: 0.001672
[1801]	valid_0's l1: 0.0289399	valid_0's l2: 0.00167194
[1802]	valid_0's l1: 0.0289398	valid_0's l2: 0.00167193
[1803]	valid_0's l1: 0.0289379	valid_0's l2: 0.00167174
[1804]	valid_0's l1: 0.0289375	valid_0's l2: 0.00167173
[1805]	valid_0's l1: 0.0289368	valid_0's l2: 0.00167165
[1806]	valid_0's l1: 0.0289353	valid_0's l2: 0.00167

[1937]	valid_0's l1: 0.0288559	valid_0's l2: 0.0016624
[1938]	valid_0's l1: 0.0288554	valid_0's l2: 0.00166228
[1939]	valid_0's l1: 0.0288545	valid_0's l2: 0.00166215
[1940]	valid_0's l1: 0.028854	valid_0's l2: 0.00166208
[1941]	valid_0's l1: 0.0288533	valid_0's l2: 0.00166202
[1942]	valid_0's l1: 0.028852	valid_0's l2: 0.00166188
[1943]	valid_0's l1: 0.028852	valid_0's l2: 0.00166188
[1944]	valid_0's l1: 0.0288519	valid_0's l2: 0.00166189
[1945]	valid_0's l1: 0.0288484	valid_0's l2: 0.00166145
[1946]	valid_0's l1: 0.0288479	valid_0's l2: 0.00166139
[1947]	valid_0's l1: 0.0288452	valid_0's l2: 0.00166119
[1948]	valid_0's l1: 0.0288448	valid_0's l2: 0.00166115
[1949]	valid_0's l1: 0.0288442	valid_0's l2: 0.00166108
[1950]	valid_0's l1: 0.0288435	valid_0's l2: 0.001661
[1951]	valid_0's l1: 0.028843	valid_0's l2: 0.00166096
[1952]	valid_0's l1: 0.028843	valid_0's l2: 0.00166095
[1953]	valid_0's l1: 0.0288414	valid_0's l2: 0.00166067
[1954]	valid_0's l1: 0.0288415	valid_0's l2: 0.00166069


[2084]	valid_0's l1: 0.0287657	valid_0's l2: 0.00165226
[2085]	valid_0's l1: 0.0287658	valid_0's l2: 0.00165228
[2086]	valid_0's l1: 0.0287657	valid_0's l2: 0.00165226
[2087]	valid_0's l1: 0.0287656	valid_0's l2: 0.00165224
[2088]	valid_0's l1: 0.0287655	valid_0's l2: 0.00165223
[2089]	valid_0's l1: 0.0287651	valid_0's l2: 0.00165221
[2090]	valid_0's l1: 0.0287647	valid_0's l2: 0.00165218
[2091]	valid_0's l1: 0.0287641	valid_0's l2: 0.00165215
[2092]	valid_0's l1: 0.0287637	valid_0's l2: 0.00165212
[2093]	valid_0's l1: 0.0287633	valid_0's l2: 0.00165209
[2094]	valid_0's l1: 0.0287629	valid_0's l2: 0.00165207
[2095]	valid_0's l1: 0.0287625	valid_0's l2: 0.00165205
[2096]	valid_0's l1: 0.0287619	valid_0's l2: 0.00165195
[2097]	valid_0's l1: 0.0287614	valid_0's l2: 0.00165192
[2098]	valid_0's l1: 0.0287616	valid_0's l2: 0.00165196
[2099]	valid_0's l1: 0.0287616	valid_0's l2: 0.00165194
[2100]	valid_0's l1: 0.0287612	valid_0's l2: 0.00165187
[2101]	valid_0's l1: 0.0287613	valid_0's l2: 0.0

[2231]	valid_0's l1: 0.0286857	valid_0's l2: 0.00164371
[2232]	valid_0's l1: 0.0286852	valid_0's l2: 0.00164368
[2233]	valid_0's l1: 0.0286846	valid_0's l2: 0.00164361
[2234]	valid_0's l1: 0.0286829	valid_0's l2: 0.00164346
[2235]	valid_0's l1: 0.0286829	valid_0's l2: 0.00164347
[2236]	valid_0's l1: 0.0286826	valid_0's l2: 0.00164343
[2237]	valid_0's l1: 0.0286823	valid_0's l2: 0.0016434
[2238]	valid_0's l1: 0.0286818	valid_0's l2: 0.00164334
[2239]	valid_0's l1: 0.0286813	valid_0's l2: 0.0016433
[2240]	valid_0's l1: 0.0286803	valid_0's l2: 0.00164324
[2241]	valid_0's l1: 0.0286799	valid_0's l2: 0.00164319
[2242]	valid_0's l1: 0.0286799	valid_0's l2: 0.00164317
[2243]	valid_0's l1: 0.0286788	valid_0's l2: 0.00164302
[2244]	valid_0's l1: 0.0286785	valid_0's l2: 0.00164299
[2245]	valid_0's l1: 0.0286783	valid_0's l2: 0.00164298
[2246]	valid_0's l1: 0.0286776	valid_0's l2: 0.00164293
[2247]	valid_0's l1: 0.0286767	valid_0's l2: 0.00164282
[2248]	valid_0's l1: 0.0286763	valid_0's l2: 0.001

[2379]	valid_0's l1: 0.0286218	valid_0's l2: 0.00163654
[2380]	valid_0's l1: 0.0286216	valid_0's l2: 0.00163652
[2381]	valid_0's l1: 0.0286212	valid_0's l2: 0.00163648
[2382]	valid_0's l1: 0.028621	valid_0's l2: 0.00163645
[2383]	valid_0's l1: 0.0286207	valid_0's l2: 0.0016364
[2384]	valid_0's l1: 0.0286205	valid_0's l2: 0.00163636
[2385]	valid_0's l1: 0.0286199	valid_0's l2: 0.00163629
[2386]	valid_0's l1: 0.0286199	valid_0's l2: 0.00163629
[2387]	valid_0's l1: 0.0286197	valid_0's l2: 0.00163625
[2388]	valid_0's l1: 0.0286194	valid_0's l2: 0.00163622
[2389]	valid_0's l1: 0.0286194	valid_0's l2: 0.00163624
[2390]	valid_0's l1: 0.0286195	valid_0's l2: 0.00163625
[2391]	valid_0's l1: 0.0286196	valid_0's l2: 0.00163626
[2392]	valid_0's l1: 0.028619	valid_0's l2: 0.00163619
[2393]	valid_0's l1: 0.0286186	valid_0's l2: 0.00163613
[2394]	valid_0's l1: 0.0286186	valid_0's l2: 0.00163613
[2395]	valid_0's l1: 0.028618	valid_0's l2: 0.00163607
[2396]	valid_0's l1: 0.0286178	valid_0's l2: 0.00163

[2526]	valid_0's l1: 0.0285569	valid_0's l2: 0.00162949
[2527]	valid_0's l1: 0.0285568	valid_0's l2: 0.00162944
[2528]	valid_0's l1: 0.0285568	valid_0's l2: 0.00162944
[2529]	valid_0's l1: 0.0285565	valid_0's l2: 0.0016294
[2530]	valid_0's l1: 0.0285549	valid_0's l2: 0.00162925
[2531]	valid_0's l1: 0.0285548	valid_0's l2: 0.00162924
[2532]	valid_0's l1: 0.0285545	valid_0's l2: 0.00162921
[2533]	valid_0's l1: 0.0285541	valid_0's l2: 0.00162918
[2534]	valid_0's l1: 0.0285537	valid_0's l2: 0.00162913
[2535]	valid_0's l1: 0.0285535	valid_0's l2: 0.00162911
[2536]	valid_0's l1: 0.0285529	valid_0's l2: 0.00162909
[2537]	valid_0's l1: 0.028553	valid_0's l2: 0.0016291
[2538]	valid_0's l1: 0.0285527	valid_0's l2: 0.00162905
[2539]	valid_0's l1: 0.0285524	valid_0's l2: 0.00162902
[2540]	valid_0's l1: 0.0285519	valid_0's l2: 0.00162899
[2541]	valid_0's l1: 0.0285517	valid_0's l2: 0.00162899
[2542]	valid_0's l1: 0.0285515	valid_0's l2: 0.00162898
[2543]	valid_0's l1: 0.0285514	valid_0's l2: 0.0016

[2674]	valid_0's l1: 0.0285057	valid_0's l2: 0.00162406
[2675]	valid_0's l1: 0.0285053	valid_0's l2: 0.00162404
[2676]	valid_0's l1: 0.028505	valid_0's l2: 0.00162402
[2677]	valid_0's l1: 0.0285047	valid_0's l2: 0.00162401
[2678]	valid_0's l1: 0.0285039	valid_0's l2: 0.00162396
[2679]	valid_0's l1: 0.0285033	valid_0's l2: 0.00162388
[2680]	valid_0's l1: 0.028503	valid_0's l2: 0.00162386
[2681]	valid_0's l1: 0.0285021	valid_0's l2: 0.0016238
[2682]	valid_0's l1: 0.0285016	valid_0's l2: 0.00162377
[2683]	valid_0's l1: 0.0285014	valid_0's l2: 0.00162375
[2684]	valid_0's l1: 0.0285005	valid_0's l2: 0.00162365
[2685]	valid_0's l1: 0.0285002	valid_0's l2: 0.00162365
[2686]	valid_0's l1: 0.0285	valid_0's l2: 0.00162363
[2687]	valid_0's l1: 0.0284998	valid_0's l2: 0.00162362
[2688]	valid_0's l1: 0.0284997	valid_0's l2: 0.00162361
[2689]	valid_0's l1: 0.0284996	valid_0's l2: 0.00162357
[2690]	valid_0's l1: 0.0284991	valid_0's l2: 0.00162353
[2691]	valid_0's l1: 0.0284984	valid_0's l2: 0.0016234

[2822]	valid_0's l1: 0.0284497	valid_0's l2: 0.00161828
[2823]	valid_0's l1: 0.0284476	valid_0's l2: 0.00161792
[2824]	valid_0's l1: 0.0284478	valid_0's l2: 0.00161794
[2825]	valid_0's l1: 0.028447	valid_0's l2: 0.0016178
[2826]	valid_0's l1: 0.0284463	valid_0's l2: 0.00161772
[2827]	valid_0's l1: 0.0284464	valid_0's l2: 0.00161772
[2828]	valid_0's l1: 0.028446	valid_0's l2: 0.00161767
[2829]	valid_0's l1: 0.0284458	valid_0's l2: 0.00161768
[2830]	valid_0's l1: 0.0284456	valid_0's l2: 0.00161766
[2831]	valid_0's l1: 0.0284453	valid_0's l2: 0.00161761
[2832]	valid_0's l1: 0.0284451	valid_0's l2: 0.00161756
[2833]	valid_0's l1: 0.0284449	valid_0's l2: 0.00161753
[2834]	valid_0's l1: 0.0284447	valid_0's l2: 0.00161755
[2835]	valid_0's l1: 0.0284433	valid_0's l2: 0.00161738
[2836]	valid_0's l1: 0.0284431	valid_0's l2: 0.00161736
[2837]	valid_0's l1: 0.0284424	valid_0's l2: 0.00161729
[2838]	valid_0's l1: 0.0284421	valid_0's l2: 0.00161728
[2839]	valid_0's l1: 0.0284418	valid_0's l2: 0.0016

[2969]	valid_0's l1: 0.0284096	valid_0's l2: 0.00161339
[2970]	valid_0's l1: 0.0284094	valid_0's l2: 0.00161336
[2971]	valid_0's l1: 0.0284094	valid_0's l2: 0.00161334
[2972]	valid_0's l1: 0.0284093	valid_0's l2: 0.00161334
[2973]	valid_0's l1: 0.0284093	valid_0's l2: 0.00161334
[2974]	valid_0's l1: 0.028409	valid_0's l2: 0.00161331
[2975]	valid_0's l1: 0.0284085	valid_0's l2: 0.00161322
[2976]	valid_0's l1: 0.0284087	valid_0's l2: 0.00161323
[2977]	valid_0's l1: 0.0284086	valid_0's l2: 0.00161323
[2978]	valid_0's l1: 0.0284084	valid_0's l2: 0.00161323
[2979]	valid_0's l1: 0.0284084	valid_0's l2: 0.00161323
[2980]	valid_0's l1: 0.0284083	valid_0's l2: 0.00161323
[2981]	valid_0's l1: 0.0284078	valid_0's l2: 0.00161321
[2982]	valid_0's l1: 0.0284078	valid_0's l2: 0.00161322
[2983]	valid_0's l1: 0.0284077	valid_0's l2: 0.00161323
[2984]	valid_0's l1: 0.0284074	valid_0's l2: 0.0016132
[2985]	valid_0's l1: 0.0284071	valid_0's l2: 0.00161313
[2986]	valid_0's l1: 0.0284071	valid_0's l2: 0.001

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.34,
       learning_rate=0.03, max_depth=12, min_child_samples=110,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=3000,
       n_jobs=8, num_leaves=100, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1,
       subsample_for_bin=200000, subsample_freq=0, verbose=1)

In [41]:
# Pair every feature name with the importance reported by the fitted model.
feature_importances_df = pd.DataFrame({
    'fearture': fearture_columns,
    'feature_importances': gbm.feature_importances_,
})

# Disabled alternatives, kept for reference:
# feature_importances_df.loc[feature_importances_df['feature_importances']>0].sort_values(by='feature_importances',ascending=False)
# feature_importances_df = pd.merge(feature_importances_df,check_item_value[['check_item_value_code','check_item_value_name']],left_on='fearture',right_on='check_item_value_code',how='left')

# Most important features first (displayed as the cell output).
feature_importances_df.sort_values(by='feature_importances',ascending=False)



Unnamed: 0,fearture,feature_importances
14,series_code_index,34444
10,province_name,15074
13,brand_code_index,14825
0,months,7990
5,model_year,7544
4,new_car_price,7242
16,color_index,6992
9,year_err,6334
1,mileage_log,5843
17,keep_value,5528


In [102]:
# ## Persist the fitted model
import os
from sklearn.externals import joblib
from dayu.hooks.oss_hook import OSSHook

# One name for the artifact, used for both the local file and the OSS key.
model_file = 'biaozhun_model_车况一般_2021-02-02.pkl'
ph = curr_dir + model_file

# Serialize the fitted regressor into the working directory.
joblib.dump(gbm, ph)

# Upload the local pickle to OSS.
oss = OSSHook("oss_algorithm")
oss.put_file("algorithm/qiongjiu/valuation/全网数据-优化模型/" + model_file, ph)



[2021-02-02 17:40:28,669] {oss_hook:28} INFO - Done. Loaded the key algorithm/qiongjiu/valuation/全网数据-优化模型/biaozhun_model_车况一般_2021-02-02.pkl .


In [54]:
# gbm = joblib.load(curr_dir+'biaozhun_model_车况一般_2020-12-21_16点_A.pkl')


In [55]:
# import requests
# import pandas as pd 

# params={}
# params['modelCode']= '10006-n'
# params['regDate']= '2016-03-01'
# params['mile']= '3.92'

# params['cityCode']= '01756'
# params['evaluateDate'] = '2020-12-01'
# params['transferTimes']= '0'
# params['colorName'] = '红色'

# params['carCondition'] = 'condition_all'
# params['userId']= 'IiQ5XSVJUJ'
# # VzODeebWFWyUhiAkOGVvdKlZJpsbGIwS
# # lCLrtztVfUUDDIgbWWFoLROiowMKszud
# ## 测试环境
# params['busCode']= 'nnVVRxQqqHSMMYzgaFEDCQtWVvJtPuqb'
# req = requests.post('http://enterprise.stable.dasouche-inc.net/residual-test/getModelResidualV3Api?',params=params)

# data_r = req.json()
# fearture_list = []
# for col in fearture_columns:
#     fearture_list.append(data_r['data']['data']['modelParam'][col])

# gbm.predict([fearture_list])



In [42]:
# Score the test split with the fitted model, then store the absolute
# prediction error divided by the observed `residual` value.
test_pred = gbm.predict(test_data[fearture_columns])
test_data['ypred'] = test_pred
test_data['err'] = (test_data['ypred'] - test_data['residual']).abs() / test_data['residual']

# train_data['ypred'] = gbm.predict(train_data[fearture_columns])
# train_data['err'] = abs(train_data['ypred'] - train_data['residual']) / train_data['residual']


In [43]:
def data3m_pinggu(data_df,col):
    """Print the share of rows whose ``col`` value lies inside each error band.

    For every band (3%, 5%, 8%, 10%, 20%) one line is printed with the
    fraction of rows of ``data_df`` satisfying ``-band <= data_df[col] <= band``,
    rounded to 4 decimals.  Rows whose value is NaN match no band but still
    count in the denominator.

    Args:
        data_df: DataFrame that contains the column to evaluate.
        col: name of that column.

    Returns:
        None; the figures are only printed.  For an empty frame every band
        is reported as ``nan`` instead of raising ZeroDivisionError.
    """
    # (printed caption, half-width of the band); captions are kept verbatim
    # so the printed report is unchanged.
    bands = (("P<3%: ", 0.03),
             ("P<5%: ", 0.05),
             ("P<8%: ", 0.08),
             ("P<10%: ", 0.1),
             ("P<20%: ", 0.2))
    total = data_df.shape[0]
    values = data_df[col]
    for caption, band in bands:
        if total == 0:
            ratio = float('nan')
        else:
            # Count matches on the boolean mask itself; selecting the
            # matching rows would copy every column of the frame per band.
            hits = int(((values >= -band) & (values <= band)).sum())
            ratio = round(hits/total,4)
        print(caption, ratio)
# Accuracy bands of the `err` column on the test split.
print("测试集。。。。")
data3m_pinggu(data_df=test_data, col='err')


训练集。。。。
测试集。。。。
P<3%:  0.3837
P<5%:  0.5793
P<8%:  0.7675
P<10%:  0.8408
P<20%:  0.9633


In [44]:
# For both splits: store the model prediction, then scale it by the new-car
# price to obtain the predicted price column.
for split_frame in (train_data, test_data):
    split_frame['ypred'] = gbm.predict(split_frame[fearture_columns])
    split_frame['ypred_price'] = split_frame['ypred'] * split_frame['new_car_price']



In [46]:
# Restrict the test split to rows where none of the three delete flags is 1,
# then print that subset's shape and its accuracy bands.
not_deleted = ((test_data['delete_flag'] != 1)
               & (test_data['delete_flag_1'] != 1)
               & (test_data['delete_flag_2'] != 1))
test_data_kept = test_data.loc[not_deleted]
print(test_data_kept.shape)
data3m_pinggu(test_data_kept,'err')


(47765, 307)
P<3%:  0.4195
P<5%:  0.6162
P<8%:  0.7921
P<10%:  0.8581
P<20%:  0.965


In [75]:
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
import lightgbm as lgb

# Hyper-parameters of the regressor that is wrapped for PMML export.
pmml_regressor_params = dict(
    boosting_type = 'gbdt',
    objective = 'regression',
    learning_rate = 0.03,
    n_estimators = 3000,
    max_depth = 12,
    num_leaves = 100,
    subsample = 1,
    colsample_bytree = 0.34,
    min_child_samples = 110,
    n_jobs = 8,
    verbose = 1,
)

# Single-step pipeline so the fitted model can be handed to sklearn2pmml.
pmml_regressor = lgb.LGBMRegressor(**pmml_regressor_params)
pipeline = PMMLPipeline([("regressor", pmml_regressor)])

# NOTE(review): unlike the gbm.fit call earlier in this notebook, this fit
# passes no categorical_feature list, no eval_set and no early stopping, so
# the exported model is not the same model as `gbm` (unless the categorical
# columns already carry a pandas category dtype) - confirm this is intended.
pipeline.fit(train_data[fearture_columns],
            train_data[label])



PMMLPipeline(steps=[('regressor', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.34,
       learning_rate=0.03, max_depth=12, min_child_samples=110,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=3000,
       n_jobs=8, num_leaves=100, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1,
       subsample_for_bin=200000, subsample_freq=0, verbose=1))])

In [76]:
def data3m_pinggu(data_df, col, thresholds=(0.03, 0.05, 0.08, 0.1, 0.2)):
    """Print the share of rows whose ``col`` value lies inside each tolerance band.

    For every threshold ``t`` the function counts the rows with
    ``-t <= data_df[col] <= t`` (NaN never matches), divides by the total
    number of rows and prints one line such as ``P<3%:  0.4195``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the error column.
    col : str
        Name of the column to evaluate.
    thresholds : iterable of float, optional
        Band half-widths expressed as fractions. The default reproduces the
        original 3% / 5% / 8% / 10% / 20% report.

    Returns
    -------
    None
        Results are only printed, so a call at the end of a notebook cell
        produces no extra output.
    """
    total = data_df.shape[0]
    for limit in thresholds:
        if total:
            in_band = (data_df[col] >= -limit) & (data_df[col] <= limit)
            share = round(int(in_band.sum()) / total, 4)
        else:
            # An empty frame used to raise ZeroDivisionError; report nan instead.
            share = float('nan')
        # print() inserts a space between its arguments, which yields the
        # established "P<3%:  0.4195" layout (two spaces after the colon).
        print("P<{:g}%: ".format(limit * 100), share)

# Score the hold-out frame with the PMML pipeline and store the absolute
# prediction gap relative to the `residual` column as `err`.
test_data['ypred'] = pipeline.predict(test_data[fearture_columns])
prediction_gap = (test_data['ypred'] - test_data['residual']).abs()
test_data['err'] = prediction_gap / test_data['residual']

# Accuracy bands over every row first ...
data3m_pinggu(test_data, 'err')

# ... then over the rows where none of the delete-flag columns equals 1.
flag_columns = ['delete_flag', 'delete_flag_1', 'delete_flag_2']
unflagged_rows = test_data.loc[(test_data[flag_columns] != 1).all(axis=1)]
print(unflagged_rows.shape)
data3m_pinggu(unflagged_rows, 'err')


P<3%:  0.3747
P<5%:  0.5681
P<8%:  0.7577
P<10%:  0.8336
P<20%:  0.9618
(47765, 307)
P<3%:  0.414
P<5%:  0.61
P<8%:  0.7871
P<10%:  0.8534
P<20%:  0.9636


In [77]:
from dayu.hooks.oss_hook import OSSHook

# Serialise the fitted pipeline to a local PMML file.
ph = curr_dir+"enterprise2_AAA_level_dfc_retail_quan.pmml"
sklearn2pmml(pipeline, ph, with_repr = True)

# Upload that file to OSS under a key that contains the run date.
remote_key = "algorithm/qiongjiu/valuation/全网数据-优化模型/"+curr_date+"/enterprise2_AAA_level_dfc_retail_quan.pmml"
oss = OSSHook("oss_algorithm")
oss.put_file(remote_key, ph)


[2021-02-03 14:33:54,956] {oss_hook:28} INFO - Done. Loaded the key algorithm/qiongjiu/valuation/全网数据-优化模型/2021-02-03/enterprise2_AAA_level_dfc_retail_quan.pmml .


In [78]:
# Persist the scored hold-out frame.
# NOTE(review): the date in this file name is a literal, not derived from curr_date — confirm it is intended.
test_data.to_csv(curr_dir+"enterprise2_A_test_2021-02-02.csv")

# Median `err` per model_code, exposed as `median_res`.
by_model = (
    test_data[['model_code','err']]
    .groupby("model_code")
    .median()
    .rename(columns={"err":"median_res"})
    .reset_index()
)

ph = curr_dir+"enterprise2_AAA_level_retail_median_res.xlsx"
by_model.to_excel(ph)



In [None]:
## ====================================================
##       下面是模型在各数据集上对比评估
## ====================================================

In [47]:
# Hold-out rows: every 'cyp' row, plus 'dfc_purchase' rows published on or
# after 2020-12-01, followed by every 'cheniu' row.
order_type = dl_site_ts_order_clean['data_type']
published_recently = dl_site_ts_order_clean['publish_time'] >= '2020-12-01'
cyp_or_recent_dfc = (order_type == 'cyp') | (published_recently & (order_type == 'dfc_purchase'))

test_data_cheniu = dl_site_ts_order_clean.loc[order_type == 'cheniu']
test_data = pd.concat([dl_site_ts_order_clean.loc[cyp_or_recent_dfc], test_data_cheniu], axis=0)

# A persisted model can be loaded instead of the in-memory `gbm`:
# gbm = joblib.load(curr_dir+'biaozhun_model_车况一般_2020-12-28_16点_A_0.pkl')
test_data['ypred'] = gbm.predict(test_data[fearture_columns])
test_data['new_predict_price'] = test_data['ypred'] * test_data['new_car_price']


# 对大风车采购、车牛数据评估、车易拍新旧模型对比评估

In [54]:

# Rebuild the hold-out frame: all 'cyp' rows, 'dfc_purchase' rows published
# on or after 2020-12-01, and then all 'cheniu' rows appended at the end.
source_type = dl_site_ts_order_clean['data_type']
is_recent = dl_site_ts_order_clean['publish_time'] >= '2020-12-01'
keep_cyp_or_recent_dfc = (source_type == 'cyp') | (is_recent & (source_type == 'dfc_purchase'))

test_data_cheniu = dl_site_ts_order_clean.loc[source_type == 'cheniu']
test_data = pd.concat([dl_site_ts_order_clean.loc[keep_cyp_or_recent_dfc], test_data_cheniu], axis=0)

# Model output and the price obtained by scaling it with the new-car price.
test_data['ypred'] = gbm.predict(test_data[fearture_columns])
test_data['new_predict_price'] = test_data['ypred'] * test_data['new_car_price']

def get_guzhi_interface(model_code,license_time,mileage,city_code,publish_time,timeout=30):
    """Call the residual-value HTTP endpoint for one vehicle and return its JSON reply.

    The arguments are sent as query parameters of a POST request:
    ``model_code`` -> ``modelCode``, ``license_time`` -> ``regDate``,
    ``mileage`` -> ``mile``, ``city_code`` -> ``cityCode`` and
    ``publish_time`` -> ``evaluateDate``. ``carCondition`` is always
    ``'condition_all'``.

    Parameters
    ----------
    model_code, license_time, mileage, city_code, publish_time
        Vehicle attributes forwarded unchanged to the service.
    timeout : float or tuple, optional
        Passed to ``requests.post``. Without it a stalled connection would
        block the caller forever, which matters when this function is called
        once per row of a large frame.

    Returns
    -------
    object
        The decoded JSON body of the response.

    Raises
    ------
    Exception
        Whatever ``requests.post`` raises (including timeouts) or
        ``Response.json`` raises on a non-JSON body; nothing is caught here.
    """
    import requests

    # NOTE(review): userId / busCode are credentials hard-coded in the
    # notebook; consider loading them from the environment or a secrets store.
    # presumably an alternative busCode: lCLrtztVfUUDDIgbWWFoLROiowMKszud
    params = {
        'modelCode': model_code,
        'regDate': license_time,
        'mile': mileage,
        'cityCode': city_code,
        'evaluateDate': publish_time,
        'carCondition': 'condition_all',
        'userId': 'IiQ5XSVJUJ',
        # test environment
        'busCode': 'VzODeebWFWyUhiAkOGVvdKlZJpsbGIwS',
    }
    req = requests.post('http://enterprise.stable.dasouche-inc.net/residualTest/getModelResidualV2Api.json?',
                        params=params, timeout=timeout)

    data_r = req.json()
    return data_r

# City-name -> city-code lookup; the code is left-padded with zeros and cut to its last 5 characters.
province_city_data = pd.read_excel("../../province_city_data.xlsx")[['city_name','city_code']]
province_city_data['city_code'] = province_city_data['city_code'].map(lambda x:("000000"+str(x))[-5:])

# Strip the '市' suffix so the names can be matched against the lookup table.
test_data['a_city_name'] =  test_data['a_city_name'].map(lambda x:str(x).replace('市',''))
test_data_df = test_data[['car_id', 'model_code','mileage' ,'license_time', 'publish_time', 'a_province_name', 'a_city_name','real_pay_amount']]

# NOTE(review): a left merge multiplies rows if city_name is not unique in the
# lookup table — confirm uniqueness or de-duplicate before merging.
test_data_df = pd.merge(test_data_df,province_city_data,left_on='a_city_name',right_on='city_name',how='left')

# Ask the existing valuation service for its estimate, one request per row.
test_data_list = []
for i, data in enumerate(test_data_df.to_dict(orient='records'), start=1):
    if i % 100 == 0: print(i)
    try:
        data_r = get_guzhi_interface(data['model_code'],data['license_time'],data['mileage'],
                            data['city_code'],data['publish_time'])

        data['old_pred'] = data_r['data']['data']['dealerBuy']['good']['residual']
    except Exception:
        # Best effort: a failed request or an unexpected payload yields no
        # estimate for this row. `Exception` (not a bare except) keeps
        # Ctrl-C / kernel interrupt working during this long loop.
        data['old_pred'] = None
    test_data_list.append(data)

test_data_df = pd.DataFrame(test_data_list)



100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600


In [55]:
# Per-model margin table read from disk.
retial_purchase_margin = pd.read_csv(curr_dir+"enterprise2_retial_purchase_margin.csv")

# Attach the margin columns by model_code and the service estimate by car_id.
# NOTE(review): this rebinds `test_data` to the merged frame, so running the
# cell twice merges the same columns again.
test_data = (
    test_data
    .merge(retial_purchase_margin, on='model_code', how='left')
    .merge(test_data_df[['old_pred','car_id']], on='car_id', how='left')
)

# NOTE(review): the scaling factor is 0.96 here while the dfc mapping cell
# applies 0.92 in the same formula — confirm which one is intended.
adjusted_ratio = test_data['ypred']*0.96 - test_data['margin_median']
test_data['ypred_price'] = adjusted_ratio * test_data['new_car_price']


# 对大风车采购数据评估

In [60]:

# Explicit copy: error columns are added below, and assigning onto a `.loc`
# slice is a chained assignment (SettingWithCopyWarning).
test_data_dfc = test_data.loc[test_data['data_type'] == 'dfc_purchase'].copy()

# Relative absolute error of the new prediction and of the service estimate
# against the price actually paid.
test_data_dfc['err_dfc_new'] = abs(test_data_dfc['ypred_price'] - test_data_dfc['real_pay_amount'] )/test_data_dfc['real_pay_amount'] 
test_data_dfc['err_dfc_old'] = abs(test_data_dfc['old_pred'] - test_data_dfc['real_pay_amount'] )/test_data_dfc['real_pay_amount'] 

# De-duplicate once and reuse the result for both reports.
test_data_dfc_unique = test_data_dfc.drop_duplicates()
data3m_pinggu(test_data_dfc_unique,'err_dfc_new')
data3m_pinggu(test_data_dfc_unique,'err_dfc_old')

# NOTE(review): written relative to the working directory, unlike the other
# exports in this notebook which are prefixed with curr_dir — confirm.
test_data_dfc[['car_id','model_code','brand_name','series_name','model_name','real_pay_amount','mileage',
              'license_time', 'publish_time', 'a_province_name', 'a_city_name','ypred_price','old_pred',
               'err_dfc_new','err_dfc_old','guide_price']].to_csv('test_data_dfc_out_data.csv')



P<3%:  0.2282
P<5%:  0.3679
P<8%:  0.5445
P<10%:  0.6173
P<20%:  0.8051
P<3%:  0.2686
P<5%:  0.4265
P<8%:  0.592
P<10%:  0.6734
P<20%:  0.8592


In [304]:
# Columns shown when inspecting individual dfc_purchase predictions.
dfc_review_columns = [
    'model_code', 'brand_name', 'model_name',
    'license_time_year', 'publish_time_year',
    'ypred', 'new_predict_price', 'margin_median', 'new_car_price',
    'old_pred', 'ypred_price', 'err_dfc_new', 'err_dfc_old',
]
# Last expression of the cell: rendered as a table.
test_data_dfc.drop_duplicates()[dfc_review_columns]


Unnamed: 0,model_code,brand_name,model_name,license_time_year,publish_time_year,ypred,new_predict_price,margin_median,new_car_price,old_pred,ypred_price,err_dfc_new,err_dfc_old
0,10034,标致,2008款 标致307 两厢 1.6L 手动精致版,2008,2020,0.103893,1.140749,0.049940,10.98,0.77,0.501144,0.392065,1.138889
2,10034,标致,2008款 标致307 两厢 1.6L 手动精致版,2008,2020,0.103893,1.140749,0.049940,10.98,0.77,0.501144,0.392065,1.138889
4,10050-n,大众,2016款 途观 300TSI 自动两驱风尚版,2016,2020,0.559720,11.854867,0.027965,21.18,9.92,10.314185,0.031532,0.068545
5,100648,雪佛兰,2012款 科鲁兹 1.6L SE MT,2013,2020,0.256316,3.073225,0.039580,11.99,2.00,2.352799,0.120381,0.047619
7,100648,雪佛兰,2012款 科鲁兹 1.6L SE MT,2013,2020,0.256316,3.073225,0.039580,11.99,2.00,2.352799,0.120381,0.047619
9,10066,现代,2008款 悦动 1.6L MT GL,2009,2020,0.192797,1.924118,0.038875,9.98,1.03,1.382212,1.126481,0.584615
11,10066,现代,2008款 悦动 1.6L MT GL,2009,2020,0.192797,1.924118,0.038875,9.98,1.03,1.382212,1.126481,0.584615
13,100775,长城,2011款 炫丽 CROSS 1.5L MT,2011,2020,0.241616,1.519765,0.056942,6.29,0.61,1.040021,0.386694,0.186667
15,100775,长城,2011款 炫丽 CROSS 1.5L MT,2011,2020,0.241616,1.519765,0.056942,6.29,0.61,1.040021,0.386694,0.186667
17,101212,铃木,2011款 雨燕 1.3L 手动超值版,2011,2020,0.360281,2.154480,0.056084,5.98,1.20,1.646740,0.097827,0.200000


# 对大风车采购做映射模型评估

In [295]:
# Split the 'dfc_purchase' rows by publish_time: before 2020-11-01 for
# training, on or after that date for testing.
dfc_rows = dl_site_ts_order_clean['data_type'] == 'dfc_purchase'
train_dfc_data = dl_site_ts_order_clean.loc[dfc_rows & (dl_site_ts_order_clean['publish_time'] < '2020-11-01')]
test_dfc_data = dl_site_ts_order_clean.loc[dfc_rows & (dl_site_ts_order_clean['publish_time'] >= '2020-11-01')]

# A persisted model can be loaded instead of the in-memory `gbm`:
# gbm = joblib.load(curr_dir+'biaozhun_model_车况一般_2020-12-28_16点_A_0.pkl')

# Attach the per-model margin columns to both splits.
train_dfc_data = train_dfc_data.merge(retial_purchase_margin, on='model_code', how='left')
test_dfc_data = test_dfc_data.merge(retial_purchase_margin, on='model_code', how='left')

# Score each split and derive the margin-adjusted price from the prediction.
for dfc_frame in (train_dfc_data, test_dfc_data):
    dfc_frame['ypred'] = gbm.predict(dfc_frame[fearture_columns])
    dfc_frame['ypred_price'] = (dfc_frame['ypred']*0.92 - dfc_frame['margin_median']) * dfc_frame['new_car_price']


# Inputs of the mapping model: the 100 word-vector columns
# (wordvec0 ... wordvec99) followed by the adjusted price estimate.
fearture_columns_cn = ['wordvec{}'.format(idx) for idx in range(100)] + ['ypred_price']
# Target column of the mapping model.
label = 'real_pay_amount'

import lightgbm as lgb
# NOTE(review): this rebinds `gbm`, the name used earlier in this cell to
# produce `ypred`; re-running those earlier lines afterwards would score with
# this mapping model instead.
gbm = lgb.LGBMRegressor(boosting_type = 'gbdt', 
                        objective = 'regression',
                        learning_rate = 0.03, 
                        n_estimators = 3000,
                        max_depth = 12,
                        num_leaves = 100, 
                        subsample = 1, 
                        colsample_bytree = 0.34,
                        min_child_samples = 110, 
                        n_jobs = 8,
                        verbose = 1)

# eval_metric is a list (not a set) so the metric order in the training log
# is the same on every run; eval_set uses the list-of-(X, y) form.
# NOTE(review): early stopping is driven by the same frame that is used for
# the accuracy report below, so the reported numbers may be optimistic.
gbm.fit(train_dfc_data[fearture_columns_cn],
        train_dfc_data[label],
        eval_set = [(test_dfc_data[fearture_columns_cn],
                     test_dfc_data[label])],
        eval_metric = ['l1','l2'], 
        early_stopping_rounds = 50, 
        verbose=True,
       )

# Relative absolute error of the mapping model against the price actually paid.
test_dfc_data['ypred_cn_new'] = gbm.predict(test_dfc_data[fearture_columns_cn])
test_dfc_data['err_cn_new'] = abs(test_dfc_data['ypred_cn_new'] - test_dfc_data['real_pay_amount'] )/test_dfc_data['real_pay_amount'] 

data3m_pinggu(test_dfc_data,'err_cn_new')


[1]	valid_0's l1: 4.29727	valid_0's l2: 40.2702
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l1: 4.18922	valid_0's l2: 38.4658
[3]	valid_0's l1: 4.08281	valid_0's l2: 36.7567
[4]	valid_0's l1: 3.97857	valid_0's l2: 35.1583
[5]	valid_0's l1: 3.86324	valid_0's l2: 33.2693
[6]	valid_0's l1: 3.76886	valid_0's l2: 31.817
[7]	valid_0's l1: 3.6595	valid_0's l2: 30.0865
[8]	valid_0's l1: 3.55407	valid_0's l2: 28.4756
[9]	valid_0's l1: 3.46585	valid_0's l2: 27.2285
[10]	valid_0's l1: 3.36599	valid_0's l2: 25.7679
[11]	valid_0's l1: 3.28403	valid_0's l2: 24.6593
[12]	valid_0's l1: 3.20395	valid_0's l2: 23.6141
[13]	valid_0's l1: 3.12521	valid_0's l2: 22.5701
[14]	valid_0's l1: 3.04642	valid_0's l2: 21.5292
[15]	valid_0's l1: 2.97453	valid_0's l2: 20.6395
[16]	valid_0's l1: 2.90131	valid_0's l2: 19.7218
[17]	valid_0's l1: 2.83141	valid_0's l2: 18.8817
[18]	valid_0's l1: 2.75135	valid_0's l2: 17.9054
[19]	valid_0's l1: 2.67421	valid_0's l2: 16.9936
[20]	valid_0's l1:

[174]	valid_0's l1: 0.671181	valid_0's l2: 1.55643
[175]	valid_0's l1: 0.670824	valid_0's l2: 1.55243
[176]	valid_0's l1: 0.669634	valid_0's l2: 1.54508
[177]	valid_0's l1: 0.668478	valid_0's l2: 1.53777
[178]	valid_0's l1: 0.668515	valid_0's l2: 1.53602
[179]	valid_0's l1: 0.667485	valid_0's l2: 1.52632
[180]	valid_0's l1: 0.666183	valid_0's l2: 1.51892
[181]	valid_0's l1: 0.665733	valid_0's l2: 1.51351
[182]	valid_0's l1: 0.665884	valid_0's l2: 1.51041
[183]	valid_0's l1: 0.664814	valid_0's l2: 1.50352
[184]	valid_0's l1: 0.66364	valid_0's l2: 1.49636
[185]	valid_0's l1: 0.663904	valid_0's l2: 1.49494
[186]	valid_0's l1: 0.662783	valid_0's l2: 1.48794
[187]	valid_0's l1: 0.662915	valid_0's l2: 1.4867
[188]	valid_0's l1: 0.662602	valid_0's l2: 1.48242
[189]	valid_0's l1: 0.661339	valid_0's l2: 1.47253
[190]	valid_0's l1: 0.660294	valid_0's l2: 1.46417
[191]	valid_0's l1: 0.659292	valid_0's l2: 1.45595
[192]	valid_0's l1: 0.659502	valid_0's l2: 1.45464
[193]	valid_0's l1: 0.65939	valid

[348]	valid_0's l1: 0.621644	valid_0's l2: 1.04133
[349]	valid_0's l1: 0.621448	valid_0's l2: 1.03975
[350]	valid_0's l1: 0.621537	valid_0's l2: 1.03914
[351]	valid_0's l1: 0.621093	valid_0's l2: 1.03632
[352]	valid_0's l1: 0.62104	valid_0's l2: 1.03534
[353]	valid_0's l1: 0.620918	valid_0's l2: 1.03412
[354]	valid_0's l1: 0.620833	valid_0's l2: 1.03262
[355]	valid_0's l1: 0.620766	valid_0's l2: 1.03189
[356]	valid_0's l1: 0.620825	valid_0's l2: 1.03144
[357]	valid_0's l1: 0.620736	valid_0's l2: 1.03019
[358]	valid_0's l1: 0.620546	valid_0's l2: 1.0286
[359]	valid_0's l1: 0.620402	valid_0's l2: 1.02781
[360]	valid_0's l1: 0.62053	valid_0's l2: 1.02733
[361]	valid_0's l1: 0.620244	valid_0's l2: 1.02519
[362]	valid_0's l1: 0.620215	valid_0's l2: 1.0243
[363]	valid_0's l1: 0.620296	valid_0's l2: 1.02392
[364]	valid_0's l1: 0.620223	valid_0's l2: 1.02282
[365]	valid_0's l1: 0.620226	valid_0's l2: 1.02204
[366]	valid_0's l1: 0.619907	valid_0's l2: 1.0193
[367]	valid_0's l1: 0.619816	valid_0

[526]	valid_0's l1: 0.603853	valid_0's l2: 0.866957
[527]	valid_0's l1: 0.60388	valid_0's l2: 0.866462
[528]	valid_0's l1: 0.603848	valid_0's l2: 0.866087
[529]	valid_0's l1: 0.60383	valid_0's l2: 0.865479
[530]	valid_0's l1: 0.603835	valid_0's l2: 0.86532
[531]	valid_0's l1: 0.603816	valid_0's l2: 0.864955
[532]	valid_0's l1: 0.603888	valid_0's l2: 0.864789
[533]	valid_0's l1: 0.603865	valid_0's l2: 0.864372
[534]	valid_0's l1: 0.603902	valid_0's l2: 0.864208
[535]	valid_0's l1: 0.603977	valid_0's l2: 0.864251
[536]	valid_0's l1: 0.603781	valid_0's l2: 0.86299
[537]	valid_0's l1: 0.603773	valid_0's l2: 0.862844
[538]	valid_0's l1: 0.603698	valid_0's l2: 0.862376
[539]	valid_0's l1: 0.603559	valid_0's l2: 0.86132
[540]	valid_0's l1: 0.603523	valid_0's l2: 0.861024
[541]	valid_0's l1: 0.603389	valid_0's l2: 0.859384
[542]	valid_0's l1: 0.60333	valid_0's l2: 0.859029
[543]	valid_0's l1: 0.603224	valid_0's l2: 0.857894
[544]	valid_0's l1: 0.603247	valid_0's l2: 0.857638
[545]	valid_0's l1

[707]	valid_0's l1: 0.593844	valid_0's l2: 0.783223
[708]	valid_0's l1: 0.59377	valid_0's l2: 0.782871
[709]	valid_0's l1: 0.593618	valid_0's l2: 0.782059
[710]	valid_0's l1: 0.593581	valid_0's l2: 0.781759
[711]	valid_0's l1: 0.593589	valid_0's l2: 0.78171
[712]	valid_0's l1: 0.593595	valid_0's l2: 0.781574
[713]	valid_0's l1: 0.593496	valid_0's l2: 0.781053
[714]	valid_0's l1: 0.593465	valid_0's l2: 0.780842
[715]	valid_0's l1: 0.593489	valid_0's l2: 0.780744
[716]	valid_0's l1: 0.593428	valid_0's l2: 0.780107
[717]	valid_0's l1: 0.59337	valid_0's l2: 0.779825
[718]	valid_0's l1: 0.59339	valid_0's l2: 0.779722
[719]	valid_0's l1: 0.59331	valid_0's l2: 0.779234
[720]	valid_0's l1: 0.593248	valid_0's l2: 0.7789
[721]	valid_0's l1: 0.59316	valid_0's l2: 0.778513
[722]	valid_0's l1: 0.593227	valid_0's l2: 0.778573
[723]	valid_0's l1: 0.593198	valid_0's l2: 0.778418
[724]	valid_0's l1: 0.593172	valid_0's l2: 0.778074
[725]	valid_0's l1: 0.593154	valid_0's l2: 0.777938
[726]	valid_0's l1: 

[879]	valid_0's l1: 0.587143	valid_0's l2: 0.740466
[880]	valid_0's l1: 0.587164	valid_0's l2: 0.740497
[881]	valid_0's l1: 0.587119	valid_0's l2: 0.740262
[882]	valid_0's l1: 0.586982	valid_0's l2: 0.739814
[883]	valid_0's l1: 0.587013	valid_0's l2: 0.739852
[884]	valid_0's l1: 0.58698	valid_0's l2: 0.73969
[885]	valid_0's l1: 0.586898	valid_0's l2: 0.739388
[886]	valid_0's l1: 0.586869	valid_0's l2: 0.739194
[887]	valid_0's l1: 0.586849	valid_0's l2: 0.73905
[888]	valid_0's l1: 0.586784	valid_0's l2: 0.738884
[889]	valid_0's l1: 0.586743	valid_0's l2: 0.738667
[890]	valid_0's l1: 0.586686	valid_0's l2: 0.738414
[891]	valid_0's l1: 0.586654	valid_0's l2: 0.738229
[892]	valid_0's l1: 0.586583	valid_0's l2: 0.737896
[893]	valid_0's l1: 0.586579	valid_0's l2: 0.737824
[894]	valid_0's l1: 0.586599	valid_0's l2: 0.737773
[895]	valid_0's l1: 0.586497	valid_0's l2: 0.737418
[896]	valid_0's l1: 0.586468	valid_0's l2: 0.737241
[897]	valid_0's l1: 0.586492	valid_0's l2: 0.737212
[898]	valid_0's

[1054]	valid_0's l1: 0.582073	valid_0's l2: 0.714811
[1055]	valid_0's l1: 0.582028	valid_0's l2: 0.71466
[1056]	valid_0's l1: 0.582021	valid_0's l2: 0.714598
[1057]	valid_0's l1: 0.582063	valid_0's l2: 0.714633
[1058]	valid_0's l1: 0.582006	valid_0's l2: 0.714417
[1059]	valid_0's l1: 0.582048	valid_0's l2: 0.714504
[1060]	valid_0's l1: 0.582039	valid_0's l2: 0.714303
[1061]	valid_0's l1: 0.582031	valid_0's l2: 0.714222
[1062]	valid_0's l1: 0.581991	valid_0's l2: 0.714087
[1063]	valid_0's l1: 0.582029	valid_0's l2: 0.714088
[1064]	valid_0's l1: 0.581959	valid_0's l2: 0.713894
[1065]	valid_0's l1: 0.581932	valid_0's l2: 0.713801
[1066]	valid_0's l1: 0.581893	valid_0's l2: 0.713705
[1067]	valid_0's l1: 0.581864	valid_0's l2: 0.713541
[1068]	valid_0's l1: 0.581812	valid_0's l2: 0.713354
[1069]	valid_0's l1: 0.581757	valid_0's l2: 0.713177
[1070]	valid_0's l1: 0.581729	valid_0's l2: 0.713029
[1071]	valid_0's l1: 0.581785	valid_0's l2: 0.713098
[1072]	valid_0's l1: 0.581738	valid_0's l2: 0.7

[1225]	valid_0's l1: 0.579749	valid_0's l2: 0.702111
[1226]	valid_0's l1: 0.579803	valid_0's l2: 0.702175
[1227]	valid_0's l1: 0.579748	valid_0's l2: 0.702007
[1228]	valid_0's l1: 0.579707	valid_0's l2: 0.701945
[1229]	valid_0's l1: 0.579695	valid_0's l2: 0.701913
[1230]	valid_0's l1: 0.57968	valid_0's l2: 0.701845
[1231]	valid_0's l1: 0.579698	valid_0's l2: 0.701939
[1232]	valid_0's l1: 0.579642	valid_0's l2: 0.701779
[1233]	valid_0's l1: 0.579605	valid_0's l2: 0.701682
[1234]	valid_0's l1: 0.579532	valid_0's l2: 0.701488
[1235]	valid_0's l1: 0.579562	valid_0's l2: 0.701519
[1236]	valid_0's l1: 0.579547	valid_0's l2: 0.701487
[1237]	valid_0's l1: 0.579512	valid_0's l2: 0.701384
[1238]	valid_0's l1: 0.579503	valid_0's l2: 0.701366
[1239]	valid_0's l1: 0.579477	valid_0's l2: 0.701237
[1240]	valid_0's l1: 0.579523	valid_0's l2: 0.701297
[1241]	valid_0's l1: 0.57952	valid_0's l2: 0.701256
[1242]	valid_0's l1: 0.579504	valid_0's l2: 0.70122
[1243]	valid_0's l1: 0.579475	valid_0's l2: 0.701

[1400]	valid_0's l1: 0.577354	valid_0's l2: 0.694036
[1401]	valid_0's l1: 0.577337	valid_0's l2: 0.693972
[1402]	valid_0's l1: 0.577335	valid_0's l2: 0.693962
[1403]	valid_0's l1: 0.577328	valid_0's l2: 0.69393
[1404]	valid_0's l1: 0.577298	valid_0's l2: 0.693845
[1405]	valid_0's l1: 0.57726	valid_0's l2: 0.693768
[1406]	valid_0's l1: 0.577238	valid_0's l2: 0.693794
[1407]	valid_0's l1: 0.577224	valid_0's l2: 0.693755
[1408]	valid_0's l1: 0.577197	valid_0's l2: 0.693736
[1409]	valid_0's l1: 0.577217	valid_0's l2: 0.693747
[1410]	valid_0's l1: 0.577158	valid_0's l2: 0.693644
[1411]	valid_0's l1: 0.57712	valid_0's l2: 0.693592
[1412]	valid_0's l1: 0.577101	valid_0's l2: 0.693521
[1413]	valid_0's l1: 0.577075	valid_0's l2: 0.693468
[1414]	valid_0's l1: 0.577109	valid_0's l2: 0.693531
[1415]	valid_0's l1: 0.57706	valid_0's l2: 0.693424
[1416]	valid_0's l1: 0.577031	valid_0's l2: 0.693348
[1417]	valid_0's l1: 0.577019	valid_0's l2: 0.693302
[1418]	valid_0's l1: 0.577029	valid_0's l2: 0.6932

[1574]	valid_0's l1: 0.575376	valid_0's l2: 0.68969
[1575]	valid_0's l1: 0.575369	valid_0's l2: 0.689699
[1576]	valid_0's l1: 0.575393	valid_0's l2: 0.68974
[1577]	valid_0's l1: 0.575384	valid_0's l2: 0.68977
[1578]	valid_0's l1: 0.575351	valid_0's l2: 0.689723
[1579]	valid_0's l1: 0.575343	valid_0's l2: 0.689709
[1580]	valid_0's l1: 0.575311	valid_0's l2: 0.689686
[1581]	valid_0's l1: 0.575316	valid_0's l2: 0.689692
[1582]	valid_0's l1: 0.575305	valid_0's l2: 0.689722
[1583]	valid_0's l1: 0.575318	valid_0's l2: 0.68975
[1584]	valid_0's l1: 0.575304	valid_0's l2: 0.689702
[1585]	valid_0's l1: 0.575283	valid_0's l2: 0.689686
[1586]	valid_0's l1: 0.575255	valid_0's l2: 0.689641
[1587]	valid_0's l1: 0.575255	valid_0's l2: 0.689637
[1588]	valid_0's l1: 0.575281	valid_0's l2: 0.68967
[1589]	valid_0's l1: 0.575266	valid_0's l2: 0.689649
[1590]	valid_0's l1: 0.575244	valid_0's l2: 0.689627
[1591]	valid_0's l1: 0.57524	valid_0's l2: 0.689607
[1592]	valid_0's l1: 0.575221	valid_0's l2: 0.689587

[1731]	valid_0's l1: 0.574099	valid_0's l2: 0.688128
[1732]	valid_0's l1: 0.574117	valid_0's l2: 0.688145
[1733]	valid_0's l1: 0.574095	valid_0's l2: 0.688107
[1734]	valid_0's l1: 0.574076	valid_0's l2: 0.688086
[1735]	valid_0's l1: 0.574069	valid_0's l2: 0.688074
[1736]	valid_0's l1: 0.574058	valid_0's l2: 0.688052
[1737]	valid_0's l1: 0.574049	valid_0's l2: 0.688071
[1738]	valid_0's l1: 0.574062	valid_0's l2: 0.68809
[1739]	valid_0's l1: 0.574068	valid_0's l2: 0.688096
[1740]	valid_0's l1: 0.574084	valid_0's l2: 0.688127
[1741]	valid_0's l1: 0.574078	valid_0's l2: 0.688138
[1742]	valid_0's l1: 0.574088	valid_0's l2: 0.68813
[1743]	valid_0's l1: 0.574075	valid_0's l2: 0.688119
[1744]	valid_0's l1: 0.574081	valid_0's l2: 0.688116
[1745]	valid_0's l1: 0.574083	valid_0's l2: 0.688116
[1746]	valid_0's l1: 0.57406	valid_0's l2: 0.688083
[1747]	valid_0's l1: 0.574061	valid_0's l2: 0.688075
[1748]	valid_0's l1: 0.574051	valid_0's l2: 0.68806
[1749]	valid_0's l1: 0.574043	valid_0's l2: 0.6880

[1902]	valid_0's l1: 0.573289	valid_0's l2: 0.687495
[1903]	valid_0's l1: 0.573301	valid_0's l2: 0.687518
[1904]	valid_0's l1: 0.573302	valid_0's l2: 0.687535
[1905]	valid_0's l1: 0.573297	valid_0's l2: 0.687545
[1906]	valid_0's l1: 0.573319	valid_0's l2: 0.68759
[1907]	valid_0's l1: 0.573329	valid_0's l2: 0.687593
[1908]	valid_0's l1: 0.573318	valid_0's l2: 0.687595
[1909]	valid_0's l1: 0.573321	valid_0's l2: 0.687659
[1910]	valid_0's l1: 0.573332	valid_0's l2: 0.687667
[1911]	valid_0's l1: 0.573313	valid_0's l2: 0.687652
[1912]	valid_0's l1: 0.573262	valid_0's l2: 0.68761
[1913]	valid_0's l1: 0.573259	valid_0's l2: 0.687682
[1914]	valid_0's l1: 0.573257	valid_0's l2: 0.687677
[1915]	valid_0's l1: 0.57325	valid_0's l2: 0.687658
[1916]	valid_0's l1: 0.57325	valid_0's l2: 0.687676
[1917]	valid_0's l1: 0.573261	valid_0's l2: 0.687703
[1918]	valid_0's l1: 0.573284	valid_0's l2: 0.687762
[1919]	valid_0's l1: 0.573246	valid_0's l2: 0.687681
[1920]	valid_0's l1: 0.573253	valid_0's l2: 0.6876

# 对车牛的数据评估

In [61]:
# Feature set of the cheniu mapping model: wordvec0 ... wordvec99 in order,
# then the adjusted price estimate column.
fearture_columns_cn = ['wordvec%d' % position for position in range(100)] + ['ypred_price']
# Column the model is trained to predict.
label = 'real_pay_amount'
# Time-based split of the 'cheniu' rows around 2020-12-01.
cheniu_rows = test_data['data_type'] == 'cheniu'
# Explicit copy: prediction/error columns are assigned onto this frame later
# in the cell, and assigning onto a `.loc` slice is a chained assignment.
test_data_cn_test = test_data.loc[cheniu_rows & (test_data['publish_time'] >= '2020-12-01')].copy()
test_data_cn_train = test_data.loc[cheniu_rows & (test_data['publish_time'] < '2020-12-01')]

import lightgbm as lgb
gbm = lgb.LGBMRegressor(boosting_type = 'gbdt', 
                        objective = 'regression',
                        learning_rate = 0.03, 
                        n_estimators = 3000,
                        max_depth = 12,
                        num_leaves = 100, 
                        subsample = 1, 
                        colsample_bytree = 0.34,
                        min_child_samples = 110, 
                        n_jobs = 8,
                        verbose = 1)

gbm.fit(test_data_cn_train[fearture_columns_cn],
        test_data_cn_train[label],
        eval_set = (test_data_cn_test[fearture_columns_cn],
                    test_data_cn_test[label]),
        eval_metric = {'l1','l2'}, 
        early_stopping_rounds = 50, 
        verbose=True,
       )

test_data_cn_test['ypred_cn_new'] = gbm.predict(test_data_cn_test[fearture_columns_cn])
test_data_cn_test['err_cn_new'] = abs(test_data_cn_test['ypred_cn_new'] - test_data_cn_test['real_pay_amount'] )/test_data_cn_test['real_pay_amount'] 

data3m_pinggu(test_data_cn_test,'err_cn_new')


[1]	valid_0's l2: 6.45244	valid_0's l1: 1.77123
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 6.20081	valid_0's l1: 1.73067
[3]	valid_0's l2: 5.9585	valid_0's l1: 1.68869
[4]	valid_0's l2: 5.75642	valid_0's l1: 1.65563
[5]	valid_0's l2: 5.47788	valid_0's l1: 1.61009
[6]	valid_0's l2: 5.26204	valid_0's l1: 1.57339
[7]	valid_0's l2: 5.00232	valid_0's l1: 1.52983
[8]	valid_0's l2: 4.76025	valid_0's l1: 1.48803
[9]	valid_0's l2: 4.59331	valid_0's l1: 1.45495
[10]	valid_0's l2: 4.36545	valid_0's l1: 1.41481
[11]	valid_0's l2: 4.19639	valid_0's l1: 1.38107
[12]	valid_0's l2: 4.06658	valid_0's l1: 1.35665
[13]	valid_0's l2: 3.92785	valid_0's l1: 1.3282
[14]	valid_0's l2: 3.80262	valid_0's l1: 1.30308
[15]	valid_0's l2: 3.67928	valid_0's l1: 1.27676
[16]	valid_0's l2: 3.57358	valid_0's l1: 1.25348
[17]	valid_0's l2: 3.46486	valid_0's l1: 1.22996
[18]	valid_0's l2: 3.30507	valid_0's l1: 1.19783
[19]	valid_0's l2: 3.15363	valid_0's l1: 1.16643
[20]	valid_0's l2:

[240]	valid_0's l2: 0.517153	valid_0's l1: 0.366601
[241]	valid_0's l2: 0.516772	valid_0's l1: 0.366446
[242]	valid_0's l2: 0.515582	valid_0's l1: 0.365966
[243]	valid_0's l2: 0.514886	valid_0's l1: 0.366103
[244]	valid_0's l2: 0.513693	valid_0's l1: 0.365572
[245]	valid_0's l2: 0.512496	valid_0's l1: 0.36502
[246]	valid_0's l2: 0.512388	valid_0's l1: 0.364968
[247]	valid_0's l2: 0.511779	valid_0's l1: 0.364735
[248]	valid_0's l2: 0.511858	valid_0's l1: 0.364734
[249]	valid_0's l2: 0.511444	valid_0's l1: 0.364656
[250]	valid_0's l2: 0.510063	valid_0's l1: 0.364103
[251]	valid_0's l2: 0.509839	valid_0's l1: 0.363911
[252]	valid_0's l2: 0.509055	valid_0's l1: 0.363504
[253]	valid_0's l2: 0.509028	valid_0's l1: 0.363575
[254]	valid_0's l2: 0.508756	valid_0's l1: 0.363555
[255]	valid_0's l2: 0.507527	valid_0's l1: 0.363123
[256]	valid_0's l2: 0.506849	valid_0's l1: 0.362947
[257]	valid_0's l2: 0.505793	valid_0's l1: 0.362409
[258]	valid_0's l2: 0.50458	valid_0's l1: 0.361776
[259]	valid_0'

[410]	valid_0's l2: 0.43439	valid_0's l1: 0.341208
[411]	valid_0's l2: 0.43415	valid_0's l1: 0.34116
[412]	valid_0's l2: 0.434129	valid_0's l1: 0.341184
[413]	valid_0's l2: 0.433796	valid_0's l1: 0.340896
[414]	valid_0's l2: 0.433427	valid_0's l1: 0.340714
[415]	valid_0's l2: 0.433453	valid_0's l1: 0.340794
[416]	valid_0's l2: 0.433683	valid_0's l1: 0.340797
[417]	valid_0's l2: 0.433772	valid_0's l1: 0.34095
[418]	valid_0's l2: 0.433328	valid_0's l1: 0.340784
[419]	valid_0's l2: 0.433192	valid_0's l1: 0.34056
[420]	valid_0's l2: 0.433088	valid_0's l1: 0.340476
[421]	valid_0's l2: 0.432819	valid_0's l1: 0.340362
[422]	valid_0's l2: 0.432717	valid_0's l1: 0.340277
[423]	valid_0's l2: 0.432287	valid_0's l1: 0.340078
[424]	valid_0's l2: 0.4321	valid_0's l1: 0.339932
[425]	valid_0's l2: 0.431844	valid_0's l1: 0.339787
[426]	valid_0's l2: 0.43197	valid_0's l1: 0.339824
[427]	valid_0's l2: 0.431857	valid_0's l1: 0.339639
[428]	valid_0's l2: 0.431597	valid_0's l1: 0.339538
[429]	valid_0's l2: 

[623]	valid_0's l2: 0.391204	valid_0's l1: 0.329602
[624]	valid_0's l2: 0.391021	valid_0's l1: 0.329589
[625]	valid_0's l2: 0.390966	valid_0's l1: 0.329603
[626]	valid_0's l2: 0.390772	valid_0's l1: 0.32954
[627]	valid_0's l2: 0.390422	valid_0's l1: 0.329518
[628]	valid_0's l2: 0.390455	valid_0's l1: 0.32951
[629]	valid_0's l2: 0.390318	valid_0's l1: 0.329511
[630]	valid_0's l2: 0.390171	valid_0's l1: 0.329434
[631]	valid_0's l2: 0.390202	valid_0's l1: 0.329509
[632]	valid_0's l2: 0.38991	valid_0's l1: 0.329407
[633]	valid_0's l2: 0.389862	valid_0's l1: 0.329384
[634]	valid_0's l2: 0.389686	valid_0's l1: 0.329334
[635]	valid_0's l2: 0.389484	valid_0's l1: 0.329194
[636]	valid_0's l2: 0.389702	valid_0's l1: 0.329313
[637]	valid_0's l2: 0.389538	valid_0's l1: 0.329284
[638]	valid_0's l2: 0.389339	valid_0's l1: 0.329188
[639]	valid_0's l2: 0.389099	valid_0's l1: 0.328949
[640]	valid_0's l2: 0.388959	valid_0's l1: 0.328938
[641]	valid_0's l2: 0.388795	valid_0's l1: 0.328796
[642]	valid_0's

[837]	valid_0's l2: 0.365163	valid_0's l1: 0.321783
[838]	valid_0's l2: 0.365131	valid_0's l1: 0.321705
[839]	valid_0's l2: 0.365013	valid_0's l1: 0.321704
[840]	valid_0's l2: 0.364655	valid_0's l1: 0.321645
[841]	valid_0's l2: 0.364602	valid_0's l1: 0.321697
[842]	valid_0's l2: 0.364725	valid_0's l1: 0.321664
[843]	valid_0's l2: 0.364592	valid_0's l1: 0.321598
[844]	valid_0's l2: 0.364272	valid_0's l1: 0.321478
[845]	valid_0's l2: 0.364137	valid_0's l1: 0.321459
[846]	valid_0's l2: 0.363972	valid_0's l1: 0.321425
[847]	valid_0's l2: 0.363927	valid_0's l1: 0.321509
[848]	valid_0's l2: 0.363503	valid_0's l1: 0.321459
[849]	valid_0's l2: 0.363043	valid_0's l1: 0.321387
[850]	valid_0's l2: 0.363002	valid_0's l1: 0.321388
[851]	valid_0's l2: 0.362933	valid_0's l1: 0.321394
[852]	valid_0's l2: 0.362987	valid_0's l1: 0.321307
[853]	valid_0's l2: 0.363135	valid_0's l1: 0.321407
[854]	valid_0's l2: 0.363067	valid_0's l1: 0.321363
[855]	valid_0's l2: 0.362949	valid_0's l1: 0.32141
[856]	valid_0

[1011]	valid_0's l2: 0.351821	valid_0's l1: 0.317242
[1012]	valid_0's l2: 0.351583	valid_0's l1: 0.317224
[1013]	valid_0's l2: 0.351551	valid_0's l1: 0.317179
[1014]	valid_0's l2: 0.351487	valid_0's l1: 0.31715
[1015]	valid_0's l2: 0.351568	valid_0's l1: 0.31724
[1016]	valid_0's l2: 0.351453	valid_0's l1: 0.317164
[1017]	valid_0's l2: 0.351051	valid_0's l1: 0.317019
[1018]	valid_0's l2: 0.351179	valid_0's l1: 0.317097
[1019]	valid_0's l2: 0.351076	valid_0's l1: 0.317033
[1020]	valid_0's l2: 0.351122	valid_0's l1: 0.317079
[1021]	valid_0's l2: 0.351186	valid_0's l1: 0.317147
[1022]	valid_0's l2: 0.351185	valid_0's l1: 0.317096
[1023]	valid_0's l2: 0.350959	valid_0's l1: 0.317097
[1024]	valid_0's l2: 0.350859	valid_0's l1: 0.317062
[1025]	valid_0's l2: 0.35086	valid_0's l1: 0.31705
[1026]	valid_0's l2: 0.350811	valid_0's l1: 0.317007
[1027]	valid_0's l2: 0.350886	valid_0's l1: 0.316945
[1028]	valid_0's l2: 0.350732	valid_0's l1: 0.316883
[1029]	valid_0's l2: 0.350499	valid_0's l1: 0.3168

[1230]	valid_0's l2: 0.339601	valid_0's l1: 0.31392
[1231]	valid_0's l2: 0.339627	valid_0's l1: 0.313888
[1232]	valid_0's l2: 0.339499	valid_0's l1: 0.313846
[1233]	valid_0's l2: 0.339566	valid_0's l1: 0.313839
[1234]	valid_0's l2: 0.339514	valid_0's l1: 0.313823
[1235]	valid_0's l2: 0.339462	valid_0's l1: 0.313804
[1236]	valid_0's l2: 0.33946	valid_0's l1: 0.31386
[1237]	valid_0's l2: 0.339445	valid_0's l1: 0.313863
[1238]	valid_0's l2: 0.339268	valid_0's l1: 0.313852
[1239]	valid_0's l2: 0.339229	valid_0's l1: 0.313792
[1240]	valid_0's l2: 0.339031	valid_0's l1: 0.313753
[1241]	valid_0's l2: 0.339122	valid_0's l1: 0.313813
[1242]	valid_0's l2: 0.338976	valid_0's l1: 0.313751
[1243]	valid_0's l2: 0.338848	valid_0's l1: 0.313724
[1244]	valid_0's l2: 0.338861	valid_0's l1: 0.313688
[1245]	valid_0's l2: 0.338794	valid_0's l1: 0.313649
[1246]	valid_0's l2: 0.338789	valid_0's l1: 0.31361
[1247]	valid_0's l2: 0.338744	valid_0's l1: 0.313616
[1248]	valid_0's l2: 0.338533	valid_0's l1: 0.3135

[1444]	valid_0's l2: 0.329598	valid_0's l1: 0.309839
[1445]	valid_0's l2: 0.329623	valid_0's l1: 0.30984
[1446]	valid_0's l2: 0.329483	valid_0's l1: 0.309802
[1447]	valid_0's l2: 0.32951	valid_0's l1: 0.30978
[1448]	valid_0's l2: 0.329429	valid_0's l1: 0.309738
[1449]	valid_0's l2: 0.329387	valid_0's l1: 0.309716
[1450]	valid_0's l2: 0.329377	valid_0's l1: 0.309714
[1451]	valid_0's l2: 0.329304	valid_0's l1: 0.309687
[1452]	valid_0's l2: 0.329148	valid_0's l1: 0.309742
[1453]	valid_0's l2: 0.329257	valid_0's l1: 0.309766
[1454]	valid_0's l2: 0.329196	valid_0's l1: 0.30994
[1455]	valid_0's l2: 0.329303	valid_0's l1: 0.309941
[1456]	valid_0's l2: 0.329281	valid_0's l1: 0.309988
[1457]	valid_0's l2: 0.3293	valid_0's l1: 0.309954
[1458]	valid_0's l2: 0.329279	valid_0's l1: 0.309916
[1459]	valid_0's l2: 0.329128	valid_0's l1: 0.309896
[1460]	valid_0's l2: 0.329169	valid_0's l1: 0.309888
[1461]	valid_0's l2: 0.329203	valid_0's l1: 0.3099
[1462]	valid_0's l2: 0.329215	valid_0's l1: 0.30985
[1

[1642]	valid_0's l2: 0.322812	valid_0's l1: 0.307704
[1643]	valid_0's l2: 0.322882	valid_0's l1: 0.307736
[1644]	valid_0's l2: 0.322685	valid_0's l1: 0.307585
[1645]	valid_0's l2: 0.322684	valid_0's l1: 0.307636
[1646]	valid_0's l2: 0.322707	valid_0's l1: 0.307614
[1647]	valid_0's l2: 0.322638	valid_0's l1: 0.307543
[1648]	valid_0's l2: 0.322652	valid_0's l1: 0.307512
[1649]	valid_0's l2: 0.322674	valid_0's l1: 0.307549
[1650]	valid_0's l2: 0.322683	valid_0's l1: 0.307526
[1651]	valid_0's l2: 0.322648	valid_0's l1: 0.307471
[1652]	valid_0's l2: 0.322629	valid_0's l1: 0.307473
[1653]	valid_0's l2: 0.32264	valid_0's l1: 0.30745
[1654]	valid_0's l2: 0.322628	valid_0's l1: 0.307462
[1655]	valid_0's l2: 0.322597	valid_0's l1: 0.30746
[1656]	valid_0's l2: 0.322573	valid_0's l1: 0.307467
[1657]	valid_0's l2: 0.322567	valid_0's l1: 0.307496
[1658]	valid_0's l2: 0.322562	valid_0's l1: 0.307528
[1659]	valid_0's l2: 0.322414	valid_0's l1: 0.307458
[1660]	valid_0's l2: 0.322375	valid_0's l1: 0.307

[1837]	valid_0's l2: 0.317547	valid_0's l1: 0.305497
[1838]	valid_0's l2: 0.317458	valid_0's l1: 0.30548
[1839]	valid_0's l2: 0.317501	valid_0's l1: 0.305504
[1840]	valid_0's l2: 0.317569	valid_0's l1: 0.305523
[1841]	valid_0's l2: 0.317554	valid_0's l1: 0.305547
[1842]	valid_0's l2: 0.317388	valid_0's l1: 0.305459
[1843]	valid_0's l2: 0.317382	valid_0's l1: 0.305464
[1844]	valid_0's l2: 0.317532	valid_0's l1: 0.305565
[1845]	valid_0's l2: 0.31749	valid_0's l1: 0.305519
[1846]	valid_0's l2: 0.317502	valid_0's l1: 0.305559
[1847]	valid_0's l2: 0.317509	valid_0's l1: 0.305539
[1848]	valid_0's l2: 0.317534	valid_0's l1: 0.305564
[1849]	valid_0's l2: 0.317464	valid_0's l1: 0.305513
[1850]	valid_0's l2: 0.317422	valid_0's l1: 0.305515
[1851]	valid_0's l2: 0.317397	valid_0's l1: 0.305493
[1852]	valid_0's l2: 0.317475	valid_0's l1: 0.305526
[1853]	valid_0's l2: 0.317473	valid_0's l1: 0.305485
[1854]	valid_0's l2: 0.31754	valid_0's l1: 0.305605
[1855]	valid_0's l2: 0.317523	valid_0's l1: 0.305

[2050]	valid_0's l2: 0.313886	valid_0's l1: 0.304275
[2051]	valid_0's l2: 0.313863	valid_0's l1: 0.304287
[2052]	valid_0's l2: 0.313843	valid_0's l1: 0.304301
[2053]	valid_0's l2: 0.313912	valid_0's l1: 0.304343
[2054]	valid_0's l2: 0.313865	valid_0's l1: 0.304395
[2055]	valid_0's l2: 0.313919	valid_0's l1: 0.304408
[2056]	valid_0's l2: 0.313968	valid_0's l1: 0.304478
[2057]	valid_0's l2: 0.313961	valid_0's l1: 0.304472
[2058]	valid_0's l2: 0.313963	valid_0's l1: 0.304405
[2059]	valid_0's l2: 0.31398	valid_0's l1: 0.304416
[2060]	valid_0's l2: 0.313985	valid_0's l1: 0.304399
[2061]	valid_0's l2: 0.314003	valid_0's l1: 0.304396
[2062]	valid_0's l2: 0.314019	valid_0's l1: 0.304399
[2063]	valid_0's l2: 0.313937	valid_0's l1: 0.304395
[2064]	valid_0's l2: 0.313944	valid_0's l1: 0.304389
[2065]	valid_0's l2: 0.313952	valid_0's l1: 0.304422
[2066]	valid_0's l2: 0.313926	valid_0's l1: 0.304384
[2067]	valid_0's l2: 0.313867	valid_0's l1: 0.304361
[2068]	valid_0's l2: 0.313859	valid_0's l1: 0.3

[2249]	valid_0's l2: 0.311199	valid_0's l1: 0.303488
[2250]	valid_0's l2: 0.311223	valid_0's l1: 0.303504
[2251]	valid_0's l2: 0.311236	valid_0's l1: 0.30348
[2252]	valid_0's l2: 0.311195	valid_0's l1: 0.303465
[2253]	valid_0's l2: 0.31124	valid_0's l1: 0.303471
[2254]	valid_0's l2: 0.311223	valid_0's l1: 0.303495
[2255]	valid_0's l2: 0.311231	valid_0's l1: 0.303533
[2256]	valid_0's l2: 0.311193	valid_0's l1: 0.303539
[2257]	valid_0's l2: 0.311202	valid_0's l1: 0.30354
[2258]	valid_0's l2: 0.311107	valid_0's l1: 0.303509
[2259]	valid_0's l2: 0.311113	valid_0's l1: 0.3035
[2260]	valid_0's l2: 0.31114	valid_0's l1: 0.303548
[2261]	valid_0's l2: 0.311216	valid_0's l1: 0.303612
[2262]	valid_0's l2: 0.311139	valid_0's l1: 0.303597
[2263]	valid_0's l2: 0.311046	valid_0's l1: 0.303593
[2264]	valid_0's l2: 0.311014	valid_0's l1: 0.30358
[2265]	valid_0's l2: 0.311045	valid_0's l1: 0.303571
[2266]	valid_0's l2: 0.31103	valid_0's l1: 0.303586
[2267]	valid_0's l2: 0.31105	valid_0's l1: 0.30358
[22

[2464]	valid_0's l2: 0.308497	valid_0's l1: 0.302569
[2465]	valid_0's l2: 0.308452	valid_0's l1: 0.302528
[2466]	valid_0's l2: 0.308466	valid_0's l1: 0.302531
[2467]	valid_0's l2: 0.308395	valid_0's l1: 0.302526
[2468]	valid_0's l2: 0.30843	valid_0's l1: 0.302524
[2469]	valid_0's l2: 0.308431	valid_0's l1: 0.302521
[2470]	valid_0's l2: 0.308399	valid_0's l1: 0.302505
[2471]	valid_0's l2: 0.308397	valid_0's l1: 0.302504
[2472]	valid_0's l2: 0.308469	valid_0's l1: 0.302551
[2473]	valid_0's l2: 0.308468	valid_0's l1: 0.30255
[2474]	valid_0's l2: 0.308523	valid_0's l1: 0.302579
[2475]	valid_0's l2: 0.308519	valid_0's l1: 0.302597
[2476]	valid_0's l2: 0.308416	valid_0's l1: 0.302556
[2477]	valid_0's l2: 0.308434	valid_0's l1: 0.30257
[2478]	valid_0's l2: 0.308453	valid_0's l1: 0.302557
[2479]	valid_0's l2: 0.308422	valid_0's l1: 0.302533
[2480]	valid_0's l2: 0.308441	valid_0's l1: 0.302543
[2481]	valid_0's l2: 0.308436	valid_0's l1: 0.302548
[2482]	valid_0's l2: 0.308436	valid_0's l1: 0.302

[2637]	valid_0's l2: 0.306618	valid_0's l1: 0.301738
[2638]	valid_0's l2: 0.306617	valid_0's l1: 0.301771
[2639]	valid_0's l2: 0.306586	valid_0's l1: 0.301721
[2640]	valid_0's l2: 0.306503	valid_0's l1: 0.301677
[2641]	valid_0's l2: 0.306495	valid_0's l1: 0.30172
[2642]	valid_0's l2: 0.306448	valid_0's l1: 0.301703
[2643]	valid_0's l2: 0.306466	valid_0's l1: 0.301688
[2644]	valid_0's l2: 0.306386	valid_0's l1: 0.301645
[2645]	valid_0's l2: 0.306404	valid_0's l1: 0.301655
[2646]	valid_0's l2: 0.306403	valid_0's l1: 0.301667
[2647]	valid_0's l2: 0.306328	valid_0's l1: 0.301679
[2648]	valid_0's l2: 0.306329	valid_0's l1: 0.30168
[2649]	valid_0's l2: 0.306338	valid_0's l1: 0.301676
[2650]	valid_0's l2: 0.30633	valid_0's l1: 0.30166
[2651]	valid_0's l2: 0.306289	valid_0's l1: 0.301635
[2652]	valid_0's l2: 0.306334	valid_0's l1: 0.301633
[2653]	valid_0's l2: 0.306294	valid_0's l1: 0.301591
[2654]	valid_0's l2: 0.306282	valid_0's l1: 0.301684
[2655]	valid_0's l2: 0.306297	valid_0's l1: 0.3016

P<3%:  0.4646
P<5%:  0.6702
P<8%:  0.8458
P<10%:  0.8992
P<20%:  0.9873


In [62]:
# Cheniu evaluation, "old" variant: identical pipeline to the previous cell
# except that the extra feature is 'old_pred' instead of 'ypred_price'.
# Relies on test_data_cn_train / test_data_cn_test from the previous cell.
import lightgbm as lgb

# Same names and same order as the original hand-typed list:
# 'wordvec0' ... 'wordvec99', followed by 'old_pred'.
fearture_columns_cn = ['wordvec%d' % i for i in range(100)] + ['old_pred']
label = 'real_pay_amount'

gbm = lgb.LGBMRegressor(boosting_type = 'gbdt',
                        objective = 'regression',
                        learning_rate = 0.03,
                        n_estimators = 3000,
                        max_depth = 12,
                        num_leaves = 100,
                        subsample = 1,
                        colsample_bytree = 0.34,
                        min_child_samples = 110,
                        n_jobs = 8,
                        verbose = 1)

# eval_metric is a list rather than a set: a set has no defined iteration
# order and is not one of the documented argument types.
# NOTE(review): the early-stopping evaluation set is the same frame that is
# scored below, so the reported error rates are tuned on the data they are
# measured on -- consider a separate validation split.
gbm.fit(test_data_cn_train[fearture_columns_cn],
        test_data_cn_train[label],
        eval_set = [(test_data_cn_test[fearture_columns_cn],
                     test_data_cn_test[label])],
        eval_metric = ['l1', 'l2'],
        early_stopping_rounds = 50,
        verbose = True,
       )

# Work on an independent frame so the new columns are not written onto a
# slice of `test_data`; all columns already present are carried over.
test_data_cn_test = test_data_cn_test.copy()
test_data_cn_test['ypred_cn_old'] = gbm.predict(test_data_cn_test[fearture_columns_cn])
test_data_cn_test['err_cn_old'] = (
    (test_data_cn_test['ypred_cn_old'] - test_data_cn_test['real_pay_amount']).abs()
    / test_data_cn_test['real_pay_amount']
)

# data3m_pinggu is defined elsewhere in the notebook; judging by the cell
# output it prints the share of rows whose error is below 3/5/8/10/20 %.
data3m_pinggu(test_data_cn_test, 'err_cn_old')


[1]	valid_0's l2: 6.46082	valid_0's l1: 1.77118
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 6.20895	valid_0's l1: 1.73059
[3]	valid_0's l2: 5.96641	valid_0's l1: 1.68851
[4]	valid_0's l2: 5.76412	valid_0's l1: 1.65535
[5]	valid_0's l2: 5.49034	valid_0's l1: 1.60992
[6]	valid_0's l2: 5.27404	valid_0's l1: 1.57324
[7]	valid_0's l2: 5.02762	valid_0's l1: 1.53053
[8]	valid_0's l2: 4.79649	valid_0's l1: 1.49
[9]	valid_0's l2: 4.62883	valid_0's l1: 1.45683
[10]	valid_0's l2: 4.41132	valid_0's l1: 1.41777
[11]	valid_0's l2: 4.24134	valid_0's l1: 1.38379
[12]	valid_0's l2: 4.11031	valid_0's l1: 1.35923
[13]	valid_0's l2: 3.97735	valid_0's l1: 1.33307
[14]	valid_0's l2: 3.85083	valid_0's l1: 1.30777
[15]	valid_0's l2: 3.7265	valid_0's l1: 1.28147
[16]	valid_0's l2: 3.61884	valid_0's l1: 1.25789
[17]	valid_0's l2: 3.50799	valid_0's l1: 1.2337
[18]	valid_0's l2: 3.35551	valid_0's l1: 1.20216
[19]	valid_0's l2: 3.21113	valid_0's l1: 1.17153
[20]	valid_0's l2: 3.

[243]	valid_0's l2: 0.56653	valid_0's l1: 0.372864
[244]	valid_0's l2: 0.564013	valid_0's l1: 0.371395
[245]	valid_0's l2: 0.562906	valid_0's l1: 0.371342
[246]	valid_0's l2: 0.563058	valid_0's l1: 0.371382
[247]	valid_0's l2: 0.562908	valid_0's l1: 0.371311
[248]	valid_0's l2: 0.562489	valid_0's l1: 0.371107
[249]	valid_0's l2: 0.56206	valid_0's l1: 0.370921
[250]	valid_0's l2: 0.559573	valid_0's l1: 0.369473
[251]	valid_0's l2: 0.559253	valid_0's l1: 0.369439
[252]	valid_0's l2: 0.558995	valid_0's l1: 0.369376
[253]	valid_0's l2: 0.559206	valid_0's l1: 0.370058
[254]	valid_0's l2: 0.55873	valid_0's l1: 0.369837
[255]	valid_0's l2: 0.557563	valid_0's l1: 0.369728
[256]	valid_0's l2: 0.557465	valid_0's l1: 0.369438
[257]	valid_0's l2: 0.556254	valid_0's l1: 0.369264
[258]	valid_0's l2: 0.554006	valid_0's l1: 0.367949
[259]	valid_0's l2: 0.553473	valid_0's l1: 0.36795
[260]	valid_0's l2: 0.552827	valid_0's l1: 0.367715
[261]	valid_0's l2: 0.552459	valid_0's l1: 0.367555
[262]	valid_0's 

P<3%:  0.4673
P<5%:  0.6562
P<8%:  0.8191
P<10%:  0.8732
P<20%:  0.974


In [63]:
# Write the Cheniu hold-out rows to CSV with identifying columns, the real
# payment amount, and the predictions / relative errors of both models.
# The DataFrame index is written as the first CSV column (to_csv default).
test_data_cn_test.loc[:, [
    # vehicle identity
    'car_id', 'model_code', 'brand_name', 'series_name', 'model_name',
    # target and listing attributes
    'real_pay_amount', 'mileage', 'license_time', 'publish_time',
    'a_province_name', 'a_city_name',
    # model using 'old_pred'
    'ypred_cn_old', 'err_cn_old',
    # model using 'ypred_price'
    'ypred_cn_new', 'err_cn_new',
    'guide_price',
]].to_csv("test_data_cn_test_data_A.csv")



# 对大风车零售评估

In [48]:
# Evaluate the model on 'dfc_sales' rows published on or after 2020-12-01.
#
# NOTE(review): this cell carries execution count 48 but sits after cells
# 61-62, which rebind `gbm` (to a model fitted on `fearture_columns_cn`) and
# read `test_data`. On a fresh top-to-bottom run `gbm` here would not be the
# model this cell was originally executed with, and this cell overwrites
# `test_data` -- confirm the intended model and cell order.

# Single combined mask plus .copy(): the frame is independent of
# `dl_site_ts_order_clean`, so the column assignments below do not write
# onto a chained slice.
test_data = dl_site_ts_order_clean.loc[
    (dl_site_ts_order_clean['publish_time'] >= '2020-12-01')
    & dl_site_ts_order_clean['data_type'].isin(['dfc_sales'])
].copy()

test_data['ypred'] = gbm.predict(test_data[fearture_columns])

# Relative error of the scaled prediction against 'residual'.
# NOTE(review): the 0.95 factor is applied to the prediction before
# comparison; its rationale is not visible in this cell -- confirm/document.
test_data['err'] = (test_data['ypred'] * 0.95 - test_data['residual']).abs() / test_data['residual']

# data3m_pinggu is defined elsewhere in the notebook; judging by the cell
# output it prints the share of rows whose error is below 3/5/8/10/20 %.
data3m_pinggu(test_data, 'err')


P<3%:  0.3685
P<5%:  0.5659
P<8%:  0.76
P<10%:  0.8316
P<20%:  0.9623
