In [1]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore')
import logging
import datetime

import csv, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from dayu.hooks.oss_hook import OSSHook
from dayu.hooks.hive_server_hook import HiveServerHook
from dayu.hooks.hive_cli_hook import HiveCliHook

def split_table_name(datain):
    """Strip the ``table.`` prefix from every column label of a DataFrame.

    The rename is all-or-nothing: as soon as one label contains no dot the
    input frame is returned untouched.

    Parameters
    ----------
    datain : pandas.DataFrame
        Frame whose column labels may look like ``table.column``.

    Returns
    -------
    pandas.DataFrame
        ``datain`` itself when any label has no dot, otherwise a renamed copy
        whose labels are the text after the last dot.
    """
    renames = {}
    for column in datain.columns:
        # maxsplit=1 from the right: a label such as 'db.table.col' used to
        # break the two-way unpack with ValueError; it now maps to 'col'.
        parts = column.rsplit('.', 1)
        if len(parts) < 2:
            return datain
        renames[column] = parts[1]
    return datain.rename(columns=renames)

def read_from_hive2(output_file_name,insql,dtype):
    """Run ``insql`` through the Hive server hook and load the result as a DataFrame.

    The query result is first written as a comma-delimited CSV (with header)
    to ``curr_dir + output_file_name`` and then read back with pandas.

    Parameters
    ----------
    output_file_name : str
        File name appended to the module-level ``curr_dir``.
    insql : str
        Query text handed to ``HiveServerHook.to_csv``.
    dtype : dict
        Passed straight to ``pandas.read_csv(dtype=...)``.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV that was just written.
    """
    csv_path = curr_dir + output_file_name
    HiveServerHook("warehouse_hive").to_csv(
        insql,
        csv_path,
        delimiter=',',
        lineterminator='\n',
        output_header=True
    )
    # Column labels are returned exactly as exported; the split_table_name
    # step is disabled here, so callers strip any table prefix themselves.
    return pd.read_csv(csv_path, header=0, dtype=dtype)

## Day difference between two formatted date strings
def date_time_sub(startTime,endTime,date_format):
    """Return the number of days from ``startTime`` to ``endTime``.

    Parameters
    ----------
    startTime, endTime : str
        Date strings parsed with ``datetime.datetime.strptime``.
    date_format : str
        ``strptime`` format shared by both strings.

    Returns
    -------
    int or None
        ``(endTime - startTime).days`` (negative when ``endTime`` is earlier),
        or ``None`` when either value cannot be parsed.
    """
    try:
        start = datetime.datetime.strptime(startTime, date_format)
        end = datetime.datetime.strptime(endTime, date_format)
    except (TypeError, ValueError):
        # Unparseable input keeps the historical "no result" contract, but it
        # is now explicit and unrelated errors are no longer swallowed.
        return None
    return (end - start).days

# Compute the vehicle's current value-retention rate
def computer_with_license_month(tar):
    """Interpolate a retention rate from the per-year ``year_N`` values.

    Parameters
    ----------
    tar : mapping
        Must provide ``'license_month'`` and the ``'year_1'`` .. ``'year_16'``
        entries that are looked up below.

    Returns
    -------
    number
        * ``tar['year_1']`` when ``license_month <= 12``;
        * otherwise the value linearly interpolated between ``year_<k>`` and
          ``year_<k+1>`` (``k = license_month // 12``), rounded to 4 decimals;
        * ``tar['year_16']`` when the lookup or conversion fails (for example
          a NaN month, or a ``year_<k+1>`` key that does not exist).
    """
    try:
        license_month = tar['license_month']
        if license_month <= 12:
            return tar['year_1']

        year = license_month // 12
        # Rate at the start of the current year of age
        keep_max = tar["year_" + str(int(year))]
        # Rate at the start of the following year
        keep_min = tar["year_" + str(int(year + 1))]

        # Months elapsed since the last whole year, and the per-month decay
        mon = license_month - 12 * year
        tem = (keep_max - keep_min) / 12
        return round(keep_max - tem * mon, 4)
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only the expected lookup/conversion failures fall back to the last
        # tabulated year; anything else now propagates instead of being hidden.
        return tar["year_16"]

class Logger:
    """Small wrapper that writes INFO-level messages of a named logger to a file.

    Parameters
    ----------
    logName : str
        Name passed to ``logging.getLogger``.
    logFile : str
        Path of the log file opened by ``logging.FileHandler``.
    """
    def __init__(self, logName, logFile):
        self._logger = logging.getLogger(logName)
        handler = logging.FileHandler(logFile)
        # logging.getLogger returns the same object for the same name, so
        # building a second Logger (e.g. re-running the notebook cell) used to
        # stack another handler and write every message twice.
        already_attached = any(
            isinstance(existing, logging.FileHandler)
            and existing.baseFilename == handler.baseFilename
            for existing in self._logger.handlers
        )
        if already_attached:
            handler.close()
        else:
            handler.setFormatter(logging.Formatter('%(asctime)s ********* %(message)s'))
            self._logger.addHandler(handler)
        self._logger.setLevel(logging.INFO)

    def log(self, msg):
        """Emit ``msg`` at INFO level."""
        if self._logger is not None:
            self._logger.info(msg)


# Notebook-wide settings: display width, working directory, run date, Hive client, run log
pd.set_option('display.max_columns', 500)
curr_dir = '/home/souche/qiongjiu/hgc/'

# Run date as 'YYYY-MM-DD'; interpolated into the Hive partition filters below
curr_date = datetime.datetime.now().strftime('%Y-%m-%d')
hive_cli = HiveCliHook("warehouse_hive")

logger = Logger('model_service','./log/accurate_valuation_cyp_run_log.log')
logger.log("程序启动.............")


DAYU_HOME : /home/souche/projects/datacenter-etl-v2
[2021-02-19 17:31:33,778] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt
[2021-02-19 17:31:33,804] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt
[2021-02-19 17:31:34,179] {<ipython-input-1-5aad934e0497>:84} INFO - 程序启动.............


In [2]:
## ======================
##       模型训练
## ======================

In [3]:
# Load the three pre-exported order extracts from curr_dir (sales, purchase, full set)
dl_site_ts_order_clean_dfc_sales, dl_site_ts_order_clean_dfc_purchase, dl_site_ts_order_clean_quan = (
    pd.read_csv(curr_dir + csv_name)
    for csv_name in ('dl_site_ts_order_clean_dfc_sales_0.csv',
                     'dl_site_ts_order_clean_dfc_purchase_0.csv',
                     'dl_site_ts_order_clean_yh01.csv')
)


In [4]:
# Print the (rows, columns) of the three extracts on one line
print(*[frame.shape for frame in (dl_site_ts_order_clean_dfc_sales,
                                  dl_site_ts_order_clean_dfc_purchase,
                                  dl_site_ts_order_clean_quan)])


(689715, 29) (921654, 32) (7996680, 30)


In [5]:
# Model -> series / brand lookup taken from the partition ds = date_sub(curr_date, 1)
sql_info ="""
SELECT model_code,series_code,brand_code,series_name,brand_name  
FROM db_data.ods_car_model_model  
WHERE ds = date_sub('"""+curr_date+"""',1)
"""
dtype={'city_code':str}

model_info = read_from_hive2('model_info ', sql_info, dtype)

# Labels may come back as "table.column": keep the second dot-separated piece,
# and leave labels without a dot as they are.
db_columns = [
    label.split('.')[1] if len(label.split('.')) > 1 else label
    for label in model_info.columns
]
model_info.columns = db_columns
logger.log("读取数据完成.............")


[2021-02-19 17:32:43,138] {hiveserver2:138} INFO - Using database default as default
[2021-02-19 17:32:43,282] {hive_server_hook:112} INFO - Running query: 
SELECT model_code,series_code,brand_code,series_name,brand_name  
FROM db_data.ods_car_model_model  
WHERE ds = date_sub('2021-02-19',1)

[2021-02-19 17:32:43,759] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-19 17:32:44,007] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-02-19 17:32:44,255] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-02-19 17:32:44,499] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-02-19 17:32:44,724] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-02-19 17:32:44,965] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-02-19 17:32:45,208] {hive_server_hook:162} INFO - Written 70000 rows so far.
[2021-02-19 17:32:45,287] {hive_server_hook:162} INFO - Written 73572 rows so far.
[2021-02-19 17:32:45,291] {hiveserver2:26

In [6]:
# Labelled sample orders: load, rename the Chinese headers to the shared field
# names, and bring the frame to the same column set as the other sources.
test_df = pd.read_excel("../标注数据818.xlsx")
header_to_field = {
    '订单号':'car_id',
    '交易时间':'publish_time',
    '车型code':'model_code',
    '品牌':'brand_name',
    '车型':'model_name',
    '车系':'series_name',
    '省份':'a_province_name',
    '城市':'a_city_name',
    '上牌时间':'license_time',
    '公里数万':'mileage',
    '颜色':'color',
    '过户次数':'transfer_times',
    '营运性质':'use_property',
    '订单金额（万元）':'real_pay_amount',
}
test_df = test_df.rename(columns=header_to_field)

# Year = first four characters of each timestamp
for time_col in ('license_time', 'publish_time'):
    test_df[time_col + '_year'] = test_df[time_col].map(lambda stamp: stamp[:4])

# Discard any brand / series fields shipped with the sheet; they are re-attached
# from model_info through model_code.
stale_fields = [name for name in ['series_code','brand_code','series_name','brand_name']
                if name in test_df.columns]
test_df = test_df.drop(columns=stale_fields)

test_df = test_df.merge(model_info, on='model_code', how='left')

# Any shared field the sheet does not provide is created and filled with 0
shared_fields = ['car_id', 'area', 'source_model_name', 'brand_name',
       'brand_code', 'series_name', 'series_code', 'model_name', 'model_code',
       'model_year', 'real_pay_amount', 'emission', 'color', 'mileage',
       'use_property', 'license_time', 'publish_time', 'a_province_name',
       'a_city_name', 'source_publish_time', 'sale_time', 'license_time_year',
       'publish_time_year', 'site', 'transfer_times', 'real_pay_amount_max',
       'real_pay_amount_min', 'count']
for field in shared_fields:
    if field in test_df.columns:
        continue
    test_df[field] = 0

test_df['mileage'] = test_df['mileage'].mul(10000)

In [7]:
# Display (rows, columns) of the labelled sample frame as the cell output
test_df.shape

(100, 44)

In [8]:
### Cheniu deal data
# The query joins the deal table (t1) to the car table (t2) on car_id, both
# restricted to the partition ds = date_sub(curr_date, 1), and aliases the
# columns to the shared field names (empty strings for fields it cannot supply).
# NOTE(review): the WHERE clause filters on t2 columns (test_type, vin), so deals
# without a matching car row are dropped even though the join is LEFT — confirm
# that this is intended.
sql_info ="""
select t1.car_id,
       '' area,
       '' source_model_name,
       brand_name,
       brand_code,
       series_name,
       series_code,  
       model_name,
       model_code,
       '' model_year,
       amount/100 real_pay_amount,
       '' emission,
       car_body_color_name color,
       display_mileage/10000 mileage,
       '' use_property,
       first_license_plate_date license_time,
       deal_time publish_time,
       license_plate_province_name a_province_name,
       license_plate_city_name a_city_name,
       '' source_publish_time,
       '' sale_time,
       substr(first_license_plate_date,0,4) license_time_year,
       substr(deal_time,0,4) publish_time_year,
       '' site,
       0 transfer_times,
       t2.car_source,
       t2.report,
       t2.report_h5 
from (select * from dl_cheniu.dl_cheniu_ipmd_car_deal_dd
      where ds = date_sub('"""+curr_date+"""',1)) t1 
left join (select * from dl_cheniu.dl_cheniu_ipmd_car_dd 
           where ds = date_sub('"""+curr_date+"""',1)) t2 
on t1.car_id = t2.car_id 
where t1.order_status != 4 and t2.test_type = 0 
and substr(t2.vin,0,4) != 'TEST'
"""
# NOTE(review): the query selects no city_code column, so this dtype override
# looks like a no-op — confirm.
dtype={'city_code':str}

cheniu_ipmd_car_deal = read_from_hive2('dl_cheniu_ipmd_car_deal_dd',sql_info,dtype)
# Strip a "table." prefix from the returned column labels (second dot-separated
# piece is kept; labels without a dot are left as they are).
db_columns = []
for col in cheniu_ipmd_car_deal.columns:
    if len(col.split('.')) > 1:
        db_columns.append(col.split('.')[1])

    else:
        db_columns.append(col)

cheniu_ipmd_car_deal.columns = db_columns 
logger.log("读取数据完成.............")

# Normalise province names by removing the administrative suffix / ethnic-group
# words listed in the replace() chain.
cheniu_ipmd_car_deal['a_province_name'] = cheniu_ipmd_car_deal['a_province_name'].map(lambda x:str(x).replace('省','').replace('壮族','').\
                                            replace('回族','').replace('自治区','').replace('维吾尔','').\
                                            replace('市',''))

# Re-derive both year columns from the first four characters of the timestamp
# text; this overwrites the substr() results selected by the query.
cheniu_ipmd_car_deal['license_time_year'] = cheniu_ipmd_car_deal['license_time'].map(lambda x:str(x)[:4])
cheniu_ipmd_car_deal['publish_time_year'] = cheniu_ipmd_car_deal['publish_time'].map(lambda x:str(x)[:4])

# Brand / series fields from the deal table are discarded and re-attached from
# model_info through model_code.
for col in ['series_code','brand_code','series_name','brand_name']:
    if col in cheniu_ipmd_car_deal.columns:
        del cheniu_ipmd_car_deal[col] 

cheniu_ipmd_car_deal = pd.merge(cheniu_ipmd_car_deal,model_info,on='model_code',how='left')

# Any shared field still missing is created and filled with 0.
for col in  ['car_id', 'area', 'source_model_name', 'brand_name',
       'brand_code', 'series_name', 'series_code', 'model_name', 'model_code',
       'model_year', 'real_pay_amount', 'emission', 'color', 'mileage',
       'use_property', 'license_time', 'publish_time', 'a_province_name',
       'a_city_name', 'source_publish_time', 'sale_time', 'license_time_year',
       'publish_time_year', 'site', 'transfer_times', 'real_pay_amount_max',
       'real_pay_amount_min', 'count']:
    if col not in cheniu_ipmd_car_deal.columns:
        cheniu_ipmd_car_deal[col] = 0
    
# The query divided display_mileage by 10000; multiply it back here.
cheniu_ipmd_car_deal['mileage'] = cheniu_ipmd_car_deal['mileage'] * 10000
# The query already divided amount by 100; a further /10000 is applied here.
cheniu_ipmd_car_deal['real_pay_amount'] = cheniu_ipmd_car_deal['real_pay_amount']/10000


[2021-02-19 17:32:45,591] {hiveserver2:138} INFO - Using database default as default
[2021-02-19 17:32:45,789] {hive_server_hook:112} INFO - Running query: 
select t1.car_id,
       '' area,
       '' source_model_name,
       brand_name,
       brand_code,
       series_name,
       series_code,  
       model_name,
       model_code,
       '' model_year,
       amount/100 real_pay_amount,
       '' emission,
       car_body_color_name color,
       display_mileage/10000 mileage,
       '' use_property,
       first_license_plate_date license_time,
       deal_time publish_time,
       license_plate_province_name a_province_name,
       license_plate_city_name a_city_name,
       '' source_publish_time,
       '' sale_time,
       substr(first_license_plate_date,0,4) license_time_year,
       substr(deal_time,0,4) publish_time_year,
       '' site,
       0 transfer_times,
       t2.car_source,
       t2.report,
       t2.report_h5 
from (select * from dl_cheniu.dl_cheniu_ipmd_car_de

In [9]:
# Tag every source frame with its origin so rows stay traceable after concatenation
for source_frame, source_tag in (
        (dl_site_ts_order_clean_dfc_sales, 'dfc_sales'),
        (dl_site_ts_order_clean_dfc_purchase, 'dfc_purchase'),
        (dl_site_ts_order_clean_quan, 'quan'),
        (test_df, 'cyp'),
        (cheniu_ipmd_car_deal, 'cheniu')):
    source_frame['data_type'] = source_tag
# Drop the loop aliases so they do not keep the last frame alive after it is deleted
del source_frame, source_tag


In [10]:
# Shared column set used to stack the five sources
get_columns = ['car_id', 'area', 'source_model_name', 'brand_name',
       'brand_code', 'series_name', 'series_code', 'model_name', 'model_code',
       'model_year', 'real_pay_amount', 'emission', 'color', 'mileage',
       'use_property', 'license_time', 'publish_time', 'a_province_name',
       'a_city_name', 'source_publish_time', 'sale_time', 'license_time_year',
       'publish_time_year', 'site', 'transfer_times', 'data_type','delete_flag',
        'delete_flag_1', 'delete_flag_2']

source_frames = [dl_site_ts_order_clean_dfc_sales,
                 dl_site_ts_order_clean_dfc_purchase,
                 dl_site_ts_order_clean_quan,
                 test_df,
                 cheniu_ipmd_car_deal]

# Give every source the delete-flag columns (None when absent). The "quan" frame
# was previously left out of this step, so selecting get_columns from it would
# raise KeyError whenever its CSV lacked the flags.
for frame in source_frames:
    for col in ['delete_flag','delete_flag_1', 'delete_flag_2']:
        if col not in frame.columns:
            frame[col] = None

# Stack the sources row-wise in the fixed order sales, purchase, quan, cyp, cheniu
dl_site_ts_order_clean = pd.concat([frame[get_columns] for frame in source_frames], axis=0)

# Release the helper references so deleting the source frames later frees their memory
del source_frames, frame


In [11]:
# The stacked frame now holds the data; release the per-source frames (test_df is kept)
del (dl_site_ts_order_clean_dfc_sales,
     dl_site_ts_order_clean_dfc_purchase,
     dl_site_ts_order_clean_quan,
     cheniu_ipmd_car_deal)


In [12]:
# Display (rows, columns) of the stacked frame as the cell output
dl_site_ts_order_clean.shape

(9619482, 29)

In [13]:
# Render every publish_time as text and keep only its first 10 characters
dl_site_ts_order_clean['publish_time'] = (
    dl_site_ts_order_clean['publish_time'].map(str).str[:10]
)


In [14]:
# Narrow the stacked frame to the fields used from here on
kept_fields = [
    'car_id', 'brand_name','brand_code', 'series_name', 'series_code',
    'model_name', 'model_code', 'real_pay_amount', 'color', 'mileage',
    'license_time', 'publish_time', 'a_province_name', 'a_city_name',
    'license_time_year','publish_time_year', 'site', 'transfer_times',
    'data_type','delete_flag', 'delete_flag_1', 'delete_flag_2',
]
dl_site_ts_order_clean = dl_site_ts_order_clean[kept_fields]


In [15]:
# Guide price per model taken from the partition ds = date_sub(curr_date, 1)
sql_info ="""
SELECT model_code,guide_price 
FROM db_data.ods_car_model_model_price 
WHERE ds = date_sub('"""+curr_date+"""',1)
"""
dtype={'city_code':str}

model_price = read_from_hive2('model_price ', sql_info, dtype)

# Labels may come back as "table.column": keep the second dot-separated piece,
# and leave labels without a dot as they are.
db_columns = [
    label.split('.')[1] if len(label.split('.')) > 1 else label
    for label in model_price.columns
]
model_price.columns = db_columns
logger.log("读取数据完成.............")


[2021-02-19 17:33:32,629] {hiveserver2:138} INFO - Using database default as default
[2021-02-19 17:33:32,771] {hive_server_hook:112} INFO - Running query: 
SELECT model_code,guide_price 
FROM db_data.ods_car_model_model_price 
WHERE ds = date_sub('2021-02-19',1)

[2021-02-19 17:33:33,110] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-19 17:33:33,229] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-02-19 17:33:33,347] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-02-19 17:33:33,463] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-02-19 17:33:33,574] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-02-19 17:33:33,692] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-02-19 17:33:33,796] {hive_server_hook:162} INFO - Written 68853 rows so far.
[2021-02-19 17:33:33,799] {hiveserver2:265} INFO - Closing active operation
[2021-02-19 17:33:33,816] {hive_server_hook:163} INFO - Done. Loaded a total o

In [16]:
# Attach the guide price, rescale it by 1/10000, and express the paid amount as
# a fraction of it (the residual rate).
dl_site_ts_order_clean = dl_site_ts_order_clean.merge(model_price, on='model_code', how='left')
dl_site_ts_order_clean['guide_price'] = dl_site_ts_order_clean['guide_price'].div(10000)
dl_site_ts_order_clean['residual'] = dl_site_ts_order_clean['real_pay_amount'].div(
    dl_site_ts_order_clean['guide_price']
)


In [17]:
# Drop rows whose licence / trade year is null or the literal text 'nan',
# one column after the other, then cast both columns to int.
for year_col in ('license_time_year', 'publish_time_year'):
    dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
        dl_site_ts_order_clean[year_col].notnull()
        & (dl_site_ts_order_clean[year_col] != 'nan')
    ]

for year_col in ('license_time_year', 'publish_time_year'):
    dl_site_ts_order_clean[year_col] = dl_site_ts_order_clean[year_col].map(int)



# 处理残值率大于1 的数据

In [18]:
# Models with at least one order at or above the guide price (residual >= 1)
over_guide = dl_site_ts_order_clean['residual'] >= 1
residual_ex = list(dl_site_ts_order_clean.loc[over_guide, 'model_code'].unique())
in_flagged_model = dl_site_ts_order_clean['model_code'].isin(residual_ex)

# Per flagged model: number of such orders ...
residual_ex_count = (
    dl_site_ts_order_clean.loc[in_flagged_model & over_guide, ['model_code','car_id']]
    .groupby(['model_code']).count().reset_index()
    .rename(columns={'car_id':'count_ex'})
)
# ... and number of all its orders
residual_count = (
    dl_site_ts_order_clean.loc[in_flagged_model, ['model_code','car_id']]
    .groupby(['model_code']).count().reset_index()
    .rename(columns={'car_id':'count'})
)

residual_count = residual_count.merge(residual_ex_count, on='model_code', how='left')
residual_count['rate'] = residual_count['count_ex'].div(residual_count['count'])

## Models where more than 40% of at least 3 orders sit at or above the guide price
frequent_over_guide = (residual_count['rate'] > 0.4) & (residual_count['count'] >= 3)
model_ex_list_gt_4 = list(residual_count.loc[frequent_over_guide, 'model_code'].unique())

# For those models keep the orders with 1 <= residual < 1.5
dl_site_ts_order_clean_gt1 = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['model_code'].isin(model_ex_list_gt_4)
    & over_guide
    & (dl_site_ts_order_clean['residual'] < 1.5)
]
del over_guide, in_flagged_model, frequent_over_guide

# Everything else is kept only when residual < 1; then add the retained rows back
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['residual'] < 1]
dl_site_ts_order_clean = pd.concat([dl_site_ts_order_clean, dl_site_ts_order_clean_gt1], axis=0)


In [19]:
## 计算平均值和中位数

In [20]:
# Price statistics per (model_code, license_time_year, publish_time_year):
# mean, standard deviation, median and row count of real_pay_amount are computed
# as four separate frames and then left-joined onto the mean frame.
dl_site_ts_order_clean_mean = dl_site_ts_order_clean[['model_code','license_time_year','publish_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year','publish_time_year']).mean().reset_index().rename(columns={"real_pay_amount":"price_mean"})

dl_site_ts_order_clean_std = dl_site_ts_order_clean[['model_code','license_time_year','publish_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year','publish_time_year']).std().reset_index().rename(columns={"real_pay_amount":"price_std"})

dl_site_ts_order_clean_median = dl_site_ts_order_clean[['model_code','license_time_year','publish_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year','publish_time_year']).median().reset_index().rename(columns={"real_pay_amount":"price_median"})

dl_site_ts_order_clean_count = dl_site_ts_order_clean[['model_code','license_time_year','publish_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year','publish_time_year']).count().reset_index().rename(columns={"real_pay_amount":"price_count"})

dl_site_ts_order_clean_mean = pd.merge(dl_site_ts_order_clean_mean,
         dl_site_ts_order_clean_std,
        on=['model_code','license_time_year','publish_time_year'],
        how='left')

dl_site_ts_order_clean_mean = pd.merge(dl_site_ts_order_clean_mean,
         dl_site_ts_order_clean_median,
        on=['model_code','license_time_year','publish_time_year'],
        how='left')

dl_site_ts_order_clean_mean = pd.merge(dl_site_ts_order_clean_mean,
         dl_site_ts_order_clean_count,
        on=['model_code','license_time_year','publish_time_year'],
        how='left')

# Keep only the groups whose standard deviation is not null.
dl_site_ts_order_clean_mean = dl_site_ts_order_clean_mean.loc[~dl_site_ts_order_clean_mean['price_std'].isnull()]

# NOTE(review): the three price statistics are divided by 10000 here, while
# real_pay_amount is divided directly by guide_price/10000 when the residual is
# computed and the labelled sheet names it in units of 10,000 yuan — the extra
# /10000 looks inconsistent; confirm the intended unit.
dl_site_ts_order_clean_mean['price_mean'] = dl_site_ts_order_clean_mean['price_mean'].map(lambda x:round(x/10000,2))
dl_site_ts_order_clean_mean['price_std'] = dl_site_ts_order_clean_mean['price_std'].map(lambda x:round(x/10000,2))
dl_site_ts_order_clean_mean['price_median'] = dl_site_ts_order_clean_mean['price_median'].map(lambda x:round(x/10000,2))


In [21]:
# Residual-rate statistics per (series_code, license_time_year, publish_time_year)
residual_keys = ['series_code','license_time_year','publish_time_year']
residual_grouped = dl_site_ts_order_clean[residual_keys + ['residual']].groupby(residual_keys)

order_residual_mean = residual_grouped.mean().reset_index().rename(columns={"residual":"residual_mean"})
order_residual_std = residual_grouped.std().reset_index().rename(columns={"residual":"residual_std"})
order_residual_median = residual_grouped.median().reset_index().rename(columns={"residual":"residual_median"})
order_residual_count = residual_grouped.count().reset_index().rename(columns={"residual":"residual_count"})
# The grouped view holds a copy of the selected columns; release it right away
del residual_grouped

# Left-join std, median and count onto the mean frame, in that order
for stat_frame in (order_residual_std, order_residual_median, order_residual_count):
    order_residual_mean = pd.merge(order_residual_mean, stat_frame, on=residual_keys, how='left')

# Keep only the groups whose standard deviation is not null
order_residual_mean = order_residual_mean.loc[order_residual_mean['residual_std'].notnull()]

for stat_col in ('residual_mean', 'residual_median'):
    order_residual_mean[stat_col] = order_residual_mean[stat_col].map(lambda value: round(value, 6))



In [22]:
#### ===========训练模型构建特征===============#####

In [23]:
# Exclude orders from the listed sites, then attach the per-model parameter /
# one-hot table through model_code.
excluded_sites = ['taoche','yxpai','baixing']
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    ~dl_site_ts_order_clean['site'].isin(excluded_sites)
]
# Drop model_year from the order frame if it is still present before the merge
if 'model_year' in dl_site_ts_order_clean.columns:
    dl_site_ts_order_clean = dl_site_ts_order_clean.drop(columns=['model_year'])

car = pd.read_csv(curr_dir+"2021-02-02版车型参数及独热编码.csv", header = 0, low_memory=False)
dl_site_ts_order_clean = dl_site_ts_order_clean.merge(car, on='model_code', how='left')


In [24]:
# Keep only the rows whose level is B, C, D or E
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['level'].isin(["B", "C", "D", "E"])
]

dl_site_ts_order_clean.shape


(3675475, 203)

In [25]:
# Rescale mileage by 1/10000 (two decimals), then keep only the rows that have
# both a licence time and a publish time.
dl_site_ts_order_clean['mileage'] = dl_site_ts_order_clean['mileage'].map(
    lambda raw_mileage: round(raw_mileage / 10000, 2)
)

dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['license_time'].notnull()
    & dl_site_ts_order_clean['publish_time'].notnull()
]
dl_site_ts_order_clean.shape

(3675475, 203)

In [26]:
# Display (rows, columns) of the rows that came from the labelled sample (data_type 'cyp')
dl_site_ts_order_clean.loc[dl_site_ts_order_clean['data_type'] == 'cyp'].shape


(25, 203)

In [27]:
## Vehicle age in months
# Month = second '-'-separated piece of each timestamp
for time_col in ('license_time', 'publish_time'):
    dl_site_ts_order_clean[time_col + '_month'] = dl_site_ts_order_clean[time_col].map(
        lambda stamp: int(stamp.split('-')[1])
    )

# (publish year*12 + publish month) - (licence year*12 + licence month) + 1
dl_site_ts_order_clean['license_month'] = (
    (dl_site_ts_order_clean['publish_time_year'].map(int) * 12
     + dl_site_ts_order_clean['publish_time_month'])
    - (dl_site_ts_order_clean['license_time_year'].map(int) * 12
       + dl_site_ts_order_clean['license_time_month'])
    + 1
)


In [28]:
# Keep only the rows that received a model_year from the parameter table
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    dl_site_ts_order_clean['model_year'].notnull()
]

In [29]:
## Derived features
# Mileage per year of age (license_month / 12), two decimals
dl_site_ts_order_clean['per_mile'] = (
    dl_site_ts_order_clean['mileage'] / (dl_site_ts_order_clean['license_month'] / 12)
).round(2)

for year_col in ('license_time_year', 'model_year'):
    dl_site_ts_order_clean[year_col] = dl_site_ts_order_clean[year_col].map(int)

# Licence year minus model year
dl_site_ts_order_clean['year_err'] = dl_site_ts_order_clean['license_time_year'].sub(
    dl_site_ts_order_clean['model_year']
)


In [30]:
# Compute the vehicle's current value-retention rate.
# This repeats the definition from the first cell; when cells run in order this
# later definition is the one in effect.
def computer_with_license_month(tar):
    """Interpolate a retention rate from the per-year ``year_N`` values.

    Parameters
    ----------
    tar : mapping
        Must provide ``'license_month'`` and the ``'year_1'`` .. ``'year_16'``
        entries that are looked up below.

    Returns
    -------
    number
        * ``tar['year_1']`` when ``license_month <= 12``;
        * otherwise the value linearly interpolated between ``year_<k>`` and
          ``year_<k+1>`` (``k = license_month // 12``), rounded to 4 decimals;
        * ``tar['year_16']`` when the lookup or conversion fails (for example
          a NaN month, or a ``year_<k+1>`` key that does not exist).
    """
    try:
        license_month = tar['license_month']
        if license_month <= 12:
            return tar['year_1']

        year = license_month // 12
        # Rate at the start of the current year of age
        keep_max = tar["year_" + str(int(year))]
        # Rate at the start of the following year
        keep_min = tar["year_" + str(int(year + 1))]

        # Months elapsed since the last whole year, and the per-month decay
        mon = license_month - 12 * year
        tem = (keep_max - keep_min) / 12
        return round(keep_max - tem * mon, 4)
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only the expected lookup/conversion failures fall back to the last
        # tabulated year; anything else now propagates instead of being hidden.
        return tar["year_16"]

# One dict per row with the vehicle age and the sixteen per-year rates
retention_inputs = dl_site_ts_order_clean[
    ['license_month'] + ['year_{}'.format(n) for n in range(1, 17)]
].to_dict(orient='records')
print('keep_value 计算开始。。')
dl_site_ts_order_clean['keep_value'] = [
    computer_with_license_month(row) for row in retention_inputs
]
del retention_inputs



keep_value 计算开始。。


In [31]:
## Drop the per-year rate columns now that keep_value has been derived from them
drop_columns = ['year_{}'.format(n) for n in range(1, 17)]
dl_site_ts_order_clean = dl_site_ts_order_clean.drop(drop_columns, axis=1)


In [32]:
## Is the vehicle still inside its warranty limits?
def get_quality(license_month,mileage_std,quality_mile,quality_year):
    """Return 1 when both warranty limits are still unmet, else 0.

    The age in years is ``license_month / 12`` rounded to two decimals. The
    result is 1 only if ``mileage_std < quality_mile`` and that age is below
    ``quality_year``; any comparison that is false (including comparisons
    against NaN) yields 0.
    """
    age_years = round(license_month / 12, 2)
    if not (mileage_std < quality_mile):
        return 0
    if not (age_years < quality_year):
        return 0
    return 1

# Evaluate the warranty flag row by row, then drop the two limit columns
dl_site_ts_order_clean['quality'] = [
    get_quality(age_months, mileage, mile_limit, year_limit)
    for age_months, mileage, mile_limit, year_limit in zip(
        dl_site_ts_order_clean['license_month'],
        dl_site_ts_order_clean['mileage'],
        dl_site_ts_order_clean['quality_mile'],
        dl_site_ts_order_clean['quality_year'])
]
dl_site_ts_order_clean = dl_site_ts_order_clean.drop(columns=['quality_mile', 'quality_year'])


In [33]:
## Residual rate
# Recomputed against new_car_price; this replaces the earlier guide_price-based value.
dl_site_ts_order_clean['residual'] = dl_site_ts_order_clean['real_pay_amount'].div(
    dl_site_ts_order_clean['new_car_price']
)



# 把类别特征转化为编码

In [34]:
# Drop rows whose province is one of the listed leftover fragments or the text 'nan'
unusable_provinces = ['州','江', '庆', '苏', 'nan','川']
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
    ~dl_site_ts_order_clean['a_province_name'].isin(unusable_provinces)
]
# Every row from the cheniu source gets the colour '白色'
dl_site_ts_order_clean.loc[dl_site_ts_order_clean['data_type'] =='cheniu','color'] = '白色'
# Brand and series codes must both be present
for code_col in ('brand_code', 'series_code'):
    dl_site_ts_order_clean = dl_site_ts_order_clean.loc[
        dl_site_ts_order_clean[code_col].notnull()
    ]


In [35]:
# Rewrite three specific brand / series codes to the codes they are mapped to
for code_col, old_code, new_code in (
        ('brand_code', 'brand-889', 'brand-54'),
        ('series_code', 'series-2085-n', 'series-211'),
        ('series_code', 'series-50999', 'series-50035')):
    dl_site_ts_order_clean.loc[dl_site_ts_order_clean[code_col] == old_code, code_col] = new_code

# Category -> index lookup sheet (rows of name / key / value)
transfer_times_df = pd.read_excel('../transfer_times_df_yh01.xlsx')
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['brand_code'].notnull()]
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['series_code'].notnull()]

# Build one key -> value dict per category stored in the lookup sheet
lookup_tables = {}
for table_name in ('transfer_times', 'province_name', 'color', 'brand_code', 'series_code'):
    table_rows = transfer_times_df.loc[transfer_times_df['name'] == table_name, ['key', 'value']]
    lookup_tables[table_name] = {key: value for key, value in np.array(table_rows)}

transfer_times_dict = lookup_tables['transfer_times']
a_province_name_dict = lookup_tables['province_name']
color_dict = lookup_tables['color']
brand_code_dict = lookup_tables['brand_code']
series_code_dict = lookup_tables['series_code']

## Drop rows whose series / brand code is not a key of the lookup dicts
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['series_code'].isin(list(series_code_dict.keys()))]
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['brand_code'].isin(list(brand_code_dict.keys()))]

## Province names
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[~dl_site_ts_order_clean['a_province_name'].isnull()]
# Strip the '省' / '市' suffix in a single pass. The same replacement used to run
# twice, and the first pass called .replace directly on the value, so a
# non-string province crashed before the str()-guarded pass could handle it.
dl_site_ts_order_clean['a_province_name'] = dl_site_ts_order_clean['a_province_name'].map(lambda x:str(x).replace('省','').replace('市',''))
## Province index (raises KeyError for a name missing from the lookup)
dl_site_ts_order_clean['province_name'] = dl_site_ts_order_clean['a_province_name'].map(lambda x:a_province_name_dict[x])
## Brand index
dl_site_ts_order_clean['brand_code_index'] = dl_site_ts_order_clean['brand_code'].map(lambda x:brand_code_dict[x])
## Series index
dl_site_ts_order_clean['series_code_index'] = dl_site_ts_order_clean['series_code'].map(lambda x:series_code_dict[x])

## 处理颜色和过户次数
color_list = ['多彩色','粉红色','冰川白','金色','香槟金','银色','橙色','绿色','咖啡色','黄色','紫色','灰色','棕色','深灰色','香槟色','蓝色',
'银灰色','红色','其他','黑色','白色']
dl_site_ts_order_clean.loc[dl_site_ts_order_clean['color'].isin(['不详','其他色', '其它色','其它','其他','—']),'color'] = '其他'

dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['color'].isin(color_list)]

dl_site_ts_order_clean['transfer_times'] = dl_site_ts_order_clean['transfer_times'].fillna(-1)
def transfer_times_fun(x):
    """Convert a raw transfer-count value to ``int``.

    Args:
        x: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        ``int(x)``, or ``None`` when the value cannot be converted
        (non-numeric text, ``None``, NaN, infinity).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no value"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return None

# Coerce the transfer count to int (values that cannot be converted come back
# as missing), cap it at 5, and keep only rows whose count is one of -1..5.
dl_site_ts_order_clean['transfer_times'] = dl_site_ts_order_clean['transfer_times'].map(transfer_times_fun)
rows_over_cap = dl_site_ts_order_clean['transfer_times'] >= 5
dl_site_ts_order_clean.loc[rows_over_cap, 'transfer_times'] = 5
allowed_transfer_counts = [-1, 0, 1, 2, 3, 4, 5]
dl_site_ts_order_clean = dl_site_ts_order_clean[
    dl_site_ts_order_clean['transfer_times'].isin(allowed_transfer_counts)
]
## Transfer-count code
dl_site_ts_order_clean['transfer_times_index'] = dl_site_ts_order_clean['transfer_times'].map(
    lambda count: transfer_times_dict[count])
## Colour code
dl_site_ts_order_clean['color_index'] = dl_site_ts_order_clean['color'].map(
    lambda shade: color_dict[shade])


In [36]:
## Row filters: a brand code must be present, the car must be older than one
## month, and per_mile must lie in [0.01, 10].
rows_to_keep = (
    dl_site_ts_order_clean['brand_code'].notnull()
    & (dl_site_ts_order_clean['license_month'] > 1)
    & dl_site_ts_order_clean['per_mile'].between(0.01, 10)
)
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[rows_to_keep]


In [37]:
## Cast both year columns to int first, so every value derived from them below
## sees the same representation. Previously publish_time_year was formatted
## with a bare str(), which turns a float year into "2020.0-01"; that string
## cannot be parsed, date_time_sub then returns None, and the rounding step
## fails on None.
dl_site_ts_order_clean['publish_time_year'] = dl_site_ts_order_clean['publish_time_year'].map(lambda x:int(x))
dl_site_ts_order_clean['license_time_year'] = dl_site_ts_order_clean['license_time_year'].map(lambda x:int(x))

## Minimum car age implied by the license year and the transaction year:
## days from December of the license year to January of the publish year,
## converted to months by dividing by 30 and rounding.
dl_site_ts_order_clean['license_month_min'] = list(map(lambda x,y:date_time_sub(x,y,"%Y-%m"),dl_site_ts_order_clean['license_time_year'].map(lambda x:str(int(x))+"-12"),dl_site_ts_order_clean['publish_time_year'].map(lambda x:str(int(x))+'-01')))
dl_site_ts_order_clean['license_month_min'] = dl_site_ts_order_clean['license_month_min'].map(lambda x:round(x/30))
# How far the recorded age in months exceeds that minimum.
dl_site_ts_order_clean['license_month_cha'] = (dl_site_ts_order_clean['license_month'] - dl_site_ts_order_clean['license_month_min'])

# Age in whole calendar years.
dl_site_ts_order_clean['car_years'] = dl_site_ts_order_clean['publish_time_year'] - dl_site_ts_order_clean['license_time_year']

# Difference between the publish month (characters 5-6 of publish_time) and the
# license month (second '-'-separated field of license_time).
dl_site_ts_order_clean['months_cha'] = dl_site_ts_order_clean['publish_time'].map(lambda x:int(x[5:7])) - dl_site_ts_order_clean['license_time'].map(lambda x:int(x.split('-')[1]))



In [38]:
## Natural-log transforms of the age in months and of the mileage
for source_col, log_col in (('license_month', 'license_month_log'),
                            ('mileage', 'mileage_log')):
    dl_site_ts_order_clean[log_col] = dl_site_ts_order_clean[source_col].map(math.log)


In [None]:
## Order age: months between the publish date and the fixed anchor 2021-12-31
def _months_before_anchor(publish_time):
    # Only the leading YYYY-MM-DD part of the timestamp is used.
    days = date_time_sub(str(publish_time)[:10], '2021-12-31', "%Y-%m-%d")
    return round(days / 30)

dl_site_ts_order_clean['months'] = dl_site_ts_order_clean['publish_time'].map(_months_before_anchor)



In [40]:
# Model input columns, in training order: the scalar features written out
# explicitly, followed by the numbered column families generated as
# "<prefix><index>". Province enters only through the single `province_name`
# column; no numbered province_name0..30 columns are included.
# (`license_month` itself is not a feature.)
fearture_columns = [
    'months', 'mileage_log', 'car_years', 'months_cha',
    'new_car_price', 'model_year', 'rate',
    'rate_count', 'per_mile', 'year_err', 'province_name',
    'license_time_year', 'publish_time_year',
    'brand_code_index', 'series_code_index',
    'transfer_times_index', 'color_index',
    'keep_value', 'quality',
    'wheel_base', 'length', 'height', 'width',
    'max_torque', 'max_power', 'engine_volume_l', 'cylinder_number',
    'seat_number_top',
] + [
    '%s%d' % (prefix, idx)
    for prefix, width in (
        ('driving_mode', 10),
        ('gear_box_type', 11),
        ('country_id', 15),
        ('import_type', 2),
        ('intake_type', 8),
        ('fuel_form', 9),
        ('car_body', 12),
        ('series_level', 62),
        ('wordvec', 100),
    )
    for idx in range(width)
]

# Regression target column.
label = 'residual'


In [41]:
## Add the model-name encoding features: load the TF-IDF-weighted word vectors
## (model_code is read as a string), drop the name column, and left-join them
## onto the orders by model_code.
vec = (
    pd.read_csv(curr_dir + "2021-02-02版-TF-IDF权重的词向量.csv", header=0, dtype={'model_code': str})
    .drop(['model_name'], axis=1)
)

dl_site_ts_order_clean = dl_site_ts_order_clean.merge(vec, how='left', on='model_code')


In [42]:
## Select the modelling data
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['per_mile'] <= 5]

# Both splits keep only rows whose data_type is 'quan'; they differ in the
# publish_time window (compared as strings):
#   train: after '2019-01-01' and before '2020-12-01'
#   test : from '2020-12-01' (inclusive) up to, but excluding, '2021-01-01'
_publish_time = dl_site_ts_order_clean['publish_time']
_is_quan = dl_site_ts_order_clean['data_type'].isin(['quan'])

train_data = dl_site_ts_order_clean.loc[
    (_publish_time > '2019-01-01') & (_publish_time < '2020-12-01') & _is_quan
]
test_data = dl_site_ts_order_clean.loc[
    (_publish_time >= '2020-12-01') & (_publish_time < '2021-01-01') & _is_quan
]


In [43]:
# Sanity check: (rows, columns) of the train and test splits, then of the
# full filtered frame they were taken from.
print(train_data.shape,test_data.shape)
print(dl_site_ts_order_clean.shape)


(2815976, 304) (55159, 304)
(3590353, 304)


In [45]:
import lightgbm as lgb

# Gradient-boosted regressor predicting `label` from `fearture_columns`.
gbm = lgb.LGBMRegressor(boosting_type = 'gbdt',
                        objective = 'regression',
                        learning_rate = 0.03,
                        n_estimators = 3000,
                        max_depth = 12,
                        num_leaves = 100,
                        subsample = 1,
                        colsample_bytree = 0.34,
                        min_child_samples = 110,
                        n_jobs = 8,
                        verbose = 1)

# NOTE(review): `test_data` drives early stopping here and is also the frame
# scored afterwards as the test set, so those scores are optimistically
# biased - consider a separate validation split.
gbm.fit(train_data[fearture_columns],
        train_data[label],
        # Documented form: a list of (X, y) pairs.
        eval_set = [(test_data[fearture_columns],
                     test_data[label])],
        # A list, not a set: a set has no defined order (so metric order could
        # change between runs) and is not one of the documented types for
        # eval_metric (str, callable, list or None).
        eval_metric = ['l1', 'l2'],
        early_stopping_rounds = 50,
        verbose=True,
        # Integer-coded columns that LightGBM should treat as categorical.
        categorical_feature=['province_name','model_year', 'publish_time_year','quality',
                            'brand_code_index','series_code_index','transfer_times_index','color_index']
       )


[1]	valid_0's l1: 0.207167	valid_0's l2: 0.0639764
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l1: 0.201653	valid_0's l2: 0.0607072
[3]	valid_0's l1: 0.196799	valid_0's l2: 0.0579445
[4]	valid_0's l1: 0.191864	valid_0's l2: 0.0552061
[5]	valid_0's l1: 0.186797	valid_0's l2: 0.0524337
[6]	valid_0's l1: 0.181726	valid_0's l2: 0.049732
[7]	valid_0's l1: 0.176981	valid_0's l2: 0.0472607
[8]	valid_0's l1: 0.172315	valid_0's l2: 0.0448847
[9]	valid_0's l1: 0.1676	valid_0's l2: 0.0425261
[10]	valid_0's l1: 0.163124	valid_0's l2: 0.0403579
[11]	valid_0's l1: 0.158719	valid_0's l2: 0.0382932
[12]	valid_0's l1: 0.154442	valid_0's l2: 0.0363207
[13]	valid_0's l1: 0.150693	valid_0's l2: 0.0346537
[14]	valid_0's l1: 0.147689	valid_0's l2: 0.0333325
[15]	valid_0's l1: 0.144141	valid_0's l2: 0.0318064
[16]	valid_0's l1: 0.140683	valid_0's l2: 0.0303745
[17]	valid_0's l1: 0.137316	valid_0's l2: 0.0289861
[18]	valid_0's l1: 0.134074	valid_0's l2: 0.0276986
[19]	valid_0's

[153]	valid_0's l1: 0.0348597	valid_0's l2: 0.00241576
[154]	valid_0's l1: 0.0348067	valid_0's l2: 0.00240972
[155]	valid_0's l1: 0.0347466	valid_0's l2: 0.00240259
[156]	valid_0's l1: 0.0346426	valid_0's l2: 0.00238764
[157]	valid_0's l1: 0.0345555	valid_0's l2: 0.00237564
[158]	valid_0's l1: 0.0344948	valid_0's l2: 0.00236819
[159]	valid_0's l1: 0.0344032	valid_0's l2: 0.00235609
[160]	valid_0's l1: 0.0342804	valid_0's l2: 0.00234138
[161]	valid_0's l1: 0.0342122	valid_0's l2: 0.00233232
[162]	valid_0's l1: 0.0341831	valid_0's l2: 0.00232912
[163]	valid_0's l1: 0.0341015	valid_0's l2: 0.00231759
[164]	valid_0's l1: 0.0340195	valid_0's l2: 0.00230668
[165]	valid_0's l1: 0.0338908	valid_0's l2: 0.00229128
[166]	valid_0's l1: 0.0337678	valid_0's l2: 0.00227532
[167]	valid_0's l1: 0.0337319	valid_0's l2: 0.00227022
[168]	valid_0's l1: 0.0336504	valid_0's l2: 0.00226055
[169]	valid_0's l1: 0.0336011	valid_0's l2: 0.0022551
[170]	valid_0's l1: 0.0335416	valid_0's l2: 0.00224797
[171]	valid

[303]	valid_0's l1: 0.0288168	valid_0's l2: 0.00170097
[304]	valid_0's l1: 0.028805	valid_0's l2: 0.00169989
[305]	valid_0's l1: 0.028781	valid_0's l2: 0.00169741
[306]	valid_0's l1: 0.0287637	valid_0's l2: 0.00169527
[307]	valid_0's l1: 0.0287461	valid_0's l2: 0.00169349
[308]	valid_0's l1: 0.0287241	valid_0's l2: 0.001691
[309]	valid_0's l1: 0.0287185	valid_0's l2: 0.00169024
[310]	valid_0's l1: 0.0286992	valid_0's l2: 0.00168772
[311]	valid_0's l1: 0.0286758	valid_0's l2: 0.00168476
[312]	valid_0's l1: 0.0286493	valid_0's l2: 0.00168186
[313]	valid_0's l1: 0.0286357	valid_0's l2: 0.00168063
[314]	valid_0's l1: 0.0286164	valid_0's l2: 0.00167847
[315]	valid_0's l1: 0.028613	valid_0's l2: 0.00167809
[316]	valid_0's l1: 0.0285863	valid_0's l2: 0.00167532
[317]	valid_0's l1: 0.0285643	valid_0's l2: 0.00167264
[318]	valid_0's l1: 0.028553	valid_0's l2: 0.00167155
[319]	valid_0's l1: 0.028537	valid_0's l2: 0.00166963
[320]	valid_0's l1: 0.0285139	valid_0's l2: 0.00166734
[321]	valid_0's l

[453]	valid_0's l1: 0.0267446	valid_0's l2: 0.00148025
[454]	valid_0's l1: 0.0267398	valid_0's l2: 0.00147978
[455]	valid_0's l1: 0.0267307	valid_0's l2: 0.00147887
[456]	valid_0's l1: 0.0267198	valid_0's l2: 0.00147764
[457]	valid_0's l1: 0.0267125	valid_0's l2: 0.00147679
[458]	valid_0's l1: 0.026705	valid_0's l2: 0.00147603
[459]	valid_0's l1: 0.0266991	valid_0's l2: 0.00147485
[460]	valid_0's l1: 0.0266958	valid_0's l2: 0.00147464
[461]	valid_0's l1: 0.0266836	valid_0's l2: 0.00147332
[462]	valid_0's l1: 0.0266777	valid_0's l2: 0.00147262
[463]	valid_0's l1: 0.0266718	valid_0's l2: 0.00147204
[464]	valid_0's l1: 0.0266666	valid_0's l2: 0.00147159
[465]	valid_0's l1: 0.0266589	valid_0's l2: 0.00147081
[466]	valid_0's l1: 0.0266456	valid_0's l2: 0.00146954
[467]	valid_0's l1: 0.0266402	valid_0's l2: 0.00146909
[468]	valid_0's l1: 0.0266331	valid_0's l2: 0.0014681
[469]	valid_0's l1: 0.0266215	valid_0's l2: 0.00146684
[470]	valid_0's l1: 0.0266195	valid_0's l2: 0.0014666
[471]	valid_0

[603]	valid_0's l1: 0.0256005	valid_0's l2: 0.00136349
[604]	valid_0's l1: 0.0255964	valid_0's l2: 0.00136305
[605]	valid_0's l1: 0.0255887	valid_0's l2: 0.00136244
[606]	valid_0's l1: 0.0255818	valid_0's l2: 0.0013619
[607]	valid_0's l1: 0.0255789	valid_0's l2: 0.0013617
[608]	valid_0's l1: 0.0255652	valid_0's l2: 0.00136056
[609]	valid_0's l1: 0.0255512	valid_0's l2: 0.0013593
[610]	valid_0's l1: 0.0255392	valid_0's l2: 0.00135837
[611]	valid_0's l1: 0.0255304	valid_0's l2: 0.00135757
[612]	valid_0's l1: 0.0255264	valid_0's l2: 0.00135697
[613]	valid_0's l1: 0.0255177	valid_0's l2: 0.00135617
[614]	valid_0's l1: 0.0255131	valid_0's l2: 0.00135565
[615]	valid_0's l1: 0.0255081	valid_0's l2: 0.00135521
[616]	valid_0's l1: 0.0254982	valid_0's l2: 0.00135421
[617]	valid_0's l1: 0.0254932	valid_0's l2: 0.00135375
[618]	valid_0's l1: 0.0254914	valid_0's l2: 0.00135359
[619]	valid_0's l1: 0.0254857	valid_0's l2: 0.00135326
[620]	valid_0's l1: 0.025479	valid_0's l2: 0.00135259
[621]	valid_0'

[753]	valid_0's l1: 0.0248085	valid_0's l2: 0.00128635
[754]	valid_0's l1: 0.0248039	valid_0's l2: 0.00128579
[755]	valid_0's l1: 0.024798	valid_0's l2: 0.00128527
[756]	valid_0's l1: 0.0247964	valid_0's l2: 0.00128496
[757]	valid_0's l1: 0.0247965	valid_0's l2: 0.00128486
[758]	valid_0's l1: 0.0247944	valid_0's l2: 0.00128449
[759]	valid_0's l1: 0.0247933	valid_0's l2: 0.00128437
[760]	valid_0's l1: 0.0247901	valid_0's l2: 0.00128393
[761]	valid_0's l1: 0.0247887	valid_0's l2: 0.0012838
[762]	valid_0's l1: 0.0247762	valid_0's l2: 0.00128293
[763]	valid_0's l1: 0.0247715	valid_0's l2: 0.00128245
[764]	valid_0's l1: 0.024769	valid_0's l2: 0.00128223
[765]	valid_0's l1: 0.0247653	valid_0's l2: 0.00128185
[766]	valid_0's l1: 0.0247636	valid_0's l2: 0.00128162
[767]	valid_0's l1: 0.0247619	valid_0's l2: 0.00128141
[768]	valid_0's l1: 0.0247584	valid_0's l2: 0.00128107
[769]	valid_0's l1: 0.024753	valid_0's l2: 0.00128067
[770]	valid_0's l1: 0.024751	valid_0's l2: 0.00128049
[771]	valid_0's

[903]	valid_0's l1: 0.0243526	valid_0's l2: 0.00124165
[904]	valid_0's l1: 0.0243497	valid_0's l2: 0.00124126
[905]	valid_0's l1: 0.0243479	valid_0's l2: 0.00124108
[906]	valid_0's l1: 0.0243468	valid_0's l2: 0.00124098
[907]	valid_0's l1: 0.0243443	valid_0's l2: 0.00124069
[908]	valid_0's l1: 0.0243406	valid_0's l2: 0.00124028
[909]	valid_0's l1: 0.0243369	valid_0's l2: 0.00124001
[910]	valid_0's l1: 0.0243352	valid_0's l2: 0.00123979
[911]	valid_0's l1: 0.0243275	valid_0's l2: 0.00123916
[912]	valid_0's l1: 0.0243208	valid_0's l2: 0.00123854
[913]	valid_0's l1: 0.0243182	valid_0's l2: 0.0012383
[914]	valid_0's l1: 0.0243168	valid_0's l2: 0.00123813
[915]	valid_0's l1: 0.0243159	valid_0's l2: 0.00123806
[916]	valid_0's l1: 0.0243083	valid_0's l2: 0.00123732
[917]	valid_0's l1: 0.0243071	valid_0's l2: 0.00123708
[918]	valid_0's l1: 0.0243036	valid_0's l2: 0.00123676
[919]	valid_0's l1: 0.0242987	valid_0's l2: 0.00123629
[920]	valid_0's l1: 0.0242955	valid_0's l2: 0.00123601
[921]	valid

[1052]	valid_0's l1: 0.0240164	valid_0's l2: 0.00120872
[1053]	valid_0's l1: 0.0240155	valid_0's l2: 0.00120856
[1054]	valid_0's l1: 0.0240124	valid_0's l2: 0.00120818
[1055]	valid_0's l1: 0.0240111	valid_0's l2: 0.001208
[1056]	valid_0's l1: 0.0240086	valid_0's l2: 0.00120776
[1057]	valid_0's l1: 0.024008	valid_0's l2: 0.00120769
[1058]	valid_0's l1: 0.0240034	valid_0's l2: 0.00120728
[1059]	valid_0's l1: 0.0240019	valid_0's l2: 0.00120713
[1060]	valid_0's l1: 0.0240015	valid_0's l2: 0.0012071
[1061]	valid_0's l1: 0.0239987	valid_0's l2: 0.00120691
[1062]	valid_0's l1: 0.0239978	valid_0's l2: 0.00120683
[1063]	valid_0's l1: 0.0239965	valid_0's l2: 0.00120672
[1064]	valid_0's l1: 0.0239951	valid_0's l2: 0.00120656
[1065]	valid_0's l1: 0.0239939	valid_0's l2: 0.00120644
[1066]	valid_0's l1: 0.0239933	valid_0's l2: 0.00120634
[1067]	valid_0's l1: 0.0239912	valid_0's l2: 0.0012062
[1068]	valid_0's l1: 0.0239887	valid_0's l2: 0.00120603
[1069]	valid_0's l1: 0.0239884	valid_0's l2: 0.001205

[1201]	valid_0's l1: 0.0237876	valid_0's l2: 0.00118701
[1202]	valid_0's l1: 0.0237859	valid_0's l2: 0.00118691
[1203]	valid_0's l1: 0.0237835	valid_0's l2: 0.00118665
[1204]	valid_0's l1: 0.0237829	valid_0's l2: 0.00118657
[1205]	valid_0's l1: 0.0237816	valid_0's l2: 0.00118644
[1206]	valid_0's l1: 0.0237794	valid_0's l2: 0.00118624
[1207]	valid_0's l1: 0.0237781	valid_0's l2: 0.0011861
[1208]	valid_0's l1: 0.0237763	valid_0's l2: 0.00118579
[1209]	valid_0's l1: 0.0237754	valid_0's l2: 0.0011857
[1210]	valid_0's l1: 0.0237737	valid_0's l2: 0.00118556
[1211]	valid_0's l1: 0.0237705	valid_0's l2: 0.0011853
[1212]	valid_0's l1: 0.023769	valid_0's l2: 0.0011852
[1213]	valid_0's l1: 0.0237685	valid_0's l2: 0.00118515
[1214]	valid_0's l1: 0.0237675	valid_0's l2: 0.00118507
[1215]	valid_0's l1: 0.0237669	valid_0's l2: 0.00118501
[1216]	valid_0's l1: 0.0237666	valid_0's l2: 0.001185
[1217]	valid_0's l1: 0.0237653	valid_0's l2: 0.00118479
[1218]	valid_0's l1: 0.0237649	valid_0's l2: 0.00118473

[1348]	valid_0's l1: 0.0236003	valid_0's l2: 0.00116885
[1349]	valid_0's l1: 0.0235992	valid_0's l2: 0.00116877
[1350]	valid_0's l1: 0.023598	valid_0's l2: 0.00116859
[1351]	valid_0's l1: 0.0235975	valid_0's l2: 0.00116849
[1352]	valid_0's l1: 0.0235968	valid_0's l2: 0.00116846
[1353]	valid_0's l1: 0.0235962	valid_0's l2: 0.00116839
[1354]	valid_0's l1: 0.0235946	valid_0's l2: 0.00116824
[1355]	valid_0's l1: 0.0235941	valid_0's l2: 0.00116817
[1356]	valid_0's l1: 0.0235931	valid_0's l2: 0.00116809
[1357]	valid_0's l1: 0.0235895	valid_0's l2: 0.00116786
[1358]	valid_0's l1: 0.0235888	valid_0's l2: 0.00116782
[1359]	valid_0's l1: 0.0235883	valid_0's l2: 0.00116778
[1360]	valid_0's l1: 0.0235884	valid_0's l2: 0.00116779
[1361]	valid_0's l1: 0.0235876	valid_0's l2: 0.00116772
[1362]	valid_0's l1: 0.0235803	valid_0's l2: 0.00116715
[1363]	valid_0's l1: 0.0235796	valid_0's l2: 0.00116704
[1364]	valid_0's l1: 0.0235779	valid_0's l2: 0.00116684
[1365]	valid_0's l1: 0.0235774	valid_0's l2: 0.00

[1495]	valid_0's l1: 0.0234414	valid_0's l2: 0.00115357
[1496]	valid_0's l1: 0.0234405	valid_0's l2: 0.00115346
[1497]	valid_0's l1: 0.0234394	valid_0's l2: 0.0011533
[1498]	valid_0's l1: 0.0234393	valid_0's l2: 0.00115331
[1499]	valid_0's l1: 0.0234385	valid_0's l2: 0.00115325
[1500]	valid_0's l1: 0.0234377	valid_0's l2: 0.00115312
[1501]	valid_0's l1: 0.0234376	valid_0's l2: 0.00115312
[1502]	valid_0's l1: 0.0234376	valid_0's l2: 0.00115305
[1503]	valid_0's l1: 0.023437	valid_0's l2: 0.00115301
[1504]	valid_0's l1: 0.0234368	valid_0's l2: 0.00115301
[1505]	valid_0's l1: 0.0234355	valid_0's l2: 0.0011529
[1506]	valid_0's l1: 0.023435	valid_0's l2: 0.00115286
[1507]	valid_0's l1: 0.0234347	valid_0's l2: 0.00115282
[1508]	valid_0's l1: 0.0234341	valid_0's l2: 0.00115278
[1509]	valid_0's l1: 0.0234329	valid_0's l2: 0.00115266
[1510]	valid_0's l1: 0.0234309	valid_0's l2: 0.00115249
[1511]	valid_0's l1: 0.0234305	valid_0's l2: 0.00115243
[1512]	valid_0's l1: 0.0234299	valid_0's l2: 0.00115

[1643]	valid_0's l1: 0.0233198	valid_0's l2: 0.0011421
[1644]	valid_0's l1: 0.0233184	valid_0's l2: 0.00114198
[1645]	valid_0's l1: 0.0233185	valid_0's l2: 0.00114197
[1646]	valid_0's l1: 0.0233184	valid_0's l2: 0.00114195
[1647]	valid_0's l1: 0.0233173	valid_0's l2: 0.00114185
[1648]	valid_0's l1: 0.0233162	valid_0's l2: 0.00114176
[1649]	valid_0's l1: 0.0233151	valid_0's l2: 0.00114161
[1650]	valid_0's l1: 0.0233147	valid_0's l2: 0.00114158
[1651]	valid_0's l1: 0.0233139	valid_0's l2: 0.00114152
[1652]	valid_0's l1: 0.0233133	valid_0's l2: 0.00114146
[1653]	valid_0's l1: 0.0233122	valid_0's l2: 0.00114136
[1654]	valid_0's l1: 0.023311	valid_0's l2: 0.00114132
[1655]	valid_0's l1: 0.0233099	valid_0's l2: 0.00114123
[1656]	valid_0's l1: 0.0233089	valid_0's l2: 0.0011411
[1657]	valid_0's l1: 0.0233076	valid_0's l2: 0.00114096
[1658]	valid_0's l1: 0.0233062	valid_0's l2: 0.00114083
[1659]	valid_0's l1: 0.0233059	valid_0's l2: 0.00114082
[1660]	valid_0's l1: 0.0233036	valid_0's l2: 0.0011

[1790]	valid_0's l1: 0.0232113	valid_0's l2: 0.00113121
[1791]	valid_0's l1: 0.0232112	valid_0's l2: 0.00113123
[1792]	valid_0's l1: 0.0232102	valid_0's l2: 0.00113115
[1793]	valid_0's l1: 0.0232097	valid_0's l2: 0.00113111
[1794]	valid_0's l1: 0.0232095	valid_0's l2: 0.00113106
[1795]	valid_0's l1: 0.0232091	valid_0's l2: 0.00113098
[1796]	valid_0's l1: 0.0232086	valid_0's l2: 0.00113092
[1797]	valid_0's l1: 0.0232085	valid_0's l2: 0.00113092
[1798]	valid_0's l1: 0.0232077	valid_0's l2: 0.00113088
[1799]	valid_0's l1: 0.0232068	valid_0's l2: 0.00113083
[1800]	valid_0's l1: 0.0232067	valid_0's l2: 0.00113082
[1801]	valid_0's l1: 0.0232065	valid_0's l2: 0.00113078
[1802]	valid_0's l1: 0.0232059	valid_0's l2: 0.00113073
[1803]	valid_0's l1: 0.0232057	valid_0's l2: 0.00113071
[1804]	valid_0's l1: 0.0232054	valid_0's l2: 0.00113068
[1805]	valid_0's l1: 0.0232049	valid_0's l2: 0.00113063
[1806]	valid_0's l1: 0.0232043	valid_0's l2: 0.00113058
[1807]	valid_0's l1: 0.0232003	valid_0's l2: 0.0

[1938]	valid_0's l1: 0.0231032	valid_0's l2: 0.00112201
[1939]	valid_0's l1: 0.0231029	valid_0's l2: 0.00112196
[1940]	valid_0's l1: 0.0231021	valid_0's l2: 0.00112186
[1941]	valid_0's l1: 0.0231012	valid_0's l2: 0.00112178
[1942]	valid_0's l1: 0.023101	valid_0's l2: 0.00112175
[1943]	valid_0's l1: 0.0231003	valid_0's l2: 0.00112171
[1944]	valid_0's l1: 0.0230998	valid_0's l2: 0.00112167
[1945]	valid_0's l1: 0.0230997	valid_0's l2: 0.00112167
[1946]	valid_0's l1: 0.0230982	valid_0's l2: 0.00112157
[1947]	valid_0's l1: 0.0230976	valid_0's l2: 0.00112152
[1948]	valid_0's l1: 0.0230975	valid_0's l2: 0.00112152
[1949]	valid_0's l1: 0.0230967	valid_0's l2: 0.00112143
[1950]	valid_0's l1: 0.023096	valid_0's l2: 0.00112138
[1951]	valid_0's l1: 0.0230938	valid_0's l2: 0.00112124
[1952]	valid_0's l1: 0.0230934	valid_0's l2: 0.0011212
[1953]	valid_0's l1: 0.0230929	valid_0's l2: 0.00112117
[1954]	valid_0's l1: 0.0230928	valid_0's l2: 0.00112116
[1955]	valid_0's l1: 0.0230921	valid_0's l2: 0.0011

[2085]	valid_0's l1: 0.0230244	valid_0's l2: 0.00111514
[2086]	valid_0's l1: 0.0230231	valid_0's l2: 0.00111498
[2087]	valid_0's l1: 0.0230228	valid_0's l2: 0.00111494
[2088]	valid_0's l1: 0.0230222	valid_0's l2: 0.00111487
[2089]	valid_0's l1: 0.0230218	valid_0's l2: 0.00111481
[2090]	valid_0's l1: 0.023021	valid_0's l2: 0.00111472
[2091]	valid_0's l1: 0.0230207	valid_0's l2: 0.00111469
[2092]	valid_0's l1: 0.023021	valid_0's l2: 0.00111472
[2093]	valid_0's l1: 0.0230204	valid_0's l2: 0.00111468
[2094]	valid_0's l1: 0.0230193	valid_0's l2: 0.00111458
[2095]	valid_0's l1: 0.0230189	valid_0's l2: 0.00111453
[2096]	valid_0's l1: 0.0230178	valid_0's l2: 0.00111445
[2097]	valid_0's l1: 0.0230176	valid_0's l2: 0.00111443
[2098]	valid_0's l1: 0.0230169	valid_0's l2: 0.00111437
[2099]	valid_0's l1: 0.0230168	valid_0's l2: 0.00111437
[2100]	valid_0's l1: 0.0230165	valid_0's l2: 0.00111434
[2101]	valid_0's l1: 0.023016	valid_0's l2: 0.0011143
[2102]	valid_0's l1: 0.0230159	valid_0's l2: 0.00111

[2233]	valid_0's l1: 0.0229474	valid_0's l2: 0.00110812
[2234]	valid_0's l1: 0.0229471	valid_0's l2: 0.00110807
[2235]	valid_0's l1: 0.0229471	valid_0's l2: 0.00110809
[2236]	valid_0's l1: 0.0229464	valid_0's l2: 0.00110805
[2237]	valid_0's l1: 0.0229459	valid_0's l2: 0.00110801
[2238]	valid_0's l1: 0.0229455	valid_0's l2: 0.00110797
[2239]	valid_0's l1: 0.022945	valid_0's l2: 0.00110793
[2240]	valid_0's l1: 0.0229447	valid_0's l2: 0.0011079
[2241]	valid_0's l1: 0.0229445	valid_0's l2: 0.00110789
[2242]	valid_0's l1: 0.0229444	valid_0's l2: 0.00110787
[2243]	valid_0's l1: 0.0229436	valid_0's l2: 0.00110788
[2244]	valid_0's l1: 0.0229433	valid_0's l2: 0.00110785
[2245]	valid_0's l1: 0.0229405	valid_0's l2: 0.00110767
[2246]	valid_0's l1: 0.0229404	valid_0's l2: 0.00110765
[2247]	valid_0's l1: 0.0229401	valid_0's l2: 0.00110763
[2248]	valid_0's l1: 0.0229394	valid_0's l2: 0.0011076
[2249]	valid_0's l1: 0.0229381	valid_0's l2: 0.0011075
[2250]	valid_0's l1: 0.0229373	valid_0's l2: 0.00110

[2380]	valid_0's l1: 0.0228896	valid_0's l2: 0.00110324
[2381]	valid_0's l1: 0.0228893	valid_0's l2: 0.00110321
[2382]	valid_0's l1: 0.0228892	valid_0's l2: 0.00110319
[2383]	valid_0's l1: 0.0228891	valid_0's l2: 0.00110318
[2384]	valid_0's l1: 0.0228888	valid_0's l2: 0.00110315
[2385]	valid_0's l1: 0.0228885	valid_0's l2: 0.00110312
[2386]	valid_0's l1: 0.0228878	valid_0's l2: 0.00110304
[2387]	valid_0's l1: 0.0228877	valid_0's l2: 0.00110301
[2388]	valid_0's l1: 0.0228878	valid_0's l2: 0.001103
[2389]	valid_0's l1: 0.0228877	valid_0's l2: 0.00110299
[2390]	valid_0's l1: 0.0228874	valid_0's l2: 0.00110296
[2391]	valid_0's l1: 0.0228871	valid_0's l2: 0.00110293
[2392]	valid_0's l1: 0.0228864	valid_0's l2: 0.00110289
[2393]	valid_0's l1: 0.0228857	valid_0's l2: 0.00110282
[2394]	valid_0's l1: 0.0228851	valid_0's l2: 0.00110274
[2395]	valid_0's l1: 0.0228842	valid_0's l2: 0.00110267
[2396]	valid_0's l1: 0.0228839	valid_0's l2: 0.00110265
[2397]	valid_0's l1: 0.0228835	valid_0's l2: 0.001

[2528]	valid_0's l1: 0.0228283	valid_0's l2: 0.00109799
[2529]	valid_0's l1: 0.0228281	valid_0's l2: 0.00109796
[2530]	valid_0's l1: 0.022828	valid_0's l2: 0.00109795
[2531]	valid_0's l1: 0.0228275	valid_0's l2: 0.00109791
[2532]	valid_0's l1: 0.0228269	valid_0's l2: 0.00109788
[2533]	valid_0's l1: 0.0228267	valid_0's l2: 0.00109786
[2534]	valid_0's l1: 0.0228263	valid_0's l2: 0.00109784
[2535]	valid_0's l1: 0.0228264	valid_0's l2: 0.00109784
[2536]	valid_0's l1: 0.0228262	valid_0's l2: 0.00109783
[2537]	valid_0's l1: 0.0228259	valid_0's l2: 0.00109781
[2538]	valid_0's l1: 0.0228253	valid_0's l2: 0.00109774
[2539]	valid_0's l1: 0.022825	valid_0's l2: 0.00109775
[2540]	valid_0's l1: 0.0228243	valid_0's l2: 0.00109771
[2541]	valid_0's l1: 0.0228239	valid_0's l2: 0.00109769
[2542]	valid_0's l1: 0.022824	valid_0's l2: 0.00109769
[2543]	valid_0's l1: 0.0228237	valid_0's l2: 0.0010977
[2544]	valid_0's l1: 0.0228235	valid_0's l2: 0.00109768
[2545]	valid_0's l1: 0.0228236	valid_0's l2: 0.00109

[2676]	valid_0's l1: 0.022782	valid_0's l2: 0.00109411
[2677]	valid_0's l1: 0.0227812	valid_0's l2: 0.00109405
[2678]	valid_0's l1: 0.0227811	valid_0's l2: 0.00109404
[2679]	valid_0's l1: 0.0227806	valid_0's l2: 0.00109401
[2680]	valid_0's l1: 0.0227804	valid_0's l2: 0.00109395
[2681]	valid_0's l1: 0.0227797	valid_0's l2: 0.00109391
[2682]	valid_0's l1: 0.0227792	valid_0's l2: 0.00109389
[2683]	valid_0's l1: 0.0227766	valid_0's l2: 0.00109371
[2684]	valid_0's l1: 0.0227765	valid_0's l2: 0.00109371
[2685]	valid_0's l1: 0.0227763	valid_0's l2: 0.00109366
[2686]	valid_0's l1: 0.0227761	valid_0's l2: 0.00109365
[2687]	valid_0's l1: 0.0227759	valid_0's l2: 0.00109362
[2688]	valid_0's l1: 0.0227755	valid_0's l2: 0.00109357
[2689]	valid_0's l1: 0.0227751	valid_0's l2: 0.00109354
[2690]	valid_0's l1: 0.0227752	valid_0's l2: 0.00109357
[2691]	valid_0's l1: 0.022775	valid_0's l2: 0.00109355
[2692]	valid_0's l1: 0.0227745	valid_0's l2: 0.00109352
[2693]	valid_0's l1: 0.0227742	valid_0's l2: 0.001

[2823]	valid_0's l1: 0.0227317	valid_0's l2: 0.0010904
[2824]	valid_0's l1: 0.0227314	valid_0's l2: 0.00109036
[2825]	valid_0's l1: 0.0227314	valid_0's l2: 0.0010903
[2826]	valid_0's l1: 0.0227312	valid_0's l2: 0.0010903
[2827]	valid_0's l1: 0.0227312	valid_0's l2: 0.00109028
[2828]	valid_0's l1: 0.022731	valid_0's l2: 0.00109028
[2829]	valid_0's l1: 0.0227309	valid_0's l2: 0.00109027
[2830]	valid_0's l1: 0.0227299	valid_0's l2: 0.00109019
[2831]	valid_0's l1: 0.0227299	valid_0's l2: 0.00109019
[2832]	valid_0's l1: 0.0227299	valid_0's l2: 0.00109019
[2833]	valid_0's l1: 0.02273	valid_0's l2: 0.00109021
[2834]	valid_0's l1: 0.0227293	valid_0's l2: 0.00109016
[2835]	valid_0's l1: 0.0227288	valid_0's l2: 0.00109014
[2836]	valid_0's l1: 0.0227281	valid_0's l2: 0.00109009
[2837]	valid_0's l1: 0.0227282	valid_0's l2: 0.0010901
[2838]	valid_0's l1: 0.0227281	valid_0's l2: 0.00109009
[2839]	valid_0's l1: 0.0227278	valid_0's l2: 0.00109007
[2840]	valid_0's l1: 0.0227277	valid_0's l2: 0.00109005

[2971]	valid_0's l1: 0.0226929	valid_0's l2: 0.00108702
[2972]	valid_0's l1: 0.0226927	valid_0's l2: 0.00108702
[2973]	valid_0's l1: 0.0226926	valid_0's l2: 0.00108701
[2974]	valid_0's l1: 0.0226925	valid_0's l2: 0.00108701
[2975]	valid_0's l1: 0.0226895	valid_0's l2: 0.00108678
[2976]	valid_0's l1: 0.0226894	valid_0's l2: 0.00108677
[2977]	valid_0's l1: 0.0226892	valid_0's l2: 0.00108676
[2978]	valid_0's l1: 0.022687	valid_0's l2: 0.00108661
[2979]	valid_0's l1: 0.0226871	valid_0's l2: 0.00108661
[2980]	valid_0's l1: 0.0226869	valid_0's l2: 0.00108659
[2981]	valid_0's l1: 0.0226867	valid_0's l2: 0.00108656
[2982]	valid_0's l1: 0.0226861	valid_0's l2: 0.00108647
[2983]	valid_0's l1: 0.0226858	valid_0's l2: 0.00108645
[2984]	valid_0's l1: 0.0226858	valid_0's l2: 0.00108643
[2985]	valid_0's l1: 0.0226853	valid_0's l2: 0.0010864
[2986]	valid_0's l1: 0.0226851	valid_0's l2: 0.00108637
[2987]	valid_0's l1: 0.0226846	valid_0's l2: 0.00108632
[2988]	valid_0's l1: 0.0226843	valid_0's l2: 0.001

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.34,
       learning_rate=0.03, max_depth=12, min_child_samples=110,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=3000,
       n_jobs=8, num_leaves=100, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1,
       subsample_for_bin=200000, subsample_freq=0, verbose=1)

In [46]:
# One row per model input column with the importance reported by the fitted
# model, displayed from most to least important.
feature_importances_df = pd.DataFrame(
    {'fearture': fearture_columns, 'feature_importances': gbm.feature_importances_},
    columns=['fearture', 'feature_importances'],
)

feature_importances_df.sort_values(by='feature_importances', ascending=False)



Unnamed: 0,fearture,feature_importances
14,series_code_index,24432
10,province_name,14575
13,brand_code_index,9843
5,model_year,9238
0,months,7667
9,year_err,7351
17,keep_value,6831
1,mileage_log,6778
4,new_car_price,6768
16,color_index,6685


In [47]:
# ## Save the model
# `joblib` and `OSSHook` are already imported in the notebook's first cell, so
# they are not re-imported here.
import os  # NOTE(review): not used in this cell; kept in case a later cell relies on it

# Single definition of the artifact name, so the file that is dumped and the
# file that is uploaded cannot drift apart.
model_file_name = 'biaozhun_model_车况一般_2021-02-02_B.pkl'
ph = curr_dir + model_file_name

# Serialise the fitted model to the working directory.
joblib.dump(gbm, ph)

# Upload that file to OSS under the valuation-model prefix.
oss = OSSHook("oss_algorithm")
oss.put_file("algorithm/qiongjiu/valuation/全网数据-优化模型/" + model_file_name, ph)


[2021-02-19 17:59:30,717] {oss_hook:28} INFO - Done. Loaded the key algorithm/qiongjiu/valuation/全网数据-优化模型/biaozhun_model_车况一般_2021-02-02_B.pkl .


In [48]:
# Predict on the test set and derive the relative error against 'residual':
#   err = |ypred - residual| / residual
test_data['ypred'] = gbm.predict(test_data[fearture_columns])
test_data['err'] = (
    test_data['ypred']
    .sub(test_data['residual'])
    .abs()
    .div(test_data['residual'])
)

# The same computation for the training set is currently disabled:
# train_data['ypred'] = gbm.predict(train_data[fearture_columns])
# train_data['err'] = abs(train_data['ypred'] - train_data['residual']) / train_data['residual']


In [49]:
def data3m_pinggu(data_df, col):
    """Print the share of rows whose ``col`` value lies inside fixed tolerance bands.

    For each band (3%, 5%, 8%, 10%, 20%) one line is printed in the form
    ``P<3%:  0.4679``: the fraction of rows with ``-bound <= value <= bound``,
    rounded to 4 decimals.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the column to evaluate.
    col : str
        Name of the column with the (relative) error values.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    * The denominator is the total row count, so rows whose value is NaN
      count as misses (NaN never satisfies the comparisons).
    * An empty frame prints ``nan`` for every band instead of raising
      ``ZeroDivisionError``.
    """
    total = data_df.shape[0]
    values = data_df[col]
    bands = (("3%", 0.03), ("5%", 0.05), ("8%", 0.08), ("10%", 0.1), ("20%", 0.2))
    for label, bound in bands:
        # Count rows inside the symmetric band [-bound, bound].
        within = int(((values >= -bound) & (values <= bound)).sum())
        share = round(within / total, 4) if total else float("nan")
        print("P<" + label + ": ", share)
    
# Print a "test set" banner, then the tolerance-band hit rates of test_data['err'].
print("测试集。。。。")
data3m_pinggu(test_data,'err')


训练集。。。。
测试集。。。。
P<3%:  0.4679
P<5%:  0.6781
P<8%:  0.8439
P<10%:  0.8985
P<20%:  0.9788


In [50]:
# For both splits: store the model prediction, then scale it by the new-car
# price to obtain the predicted price column.
for frame in (train_data, test_data):
    frame['ypred'] = gbm.predict(frame[fearture_columns])
    frame['ypred_price'] = frame['ypred'] * frame['new_car_price']


In [51]:
# Hit rates on the complete test set.
data3m_pinggu(test_data, 'err')

# Keep only rows where none of the three delete flags equals 1, report how
# many rows remain, then repeat the evaluation on that subset.
flag_columns = ['delete_flag', 'delete_flag_1', 'delete_flag_2']
keep_mask = (test_data[flag_columns] != 1).all(axis=1)
test_data_kept = test_data.loc[keep_mask]
print(test_data_kept.shape)
data3m_pinggu(test_data_kept, 'err')


P<3%:  0.4679
P<5%:  0.6781
P<8%:  0.8439
P<10%:  0.8985
P<20%:  0.9788
(40383, 307)
P<3%:  0.5122
P<5%:  0.7151
P<8%:  0.8625
P<10%:  0.9083
P<20%:  0.9787


In [59]:
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
import lightgbm as lgb

# Hyper-parameters of the LightGBM regressor wrapped by the PMML pipeline.
lgbm_params = dict(
    boosting_type='gbdt',
    objective='regression',
    learning_rate=0.03,
    n_estimators=3000,
    max_depth=12,
    num_leaves=100,
    subsample=1,
    colsample_bytree=0.34,
    min_child_samples=110,
    n_jobs=8,
    verbose=1,
)

# Single-step pipeline so the fitted regressor can be exported with sklearn2pmml.
pipeline = PMMLPipeline([("regressor", lgb.LGBMRegressor(**lgbm_params))])

# Optional fit arguments that are currently disabled:
#   verbose=True
#   categorical_feature=['province_name','model_year', 'publish_time_year','quality','brand_code_index','series_code_index','transfer_times_index','color_index']
# Last expression of the cell: fit on the training features/label and display the pipeline.
pipeline.fit(train_data[fearture_columns], train_data[label])



PMMLPipeline(steps=[('regressor', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.34,
       learning_rate=0.03, max_depth=12, min_child_samples=110,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=3000,
       n_jobs=8, num_leaves=100, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1,
       subsample_for_bin=200000, subsample_freq=0, verbose=1))])

In [61]:
# Predict with the PMML pipeline (missing feature values replaced by 0) and
# recompute the relative error: err = |ypred - residual| / residual.
# NOTE(review): pipeline.fit was called on train_data[fearture_columns] without
# fillna(0); confirm that filling only at prediction time is intended.
test_features = test_data[fearture_columns].fillna(0)
test_data['ypred'] = pipeline.predict(test_features)
test_data['err'] = (
    test_data['ypred']
    .sub(test_data['residual'])
    .abs()
    .div(test_data['residual'])
)
def data3m_pinggu(data_df, col):
    """Print the share of rows whose ``col`` value lies inside fixed tolerance bands.

    For each band (3%, 5%, 8%, 10%, 20%) one line is printed in the form
    ``P<3%:  0.4679``: the fraction of rows with ``-bound <= value <= bound``,
    rounded to 4 decimals.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the column to evaluate.
    col : str
        Name of the column with the (relative) error values.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    * The denominator is the total row count, so rows whose value is NaN
      count as misses (NaN never satisfies the comparisons).
    * An empty frame prints ``nan`` for every band instead of raising
      ``ZeroDivisionError``.
    """
    total = data_df.shape[0]
    values = data_df[col]
    bands = (("3%", 0.03), ("5%", 0.05), ("8%", 0.08), ("10%", 0.1), ("20%", 0.2))
    for label, bound in bands:
        # Count rows inside the symmetric band [-bound, bound].
        within = int(((values >= -bound) & (values <= bound)).sum())
        share = round(within / total, 4) if total else float("nan")
        print("P<" + label + ": ", share)
# Hit rates of the pipeline predictions on the complete test set.
data3m_pinggu(test_data, 'err')

# Subset where none of the three delete flags equals 1: print its shape,
# then evaluate it the same way.
unflagged = (
    (test_data['delete_flag'] != 1)
    & (test_data['delete_flag_1'] != 1)
    & (test_data['delete_flag_2'] != 1)
)
test_data_unflagged = test_data.loc[unflagged]
print(test_data_unflagged.shape)
data3m_pinggu(test_data_unflagged, 'err')


P<3%:  0.4575
P<5%:  0.6685
P<8%:  0.8375
P<10%:  0.8934
P<20%:  0.9774
(40194, 307)
P<3%:  0.5057
P<5%:  0.7105
P<8%:  0.8574
P<10%:  0.9052
P<20%:  0.9778


In [109]:
# Export the fitted pipeline to a PMML file and upload that file to OSS.
from dayu.hooks.oss_hook import OSSHook

# The file name is defined once so the local path and the OSS key always
# refer to the same artifact.
pmml_file_name = "enterprise2_BBB_level_dfc_retail_quan.pmml"
ph = curr_dir + pmml_file_name
sklearn2pmml(pipeline, ph, with_repr = True)

# The OSS key is placed under a folder named after curr_date.
oss = OSSHook("oss_algorithm")
oss.put_file("algorithm/qiongjiu/valuation/全网数据-优化模型/" + curr_date + "/" + pmml_file_name, ph)


[2021-02-04 11:21:38,948] {oss_hook:28} INFO - Done. Loaded the key algorithm/qiongjiu/valuation/全网数据-优化模型/2021-02-03/enterprise2_BBB_level_dfc_retail_quan_2021.pmml .


In [64]:
# Dump the scored test set next to the other artifacts.
test_data.to_csv(curr_dir + "enterprise2_B_test.csv")

# Median 'err' per model_code, exposed as column 'median_res'.
by_model = (
    test_data[['model_code', 'err']]
    .groupby("model_code")
    .median()
    .reset_index()
    .rename(columns={"err": "median_res"})
)

# Write the per-model medians to an Excel sheet.
ph = curr_dir + "enterprise2_BBB_level_retail_median_res.xlsx"
by_model.to_excel(ph)
