In [4]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore')
import logging
import datetime

import csv, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from dayu.hooks.oss_hook import OSSHook
from dayu.hooks.hive_server_hook import HiveServerHook
from dayu.hooks.hive_cli_hook import HiveCliHook

def split_table_name(datain):
    """Strip the ``table.`` prefix from the column labels of a DataFrame.

    Every label is handled on its own: the text after the last ``.`` is kept
    and labels without a ``.`` (or non-string labels) are left as they are.
    This means a frame that mixes prefixed and un-prefixed columns is still
    cleaned, and a label with several dots (``db.table.col``) no longer
    raises ``ValueError``.

    Parameters
    ----------
    datain : pandas.DataFrame
        Frame whose column labels may look like ``"t1.car_id"``.

    Returns
    -------
    pandas.DataFrame
        ``datain`` itself when no label needs renaming, otherwise the renamed
        frame returned by ``DataFrame.rename``. Two labels that share the same
        trailing part end up with the same name, as before.
    """
    renamed = {
        column: column.rsplit('.', 1)[-1]
        for column in datain.columns
        if isinstance(column, str) and '.' in column
    }
    if not renamed:
        # Nothing carries a prefix: hand back the very same object.
        return datain
    return datain.rename(columns=renamed)

def read_from_hive2(output_file_name,insql,dtype):
    """Export a Hive query to a CSV file and load that file with pandas.

    Parameters
    ----------
    output_file_name : str
        File name appended to the module-level ``curr_dir`` to form the CSV path.
    insql : str
        Query text handed to ``HiveServerHook.to_csv``.
    dtype : dict
        Passed straight through to ``pd.read_csv`` as its ``dtype`` argument.

    Returns
    -------
    pandas.DataFrame
        The CSV content, with the header row used as column labels. Labels are
        returned exactly as exported; table prefixes are not stripped here.
    """
    csv_path = curr_dir + output_file_name
    HiveServerHook("warehouse_hive").to_csv(
        insql,
        csv_path,
        delimiter=',',
        lineterminator='\n',
        output_header=True,
    )
    return pd.read_csv(csv_path, header=0, dtype=dtype)

## Compute the difference between two timestamps
def date_time_sub(startTime,endTime,date_format):
    """Return the number of days between two formatted timestamps.

    Parameters
    ----------
    startTime, endTime : str
        Timestamps written according to ``date_format``.
    date_format : str
        ``strptime`` format shared by both timestamps.

    Returns
    -------
    int or None
        ``(endTime - startTime).days`` (negative when ``endTime`` is earlier),
        or ``None`` when either value cannot be parsed, e.g. a missing value
        or a string that does not match ``date_format``.
    """
    try:
        start = datetime.datetime.strptime(startTime, date_format)
        end = datetime.datetime.strptime(endTime, date_format)
    except (TypeError, ValueError):
        # Best-effort by design: unparseable input yields None instead of
        # aborting the caller. Only parsing errors are absorbed, so a
        # KeyboardInterrupt still stops a long-running loop.
        return None
    return (end - start).days

    # 计算车辆当前保值率
def computer_with_license_month(tar):
    """Interpolate a value from the yearly ``year_N`` entries of ``tar``.

    Parameters
    ----------
    tar : mapping (e.g. a dict or a DataFrame row)
        Must provide ``'license_month'`` and the ``'year_<n>'`` entries that
        are looked up, including ``'year_16'`` which serves as the fallback.

    Returns
    -------
    number
        * ``tar['year_1']`` when ``license_month`` is at most 12;
        * otherwise a linear interpolation between ``year_<y>`` and
          ``year_<y+1>`` (``y = license_month // 12``), rounded to 4 decimals;
        * ``tar['year_16']`` when anything in the lookup or arithmetic fails
          (missing ``year_<n>`` key, ``license_month`` that is missing or not
          a number, ...).
    """
    try:
        license_month = tar['license_month']
        if license_month <= 12:
            return tar['year_1']

        year = license_month // 12
        # Value at the start of the current year ...
        keep_max = tar["year_" + str(int(year))]
        # ... and at the start of the next one.
        keep_min = tar["year_" + str(int(year + 1))]

        # Months elapsed since the last whole year, and the per-month step.
        mon = license_month - 12 * year
        tem = (keep_max - keep_min) / 12

        return round(keep_max - tem * mon, 4)
    except Exception:
        # Deliberate catch-all fallback, but limited to Exception so that
        # KeyboardInterrupt / SystemExit are not swallowed when this runs
        # row by row over a large frame.
        return tar["year_16"]

class Logger:
    """Thin wrapper that writes INFO messages of a named logger to a file.

    ``logging.getLogger(name)`` always returns the same object for the same
    name, so creating this wrapper repeatedly (for instance by re-running a
    notebook cell) used to stack one more ``FileHandler`` on it each time and
    every message was then written several times. A handler for a given file
    is now attached only once.
    """

    def __init__(self, logName, logFile):
        """Bind to the logger ``logName`` and make it write to ``logFile``.

        Parameters
        ----------
        logName : str
            Name passed to ``logging.getLogger``.
        logFile : str
            Path of the log file; its directory must already exist.
        """
        import os  # local import: needed only to normalise the path for comparison

        self._logger = logging.getLogger(logName)

        # FileHandler stores its target as an absolute path in ``baseFilename``.
        log_path = os.path.abspath(logFile)
        already_attached = any(
            isinstance(existing, logging.FileHandler) and existing.baseFilename == log_path
            for existing in self._logger.handlers
        )
        if not already_attached:
            handler = logging.FileHandler(logFile)
            handler.setFormatter(logging.Formatter('%(asctime)s ********* %(message)s'))
            self._logger.addHandler(handler)

        self._logger.setLevel(logging.INFO)

    def log(self, msg):
        """Emit ``msg`` at INFO level."""
        if self._logger is not None:
            self._logger.info(msg)


# ---- Notebook-wide configuration ----
pd.set_option('display.max_columns', 500)

# Working directory for every CSV read or written below.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
curr_dir = '/home/souche/qiongjiu/hgc/'

# Today's date as 'YYYY-MM-DD'; it is spliced into the `ds` partition filters
# of the Hive queries.
curr_date = datetime.datetime.now().date().isoformat()
hive_cli = HiveCliHook("warehouse_hive")

logger = Logger('model_service','./log/accurate_valuation_cyp_run_log.log')
logger.log("程序启动.............")


[2021-01-08 11:19:59,911] {<ipython-input-4-5aad934e0497>:84} INFO - 程序启动.............


In [3]:
# Crawled listings from the `ds` partition two days before `curr_date`:
# a few sites are excluded, second_price/10000 must lie in [0.3, 300),
# mileage/10000 in (0, 100) and the publish year must be 2019 or 2020.
# Only the most recently published row is kept for each
# (model_code, mileage, license_time, a_province_name) combination.
sql_info ="""
    select t1.car_id,
           t1.area,
           t1.source_model_name,
           t1.brand_name,
           t1.brand_code,
           t1.series_name,
           t1.series_code,  
           t1.model_name,
           t1.model_code,
           t1.model_year,
           t1.real_pay_amount,
           t1.emission,
           t1.color,
           t1.mileage,
           t1.use_property,
           t1.license_time,
           t1.publish_time,
           t1.a_province_name,
           t1.a_city_name,
           t1.source_publish_time,
           t1.sale_time,
           t1.license_time_year,
           t1.publish_time_year,
           t1.site,
           t1.transfer_times 
    from (select car_id,
                 area,
                 source_model_name,
                 brand_name,
                 brand_code,
                 series_name,
                 series_code,  
                 model_name,
                 model_code,
                 model_year,
                 second_price real_pay_amount,
                 emission,
                 color,
                 mileage,
                 use_property,
                 license_time,
                 publish_time,
                 a_province_name,
                 a_city_name,
                 source_publish_time,
                 sale_time,
                 substr(license_time,0,4) license_time_year,
                 substr(publish_time,0,4) publish_time_year,
                 site ,
                 transfer_times,
                 row_number() over(partition by model_code,mileage,license_time,a_province_name order by publish_time desc) num 
         from  dl_outrs.dl_vehicle_data_scavenger_ipbl_t_reptile_car_dd
         where site not in ('51','renrenche','taoche','yxpai','baixing') 
                and ds =date_sub('"""+curr_date+"""',2) 
                and second_price/10000 >= 0.3    
                and second_price/10000 < 300 
                and mileage/10000 > 0     
                and mileage/10000 < 100 
                and substr(publish_time,0,4) in ('2020','2019')
         ) t1 
    where t1.num = 1
"""
dtype={'city_code':str}

dl_site_ts_order_clean = read_from_hive2('dl_site_ts_order_clean',sql_info,dtype)

# Exported labels may look like "table.column": keep the second dot-separated
# part when there is one, otherwise keep the label unchanged.
db_columns = [
    label.split('.')[1] if len(label.split('.')) > 1 else label
    for label in dl_site_ts_order_clean.columns
]
dl_site_ts_order_clean.columns = db_columns
logger.log("读取数据完成.............")



[2021-01-08 11:04:12,219] {hiveserver2:138} INFO - Using database default as default
[2021-01-08 11:04:12,370] {hive_server_hook:112} INFO - Running query: 
    select t1.car_id,
           t1.area,
           t1.source_model_name,
           t1.brand_name,
           t1.brand_code,
           t1.series_name,
           t1.series_code,  
           t1.model_name,
           t1.model_code,
           t1.model_year,
           t1.real_pay_amount,
           t1.emission,
           t1.color,
           t1.mileage,
           t1.use_property,
           t1.license_time,
           t1.publish_time,
           t1.a_province_name,
           t1.a_city_name,
           t1.source_publish_time,
           t1.sale_time,
           t1.license_time_year,
           t1.publish_time_year,
           t1.site,
           t1.transfer_times 
    from (select car_id,
                 area,
                 source_model_name,
                 brand_name,
                 brand_code,
                 series

[2021-01-08 11:08:00,352] {hive_server_hook:162} INFO - Written 720000 rows so far.
[2021-01-08 11:08:01,343] {hive_server_hook:162} INFO - Written 730000 rows so far.
[2021-01-08 11:08:02,427] {hive_server_hook:162} INFO - Written 740000 rows so far.
[2021-01-08 11:08:03,499] {hive_server_hook:162} INFO - Written 750000 rows so far.
[2021-01-08 11:08:04,575] {hive_server_hook:162} INFO - Written 760000 rows so far.
[2021-01-08 11:08:05,584] {hive_server_hook:162} INFO - Written 770000 rows so far.
[2021-01-08 11:08:06,621] {hive_server_hook:162} INFO - Written 780000 rows so far.
[2021-01-08 11:08:07,639] {hive_server_hook:162} INFO - Written 790000 rows so far.
[2021-01-08 11:08:08,659] {hive_server_hook:162} INFO - Written 800000 rows so far.
[2021-01-08 11:08:09,677] {hive_server_hook:162} INFO - Written 810000 rows so far.
[2021-01-08 11:08:10,633] {hive_server_hook:162} INFO - Written 820000 rows so far.
[2021-01-08 11:08:11,679] {hive_server_hook:162} INFO - Written 830000 rows 

[2021-01-08 11:09:38,139] {hive_server_hook:162} INFO - Written 1690000 rows so far.
[2021-01-08 11:09:39,181] {hive_server_hook:162} INFO - Written 1700000 rows so far.
[2021-01-08 11:09:40,090] {hive_server_hook:162} INFO - Written 1710000 rows so far.
[2021-01-08 11:09:41,066] {hive_server_hook:162} INFO - Written 1720000 rows so far.
[2021-01-08 11:09:42,045] {hive_server_hook:162} INFO - Written 1730000 rows so far.
[2021-01-08 11:09:43,038] {hive_server_hook:162} INFO - Written 1740000 rows so far.
[2021-01-08 11:09:43,981] {hive_server_hook:162} INFO - Written 1750000 rows so far.
[2021-01-08 11:09:45,078] {hive_server_hook:162} INFO - Written 1760000 rows so far.
[2021-01-08 11:09:46,188] {hive_server_hook:162} INFO - Written 1770000 rows so far.
[2021-01-08 11:09:47,276] {hive_server_hook:162} INFO - Written 1780000 rows so far.
[2021-01-08 11:09:48,352] {hive_server_hook:162} INFO - Written 1790000 rows so far.
[2021-01-08 11:09:49,384] {hive_server_hook:162} INFO - Written 1

[2021-01-08 11:11:12,341] {hive_server_hook:162} INFO - Written 2660000 rows so far.
[2021-01-08 11:11:13,297] {hive_server_hook:162} INFO - Written 2670000 rows so far.
[2021-01-08 11:11:14,271] {hive_server_hook:162} INFO - Written 2680000 rows so far.
[2021-01-08 11:11:15,208] {hive_server_hook:162} INFO - Written 2690000 rows so far.
[2021-01-08 11:11:16,170] {hive_server_hook:162} INFO - Written 2700000 rows so far.
[2021-01-08 11:11:17,125] {hive_server_hook:162} INFO - Written 2710000 rows so far.
[2021-01-08 11:11:18,086] {hive_server_hook:162} INFO - Written 2720000 rows so far.
[2021-01-08 11:11:19,046] {hive_server_hook:162} INFO - Written 2730000 rows so far.
[2021-01-08 11:11:19,932] {hive_server_hook:162} INFO - Written 2740000 rows so far.
[2021-01-08 11:11:20,905] {hive_server_hook:162} INFO - Written 2750000 rows so far.
[2021-01-08 11:11:21,878] {hive_server_hook:162} INFO - Written 2760000 rows so far.
[2021-01-08 11:11:22,872] {hive_server_hook:162} INFO - Written 2

[2021-01-08 11:12:53,345] {hive_server_hook:162} INFO - Written 3630000 rows so far.
[2021-01-08 11:12:54,321] {hive_server_hook:162} INFO - Written 3640000 rows so far.
[2021-01-08 11:12:55,281] {hive_server_hook:162} INFO - Written 3650000 rows so far.
[2021-01-08 11:12:56,299] {hive_server_hook:162} INFO - Written 3660000 rows so far.
[2021-01-08 11:12:57,197] {hive_server_hook:162} INFO - Written 3670000 rows so far.
[2021-01-08 11:12:58,152] {hive_server_hook:162} INFO - Written 3680000 rows so far.
[2021-01-08 11:12:59,121] {hive_server_hook:162} INFO - Written 3690000 rows so far.
[2021-01-08 11:13:00,087] {hive_server_hook:162} INFO - Written 3700000 rows so far.
[2021-01-08 11:13:01,099] {hive_server_hook:162} INFO - Written 3710000 rows so far.
[2021-01-08 11:13:02,075] {hive_server_hook:162} INFO - Written 3720000 rows so far.
[2021-01-08 11:13:03,051] {hive_server_hook:162} INFO - Written 3730000 rows so far.
[2021-01-08 11:13:04,046] {hive_server_hook:162} INFO - Written 3

KeyboardInterrupt: 

In [None]:
# Remove an existing 'flag' column before the flag files are merged in.
if 'flag' in dl_site_ts_order_clean.columns:
    del dl_site_ts_order_clean['flag']

# Stack the 2019 and 2020 flag files and attach them to the listings by car_id
# (left join: every listing row is kept).
flag_file_names = ("dl_site_ts_order_clean_2019_yh01.csv",
                   "dl_site_ts_order_clean_2020_yh01.csv")
flag_frames = [pd.read_csv(curr_dir + file_name) for file_name in flag_file_names]
dl_site_ts_order_clean_flag = pd.concat(flag_frames, axis=0)
dl_site_ts_order_clean = dl_site_ts_order_clean.merge(
    dl_site_ts_order_clean_flag,
    on='car_id',
    how='left',
)


In [None]:
# After the merge both frames contributed a price column: discard the left one
# (suffix _x) and promote the one from the flag files (suffix _y).
dl_site_ts_order_clean = (
    dl_site_ts_order_clean
    .drop('real_pay_amount_x', axis=1)
    .rename(columns={'real_pay_amount_y':'real_pay_amount'})
)

# Persist the result; the DataFrame index is written as the first CSV column.
dl_site_ts_order_clean.to_csv(curr_dir+"dl_site_ts_order_clean_yh01.csv")


In [None]:
# Purchase orders with flag = 0, a pay_time no later than 2021-01-01 and a
# known province and city. Columns are aliased to the layout of the crawler
# query; fields this table does not provide are filled with ''.
sql_info ="""
SELECT car_id,
       '' area,
       '' source_model_name,
       brand_name,
       brand_code,
       series_name,
       series_code,  
       model_name,
       model_code,
       '' model_year,
       cast(purchase_price/10000 AS DECIMAL(10,2)) real_pay_amount,
       '' emission,
       color_name color,
       mileage,
       '' use_property,
       first_license_date license_time,
       pay_time publish_time,
       province_name a_province_name,
       city_name a_city_name,
       '' source_publish_time,
       '' sale_time,
       substr(first_license_date,0,4) license_time_year,
       substr(pay_time,0,4) publish_time_year,
       '' site,
       transform_num transfer_times 
FROM db_data.mid_car_dfc_purchase_order_flag_b 
WHERE flag = 0 
and pay_time is not null
 -- 确认选取建模时间
and pay_time<="2021-01-01"
and province_name is not null
and city_name is not null
"""
dtype={'city_code':str}

dl_site_ts_order_clean = read_from_hive2('dl_site_ts_order_clean_dfc',sql_info,dtype)

# Exported labels may look like "table.column": keep the second dot-separated
# part when there is one, otherwise keep the label unchanged.
db_columns = [
    label.split('.')[1] if len(label.split('.')) > 1 else label
    for label in dl_site_ts_order_clean.columns
]
dl_site_ts_order_clean.columns = db_columns
logger.log("读取数据完成.............")



In [None]:
## Remove peaks: flag and clamp extreme prices inside each
## (model_code, license_time_year, publish_time_year) group
stat_names = ['count', 'mean', 'std','median']
group_keys = ['model_code','license_time_year','publish_time_year']

# Drop statistic columns that are already present so the merges below add
# them afresh.
for col in stat_names:
    if col in dl_site_ts_order_clean.columns:
        del dl_site_ts_order_clean[col]

# One groupby object feeds the four per-group price statistics.
grouped_prices = dl_site_ts_order_clean[group_keys + ['real_pay_amount']].groupby(group_keys)

dl_site_ts_order_clean_count = (
    grouped_prices.count().reset_index().rename(columns={'real_pay_amount':'count'})
)
dl_site_ts_order_clean_mean = (
    grouped_prices.mean().reset_index().rename(columns={'real_pay_amount':'mean'})
)
dl_site_ts_order_clean_std = (
    grouped_prices.std().reset_index().rename(columns={'real_pay_amount':'std'})
)
dl_site_ts_order_clean_median = (
    grouped_prices.median().reset_index().rename(columns={'real_pay_amount':'median'})
)

# Attach the statistics to every row, in the order count, mean, std, median.
for stat_frame in (dl_site_ts_order_clean_count,
                   dl_site_ts_order_clean_mean,
                   dl_site_ts_order_clean_std,
                   dl_site_ts_order_clean_median):
    dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,
                                      stat_frame,
                                      on=group_keys,
                                      how='left')

# Only groups with more than 5 priced rows are kept.
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['count'] >5]

# The group mean and median are never modified below, so they are read once.
group_mean = dl_site_ts_order_clean['mean']
group_median = dl_site_ts_order_clean['median']
lower_bound = group_mean * 0.3
upper_bound = group_mean * 1.3

# Cap the group std at 30% of the group mean.
dl_site_ts_order_clean.loc[lower_bound < dl_site_ts_order_clean['std'],'std'] = lower_bound

# A row is a low spike when its group median exceeds the mean and its price is
# under 30% of the mean; a high spike is the mirror case (median below mean,
# price over 130% of the mean). The two masks cannot overlap, so both are
# evaluated before any price is changed.
price = dl_site_ts_order_clean['real_pay_amount']
low_spike = (group_median > group_mean) & (price < lower_bound)
high_spike = (group_median < group_mean) & (price > upper_bound)

# Spikes are marked with delete_flag = 1 ...
dl_site_ts_order_clean.loc[low_spike,'delete_flag'] = 1
dl_site_ts_order_clean.loc[high_spike,'delete_flag'] = 1

# ... and their price is pulled back to the violated bound (rows are kept).
dl_site_ts_order_clean.loc[low_spike,'real_pay_amount'] = lower_bound
dl_site_ts_order_clean.loc[high_spike,'real_pay_amount'] = upper_bound


In [None]:
## Remove outliers: flag and clamp prices outside [30%, 130%] of the mean of
## their (model_code, license_time_year, publish_time_year) group
stat_names = ['count', 'mean', 'std','median']
group_keys = ['model_code','license_time_year','publish_time_year']

# Drop statistic columns that are already present so the merges below add
# them afresh.
for col in stat_names:
    if col in dl_site_ts_order_clean.columns:
        del dl_site_ts_order_clean[col]

# One groupby object feeds the four per-group price statistics.
grouped_prices = dl_site_ts_order_clean[group_keys + ['real_pay_amount']].groupby(group_keys)

dl_site_ts_order_clean_count = (
    grouped_prices.count().reset_index().rename(columns={'real_pay_amount':'count'})
)
dl_site_ts_order_clean_mean = (
    grouped_prices.mean().reset_index().rename(columns={'real_pay_amount':'mean'})
)
dl_site_ts_order_clean_std = (
    grouped_prices.std().reset_index().rename(columns={'real_pay_amount':'std'})
)
dl_site_ts_order_clean_median = (
    grouped_prices.median().reset_index().rename(columns={'real_pay_amount':'median'})
)

# Attach the statistics to every row, in the order count, mean, std, median.
for stat_frame in (dl_site_ts_order_clean_count,
                   dl_site_ts_order_clean_mean,
                   dl_site_ts_order_clean_std,
                   dl_site_ts_order_clean_median):
    dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,
                                      stat_frame,
                                      on=group_keys,
                                      how='left')

# Only groups with more than 5 priced rows are kept.
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[dl_site_ts_order_clean['count'] >5]

# The group mean is never modified below, so the bounds are computed once.
group_mean = dl_site_ts_order_clean['mean']
lower_bound = group_mean * 0.3
upper_bound = group_mean * 1.3

# Cap the group std at 30% of the group mean.
dl_site_ts_order_clean.loc[lower_bound < dl_site_ts_order_clean['std'],'std'] = lower_bound

# Mark every price outside the bounds with delete_flag1 = 1.
price = dl_site_ts_order_clean['real_pay_amount']
too_low = price < lower_bound
too_high = price > upper_bound
dl_site_ts_order_clean.loc[too_low,'delete_flag1'] = 1
dl_site_ts_order_clean.loc[too_high,'delete_flag1'] = 1

# Pull the flagged prices back to the violated bound (rows are kept). The
# upper-bound test is evaluated again after the floor has been applied, which
# is the order the statements have always run in.
dl_site_ts_order_clean.loc[too_low,'real_pay_amount'] = lower_bound
dl_site_ts_order_clean.loc[dl_site_ts_order_clean['real_pay_amount'] > upper_bound,
                           'real_pay_amount'] = upper_bound

dl_site_ts_order_clean.to_csv(curr_dir+'dl_site_ts_order_clean_dfc_purchase_0.csv')


In [None]:
# Sale orders with flag = 0, a trade_type other than 'B2B', a pay_time no
# later than 2021-01-01 and a known province and city. Columns are aliased to
# the layout of the crawler query; fields this table does not provide are
# filled with ''.
sql_info ="""
SELECT car_id,
       '' area,
       '' source_model_name,
       brand_name,
       brand_code,
       series_name,
       series_code,  
       model_name,
       model_code,
       '' model_year,
       real_pay_amount,
       '' emission,
       color_name color,
       mileage,
       '' use_property,
       first_license_date license_time,
       pay_time publish_time,
       province_name a_province_name,
       city_name a_city_name,
       '' source_publish_time,
       '' sale_time,
       substr(first_license_date,0,4) license_time_year,
       substr(pay_time,0,4) publish_time_year,
       '' site,
       transform_num transfer_times 
FROM db_data.mid_car_dfc_sale_order_flag_b  
WHERE flag = 0
and trade_type !='B2B' 
and pay_time is not null
and pay_time<="2021-01-01"
and province_name is not null
and city_name is not null 
"""
dtype={'city_code':str}

dl_site_ts_order_clean = read_from_hive2('dl_site_ts_order_clean_dfc',sql_info,dtype)

# Exported labels may look like "table.column": keep the second dot-separated
# part when there is one, otherwise keep the label unchanged.
db_columns = [
    label.split('.')[1] if len(label.split('.')) > 1 else label
    for label in dl_site_ts_order_clean.columns
]
dl_site_ts_order_clean.columns = db_columns
logger.log("读取数据完成.............")



In [None]:
## Clean the Dafengche (dfc) 2019 and 2020 data
get_columns = [ 'car_id', 'area', 'source_model_name', 'brand_name',
       'brand_code', 'series_name', 'series_code', 'model_name', 'model_code',
       'model_year', 'real_pay_amount', 'emission', 'color', 'mileage',
       'use_property', 'license_time', 'publish_time', 'a_province_name',
       'a_city_name', 'source_publish_time', 'sale_time', 'license_time_year',
       'publish_time_year', 'site', 'transfer_times']

# Reference prices: the file written earlier from the crawler listings.
dl_site_ts_order_clean_quan = pd.read_csv(curr_dir+'dl_site_ts_order_clean_yh01.csv')

# Rows missing either year key cannot be matched, so they are dropped from
# both the reference frame and the sale frame.
dl_site_ts_order_clean_quan = dl_site_ts_order_clean_quan.loc[~dl_site_ts_order_clean_quan['license_time_year'].isnull()]
dl_site_ts_order_clean_quan = dl_site_ts_order_clean_quan.loc[~dl_site_ts_order_clean_quan['publish_time_year'].isnull()]
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[~dl_site_ts_order_clean['license_time_year'].isnull()]
dl_site_ts_order_clean = dl_site_ts_order_clean.loc[~dl_site_ts_order_clean['publish_time_year'].isnull()]

# Cast the year keys to int on BOTH frames before anything is grouped or
# merged on them. The per-(model, publish year) count used to be built before
# the sale frame was cast, so its publish_time_year could have a different
# dtype from the key it was later merged on.
dl_site_ts_order_clean_quan['license_time_year'] = dl_site_ts_order_clean_quan['license_time_year'].map(int)
dl_site_ts_order_clean_quan['publish_time_year'] = dl_site_ts_order_clean_quan['publish_time_year'].map(int)
dl_site_ts_order_clean['license_time_year'] = dl_site_ts_order_clean['license_time_year'].map(int)
dl_site_ts_order_clean['publish_time_year'] = dl_site_ts_order_clean['publish_time_year'].map(int)

# Lowest and highest reference price per (model, licence year, publish year).
dl_site_ts_order_clean_min = dl_site_ts_order_clean_quan[['model_code','license_time_year','publish_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year','publish_time_year']).min().reset_index().rename(columns={'real_pay_amount':'real_pay_amount_min'})
dl_site_ts_order_clean_max = dl_site_ts_order_clean_quan[['model_code','license_time_year','publish_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year','publish_time_year']).max().reset_index().rename(columns={'real_pay_amount':'real_pay_amount_max'})

# Number of priced sale rows per (model, publish year).
dl_site_ts_order_clean_count = dl_site_ts_order_clean[['model_code','publish_time_year','real_pay_amount']].\
groupby(['model_code','publish_time_year']).count().reset_index().rename(columns={'real_pay_amount':'count'})

dl_site_ts_order_clean_max_min= pd.merge(dl_site_ts_order_clean_max,
        dl_site_ts_order_clean_min,
        on=['model_code','license_time_year','publish_time_year'],
        how='left')

dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,
         dl_site_ts_order_clean_max_min,
        on=['model_code','license_time_year','publish_time_year'],
        how='left')

dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,
         dl_site_ts_order_clean_count,
        on=['model_code','publish_time_year'],
        how='left')

# Scale the sale price by 1/10000 before comparing it with the reference range.
# NOTE(review): presumably converts yuan to units of 10,000 yuan; confirm
# that the reference file uses the same unit.
dl_site_ts_order_clean['real_pay_amount'] = dl_site_ts_order_clean['real_pay_amount']/10000

# Group 1: a reference range exists and the sale price falls inside it.
dl_site_ts_order_clean_dfc_1 = dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['real_pay_amount'] >= dl_site_ts_order_clean['real_pay_amount_min']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] <= dl_site_ts_order_clean['real_pay_amount_max'])
                          ]

# Group 0: no reference range, but more than 10 sale rows for that model and
# publish year.
dl_site_ts_order_clean_dfc_0 = dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['real_pay_amount_max'].isnull()) & 
                          (dl_site_ts_order_clean['count'] > 10)
                          ]

print(dl_site_ts_order_clean_dfc_1.shape,dl_site_ts_order_clean_dfc_0.shape)
dl_site_ts_order_clean = pd.concat([dl_site_ts_order_clean_dfc_1,dl_site_ts_order_clean_dfc_0],axis=0)
dl_site_ts_order_clean.to_csv(curr_dir+'dl_site_ts_order_clean_dfc_sales_0.csv')


In [None]:
### =======================================
##            生成特征编码数据
### =======================================

In [9]:
# Model / series / brand codes and names from the `ds` partition two days
# before `curr_date`.
sql_info ="""
SELECT model_code,series_code,brand_code,series_name,brand_name  
FROM db_data.ods_car_model_model  
WHERE ds = date_sub('"""+curr_date+"""',2) 
"""
dtype={'city_code':str}

# NOTE(review): the export file name below ends with a space.
model_info  = read_from_hive2('model_info ',sql_info,dtype)

# Exported labels may look like "table.column": keep the second dot-separated
# part when there is one, otherwise keep the label unchanged.
db_columns = [
    label.split('.')[1] if len(label.split('.')) > 1 else label
    for label in model_info.columns
]
model_info.columns = db_columns
logger.log("读取数据完成.............")


[2021-01-08 11:21:11,037] {hiveserver2:138} INFO - Using database default as default
[2021-01-08 11:21:14,399] {hive_server_hook:112} INFO - Running query: 
SELECT model_code,series_code,brand_code,series_name,brand_name  
FROM db_data.ods_car_model_model  
WHERE ds = date_sub('2021-01-08',2) 

[2021-01-08 11:21:14,869] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-01-08 11:21:15,160] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-01-08 11:21:15,426] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-01-08 11:21:15,688] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-01-08 11:21:15,933] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-01-08 11:21:16,189] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-01-08 11:21:16,449] {hive_server_hook:162} INFO - Written 70000 rows so far.
[2021-01-08 11:21:16,520] {hive_server_hook:162} INFO - Written 73032 rows so far.
[2021-01-08 11:21:16,524] {hiveserver2:2

In [10]:
# Integer encodings for the categorical features. Every dict maps a raw value
# to its position in the corresponding list. Each list has its own name, so
# `province_name_list` keeps holding the provinces instead of being
# overwritten by the colour and transfer-count lists.

# Province name -> code
province_name_list = ['山东','福建','河南','河北','重庆','湖南','湖北','海南','江西','黑龙江','天津','陕西','贵州','新疆','江苏',
                        '安徽','西藏','吉林','上海','甘肃','山西','宁夏','四川','广西','浙江','云南','内蒙古','辽宁','广东','青海',
                        '北京','香港','澳门','台湾']
a_province_name_dict = {name: code for code, name in enumerate(province_name_list)}

# Colour name -> code
color_name_list = ['多彩色','粉红色','冰川白','金色','香槟金','银色','橙色','绿色','咖啡色','黄色','紫色','灰色','棕色','深灰色','香槟色','蓝色',
                    '银灰色','红色','其他','黑色','白色']
color_dict = {name: code for code, name in enumerate(color_name_list)}

# Transfer count -> code (-1 is the first entry, so it is encoded as 0)
transfer_times_value_list = [-1,0,1,2,3,4,5]
transfer_times_dict = {times: code for code, times in enumerate(transfer_times_value_list)}

# Brand / series code -> code, in order of first appearance in model_info
brand_code_dict = {
    raw_code: code
    for code, raw_code in enumerate(list(model_info['brand_code'].unique()))
}

series_code_dict = {
    raw_code: code
    for code, raw_code in enumerate(list(model_info['series_code'].unique()))
}


In [11]:
# Flatten every encoding dict into [name, key, value] rows and write them to
# one Excel sheet. The feature order below is the row order in the file.
encoding_tables = [
    ('transfer_times', transfer_times_dict),
    ('color', color_dict),
    ('province_name', a_province_name_dict),
    ('brand_code', brand_code_dict),
    ('series_code', series_code_dict),
]

transfer_times_list = [
    [feature_name, raw_value, code]
    for feature_name, mapping in encoding_tables
    for raw_value, code in mapping.items()
]

transfer_times_df = pd.DataFrame(transfer_times_list,columns=['name','key','value'])
transfer_times_df.to_excel('./transfer_times_df_yh01.xlsx')
