In [1]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore')
import logging
import datetime

import csv, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from dayu.hooks.oss_hook import OSSHook
from dayu.hooks.hive_server_hook import HiveServerHook
from dayu.hooks.hive_cli_hook import HiveCliHook

def split_table_name(datain):
    new_cols = []
    for column in datain.columns:
        if(len(column.split('.'))<2):
            return datain
        tb_name, col_name = column.split('.')
        new_cols.append((column, col_name))
    datain = datain.rename(columns=dict(new_cols))
    return datain

def read_from_hive2(output_file_name,insql,dtype):
    filename = output_file_name
    filepath = curr_dir+filename
    hive = HiveServerHook("warehouse_hive")
    hive.to_csv(insql,filepath , delimiter=',',lineterminator='\n', output_header=True)
    outdata = pd.read_csv(filepath, header=0,dtype=dtype)
    # 去除列名中带有的表名
#     outdata = split_table_name(outdata)
    return outdata

## 计算时间差
def date_time_sub(startTime,endTime,date_format):
    try:
        startTime= datetime.datetime.strptime(startTime,date_format)
        endTime= datetime.datetime.strptime(endTime,date_format)
        return (endTime - startTime).days
    except:
        pass

    # 计算车辆当前保值率
def computer_with_license_month(tar):
    try:
        license_month = tar['license_month']
        if(license_month<=12):
            #tar['keep_value'] = tar['year_1']
            return tar['year_1']
        else:
            year = license_month//12
            #当前年保值率
            keep_max = tar["year_"+str(int(year))]
            #下一年的保值率
            keep_min = tar["year_"+str(int(year+1))]

            #相比于上一年，已经过了几个月
            mon = license_month-12*year
            tem = (keep_max-keep_min)/12

            #tar['keep_value'] = round(keep_max - tem*mon, 4)
            return round(keep_max - tem*mon, 4)
    except:
        return tar["year_16"]

class Logger:       
    def __init__(self, logName, logFile):
        self._logger = logging.getLogger(logName)
        handler = logging.FileHandler(logFile)
        formatter = logging.Formatter('%(asctime)s ********* %(message)s')
        handler.setFormatter(formatter)
        self._logger.addHandler(handler)
        self._logger.setLevel(logging.INFO)

    def log(self, msg):
        if self._logger is not None:
            self._logger.info(msg)


pd.set_option('display.max_columns', 500)
curr_dir = '/home/souche/qiongjiu/hgc/'

curr_date=str(datetime.datetime.now())[0:10]
hive_cli = HiveCliHook("warehouse_hive")

logger = Logger('model_service','./log/accurate_valuation_cyp_run_log.log')
logger.log("程序启动.............")
## 设定需要计算的年
computer_year = '2019'

DAYU_HOME : /home/souche/projects/datacenter-etl-v2
[2021-02-04 15:46:56,170] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt
[2021-02-04 15:46:56,197] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt
[2021-02-04 15:46:56,541] {<ipython-input-1-bac83ce33734>:84} INFO - 程序启动.............


In [3]:
sql_info ="""
select * 
from (
    SELECT
        car_id,
        brand_name,
        model_name,
        a_province_name,
        model_code,
        substr(publish_time,1,4)  pay_year,
        substr(license_time,1,4) license_year,
        second_price/10000 real_pay_amount,
        a_province_name,
        mileage,
        row_number() over(partition by model_code,mileage,license_time,a_province_name order by publish_time desc) num 
    FROM
        dl_outrs.dl_vehicle_data_scavenger_ipbl_t_reptile_car_dd
    WHERE
        ds = date_sub('"""+curr_date+"""',1) 
        and second_price/10000 >= 0.3    
        and second_price/10000 < 300 
        and mileage/10000 > 0     
        and mileage/10000 < 100 
        and site not in ('51','renrenche') 
        and substr(publish_time,0,4) in ('"""+computer_year+"""')
) t1
where t1.num = 1 
"""
dtype={'city_code':str}
dl_site_ts_order_clean = read_from_hive2('dl_site_ts_order_clean',sql_info,dtype)

db_columns = []
for col in dl_site_ts_order_clean.columns:
    if len(col.split('.')) > 1:
        db_columns.append(col.split('.')[1])

    else:
        db_columns.append(col)

dl_site_ts_order_clean.columns = db_columns 
logger.log("读取数据完成.............")



[2021-01-07 15:20:27,585] {hiveserver2:138} INFO - Using database default as default
[2021-01-07 15:20:27,725] {hive_server_hook:112} INFO - Running query: 
select * 
from (
    SELECT
        car_id,
        brand_name,
        model_name,
        a_province_name,
        model_code,
        substr(publish_time,1,4)  pay_year,
        substr(license_time,1,4) license_year,
        second_price/10000 real_pay_amount,
        a_province_name,
        mileage,
        row_number() over(partition by model_code,mileage,license_time,a_province_name order by publish_time desc) num 
    FROM
        dl_outrs.dl_vehicle_data_scavenger_ipbl_t_reptile_car_dd
    WHERE
        ds = date_sub('2021-01-07',1) 
        and second_price/10000 >= 0.3    
        and second_price/10000 < 300 
        and mileage/10000 > 0     
        and mileage/10000 < 100 
        and site not in ('51','renrenche') 
        and substr(publish_time,0,4) in ('2019')
) t1
where t1.num = 1 

[2021-01-07 15:21:33,337] {hi

[2021-01-07 15:22:09,833] {hive_server_hook:162} INFO - Written 880000 rows so far.
[2021-01-07 15:22:10,265] {hive_server_hook:162} INFO - Written 890000 rows so far.
[2021-01-07 15:22:10,669] {hive_server_hook:162} INFO - Written 900000 rows so far.
[2021-01-07 15:22:11,093] {hive_server_hook:162} INFO - Written 910000 rows so far.
[2021-01-07 15:22:11,518] {hive_server_hook:162} INFO - Written 920000 rows so far.
[2021-01-07 15:22:11,942] {hive_server_hook:162} INFO - Written 930000 rows so far.
[2021-01-07 15:22:12,338] {hive_server_hook:162} INFO - Written 940000 rows so far.
[2021-01-07 15:22:12,757] {hive_server_hook:162} INFO - Written 950000 rows so far.
[2021-01-07 15:22:13,182] {hive_server_hook:162} INFO - Written 960000 rows so far.
[2021-01-07 15:22:13,615] {hive_server_hook:162} INFO - Written 970000 rows so far.
[2021-01-07 15:22:14,048] {hive_server_hook:162} INFO - Written 980000 rows so far.
[2021-01-07 15:22:14,455] {hive_server_hook:162} INFO - Written 990000 rows 

[2021-01-07 15:22:50,998] {hive_server_hook:162} INFO - Written 1850000 rows so far.
[2021-01-07 15:22:51,421] {hive_server_hook:162} INFO - Written 1860000 rows so far.
[2021-01-07 15:22:51,908] {hive_server_hook:162} INFO - Written 1870000 rows so far.
[2021-01-07 15:22:52,327] {hive_server_hook:162} INFO - Written 1880000 rows so far.
[2021-01-07 15:22:52,764] {hive_server_hook:162} INFO - Written 1890000 rows so far.
[2021-01-07 15:22:53,200] {hive_server_hook:162} INFO - Written 1900000 rows so far.
[2021-01-07 15:22:53,648] {hive_server_hook:162} INFO - Written 1910000 rows so far.
[2021-01-07 15:22:54,058] {hive_server_hook:162} INFO - Written 1920000 rows so far.
[2021-01-07 15:22:54,491] {hive_server_hook:162} INFO - Written 1930000 rows so far.
[2021-01-07 15:22:54,930] {hive_server_hook:162} INFO - Written 1940000 rows so far.
[2021-01-07 15:22:55,368] {hive_server_hook:162} INFO - Written 1950000 rows so far.
[2021-01-07 15:22:55,810] {hive_server_hook:162} INFO - Written 1

[2021-01-07 15:23:32,821] {hive_server_hook:162} INFO - Written 2820000 rows so far.
[2021-01-07 15:23:33,258] {hive_server_hook:162} INFO - Written 2830000 rows so far.
[2021-01-07 15:23:33,693] {hive_server_hook:162} INFO - Written 2840000 rows so far.
[2021-01-07 15:23:34,131] {hive_server_hook:162} INFO - Written 2850000 rows so far.
[2021-01-07 15:23:34,538] {hive_server_hook:162} INFO - Written 2860000 rows so far.
[2021-01-07 15:23:34,975] {hive_server_hook:162} INFO - Written 2870000 rows so far.
[2021-01-07 15:23:35,410] {hive_server_hook:162} INFO - Written 2880000 rows so far.
[2021-01-07 15:23:35,844] {hive_server_hook:162} INFO - Written 2890000 rows so far.
[2021-01-07 15:23:36,292] {hive_server_hook:162} INFO - Written 2900000 rows so far.
[2021-01-07 15:23:36,698] {hive_server_hook:162} INFO - Written 2910000 rows so far.
[2021-01-07 15:23:37,146] {hive_server_hook:162} INFO - Written 2920000 rows so far.
[2021-01-07 15:23:37,581] {hive_server_hook:162} INFO - Written 2

[2021-01-07 15:24:14,988] {hive_server_hook:162} INFO - Written 3790000 rows so far.
[2021-01-07 15:24:15,425] {hive_server_hook:162} INFO - Written 3800000 rows so far.
[2021-01-07 15:24:15,885] {hive_server_hook:162} INFO - Written 3810000 rows so far.
[2021-01-07 15:24:16,346] {hive_server_hook:162} INFO - Written 3820000 rows so far.
[2021-01-07 15:24:16,806] {hive_server_hook:162} INFO - Written 3830000 rows so far.
[2021-01-07 15:24:17,251] {hive_server_hook:162} INFO - Written 3840000 rows so far.
[2021-01-07 15:24:17,716] {hive_server_hook:162} INFO - Written 3850000 rows so far.
[2021-01-07 15:24:18,181] {hive_server_hook:162} INFO - Written 3860000 rows so far.
[2021-01-07 15:24:18,645] {hive_server_hook:162} INFO - Written 3870000 rows so far.
[2021-01-07 15:24:19,115] {hive_server_hook:162} INFO - Written 3880000 rows so far.
[2021-01-07 15:24:19,555] {hive_server_hook:162} INFO - Written 3890000 rows so far.
[2021-01-07 15:24:20,014] {hive_server_hook:162} INFO - Written 3

[2021-01-07 15:24:58,964] {hive_server_hook:162} INFO - Written 4760000 rows so far.
[2021-01-07 15:24:59,423] {hive_server_hook:162} INFO - Written 4770000 rows so far.
[2021-01-07 15:24:59,849] {hive_server_hook:162} INFO - Written 4780000 rows so far.
[2021-01-07 15:25:00,312] {hive_server_hook:162} INFO - Written 4790000 rows so far.
[2021-01-07 15:25:00,791] {hive_server_hook:162} INFO - Written 4800000 rows so far.
[2021-01-07 15:25:01,250] {hive_server_hook:162} INFO - Written 4810000 rows so far.
[2021-01-07 15:25:01,710] {hive_server_hook:162} INFO - Written 4820000 rows so far.
[2021-01-07 15:25:02,142] {hive_server_hook:162} INFO - Written 4830000 rows so far.
[2021-01-07 15:25:02,591] {hive_server_hook:162} INFO - Written 4840000 rows so far.
[2021-01-07 15:25:03,051] {hive_server_hook:162} INFO - Written 4850000 rows so far.
[2021-01-07 15:25:03,509] {hive_server_hook:162} INFO - Written 4860000 rows so far.
[2021-01-07 15:25:03,935] {hive_server_hook:162} INFO - Written 4

[2021-01-07 15:25:40,402] {hive_server_hook:162} INFO - Written 5730000 rows so far.
[2021-01-07 15:25:40,820] {hive_server_hook:162} INFO - Written 5740000 rows so far.
[2021-01-07 15:25:41,236] {hive_server_hook:162} INFO - Written 5750000 rows so far.
[2021-01-07 15:25:41,632] {hive_server_hook:162} INFO - Written 5760000 rows so far.
[2021-01-07 15:25:42,053] {hive_server_hook:162} INFO - Written 5770000 rows so far.
[2021-01-07 15:25:42,483] {hive_server_hook:162} INFO - Written 5780000 rows so far.
[2021-01-07 15:25:42,907] {hive_server_hook:162} INFO - Written 5790000 rows so far.
[2021-01-07 15:25:43,332] {hive_server_hook:162} INFO - Written 5800000 rows so far.
[2021-01-07 15:25:43,727] {hive_server_hook:162} INFO - Written 5810000 rows so far.
[2021-01-07 15:25:44,148] {hive_server_hook:162} INFO - Written 5820000 rows so far.
[2021-01-07 15:25:44,564] {hive_server_hook:162} INFO - Written 5830000 rows so far.
[2021-01-07 15:25:44,988] {hive_server_hook:162} INFO - Written 5

In [14]:
## 重命名
dl_site_ts_order_clean = dl_site_ts_order_clean.rename(columns={'license_year':'license_time_year','pay_year':'publish_time_year'})


In [15]:
## 剔除极大或者极小的峰值
dl_site_ts_order_clean_count = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).count().reset_index().rename(columns={'real_pay_amount':'count'})

dl_site_ts_order_clean_mean = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).mean().reset_index().rename(columns={'real_pay_amount':'mean'})

dl_site_ts_order_clean_std = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).std().reset_index().rename(columns={'real_pay_amount':'std'})

dl_site_ts_order_clean_median = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).median().reset_index().rename(columns={'real_pay_amount':'median'})

dl_site_ts_order_clean_min = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).min().reset_index().rename(columns={'real_pay_amount':'min_price'})

dl_site_ts_order_clean_max = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).max().reset_index().rename(columns={'real_pay_amount':'max_price'})

dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_count,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_mean,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_std,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_min,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_max,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_median,on=['model_code','license_time_year'],how='left')

dl_site_ts_order_clean.loc[dl_site_ts_order_clean['mean']*0.3 < dl_site_ts_order_clean['std'],'std'] = dl_site_ts_order_clean['mean']*0.3

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median'] > dl_site_ts_order_clean['mean']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] < dl_site_ts_order_clean['mean']-dl_site_ts_order_clean['std']),'delete_flag'] = 1

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median'] < dl_site_ts_order_clean['mean']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] > dl_site_ts_order_clean['mean']+dl_site_ts_order_clean['std']),'delete_flag'] = 1

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median'] > dl_site_ts_order_clean['mean']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] < dl_site_ts_order_clean['mean']-dl_site_ts_order_clean['std']),'real_pay_amount'] = dl_site_ts_order_clean['mean']-dl_site_ts_order_clean['std']

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median'] < dl_site_ts_order_clean['mean']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] > dl_site_ts_order_clean['mean']+dl_site_ts_order_clean['std']),'real_pay_amount'] = dl_site_ts_order_clean['mean']+dl_site_ts_order_clean['std']



In [16]:
## 剔除极大或者极小的峰值
dl_site_ts_order_clean_count = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).count().reset_index().rename(columns={'real_pay_amount':'count_1'})

dl_site_ts_order_clean_mean = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).mean().reset_index().rename(columns={'real_pay_amount':'mean_1'})

dl_site_ts_order_clean_std = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).std().reset_index().rename(columns={'real_pay_amount':'std_1'})

dl_site_ts_order_clean_median = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).median().reset_index().rename(columns={'real_pay_amount':'median_1'})

dl_site_ts_order_clean_min = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).min().reset_index().rename(columns={'real_pay_amount':'min_price_1'})

dl_site_ts_order_clean_max = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).max().reset_index().rename(columns={'real_pay_amount':'max_price_1'})

dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_count,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_mean,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_std,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_min,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_max,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_median,on=['model_code','license_time_year'],how='left')

## 剔除峰值
dl_site_ts_order_clean.loc[dl_site_ts_order_clean['mean_1']*0.3 < dl_site_ts_order_clean['std_1'],'std_1'] = dl_site_ts_order_clean['mean_1']*0.3

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_1'] > dl_site_ts_order_clean['mean_1']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] < dl_site_ts_order_clean['mean_1']-dl_site_ts_order_clean['std_1']),'delete_flag_1'] = 1

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_1'] < dl_site_ts_order_clean['mean_1']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] > dl_site_ts_order_clean['mean_1']+dl_site_ts_order_clean['std_1']),'delete_flag_1'] = 1

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_1'] > dl_site_ts_order_clean['mean_1']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] < dl_site_ts_order_clean['mean_1']-dl_site_ts_order_clean['std_1']),'real_pay_amount'] = dl_site_ts_order_clean['mean_1']-dl_site_ts_order_clean['std_1']

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_1'] < dl_site_ts_order_clean['mean_1']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] > dl_site_ts_order_clean['mean_1']+dl_site_ts_order_clean['std_1']),'real_pay_amount'] = dl_site_ts_order_clean['mean_1']+dl_site_ts_order_clean['std_1']



In [17]:
## 剔除极大或者极小的峰值
dl_site_ts_order_clean_count = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).count().reset_index().rename(columns={'real_pay_amount':'count_2'})

dl_site_ts_order_clean_mean = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).mean().reset_index().rename(columns={'real_pay_amount':'mean_2'})

dl_site_ts_order_clean_std = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).std().reset_index().rename(columns={'real_pay_amount':'std_2'})

dl_site_ts_order_clean_median = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).median().reset_index().rename(columns={'real_pay_amount':'median_2'})

dl_site_ts_order_clean_min = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).min().reset_index().rename(columns={'real_pay_amount':'min_price_2'})

dl_site_ts_order_clean_max = dl_site_ts_order_clean[['model_code','license_time_year','real_pay_amount']].\
groupby(['model_code','license_time_year']).max().reset_index().rename(columns={'real_pay_amount':'max_price_2'})

dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_count,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_mean,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_std,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_min,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_max,on=['model_code','license_time_year'],how='left')
dl_site_ts_order_clean = pd.merge(dl_site_ts_order_clean,dl_site_ts_order_clean_median,on=['model_code','license_time_year'],how='left')

## 剔除峰值
dl_site_ts_order_clean.loc[dl_site_ts_order_clean['mean_2']*0.3 < dl_site_ts_order_clean['std_2'],'std_2'] = dl_site_ts_order_clean['mean_2']*0.3

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_2'] > dl_site_ts_order_clean['mean_2']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] < dl_site_ts_order_clean['mean_2']-dl_site_ts_order_clean['std_2']),'delete_flag_2'] = 1

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_2'] < dl_site_ts_order_clean['mean_2']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] > dl_site_ts_order_clean['mean_2']+dl_site_ts_order_clean['std_2']),'delete_flag_2'] = 1

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_2'] > dl_site_ts_order_clean['mean_2']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] < dl_site_ts_order_clean['mean_2']-dl_site_ts_order_clean['std_2']),'real_pay_amount'] = dl_site_ts_order_clean['mean_2']-dl_site_ts_order_clean['std_2']

dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['median_2'] < dl_site_ts_order_clean['mean_2']) & 
#                           (dl_site_ts_order_clean['mean']*0.5 <= dl_site_ts_order_clean['std']) & 
                          (dl_site_ts_order_clean['real_pay_amount'] > dl_site_ts_order_clean['mean_2']+dl_site_ts_order_clean['std_2']),'real_pay_amount'] = dl_site_ts_order_clean['mean_2']+dl_site_ts_order_clean['std_2']


In [21]:
## 保存处理后的数据
dl_site_ts_order_clean[['car_id','real_pay_amount','delete_flag','delete_flag_1','delete_flag_2']].to_csv(curr_dir+"dl_site_ts_order_clean_"+computer_year+"_yh01.csv")


In [19]:
print(dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['delete_flag'] == 1)].shape)
print(dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['delete_flag_1'] == 1)].shape)
print(dl_site_ts_order_clean.loc[(dl_site_ts_order_clean['delete_flag_2'] == 1)].shape)


(957812, 32)
(1194801, 32)
(1339102, 32)
