In [2]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore')
import logging
import datetime

import csv, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from dayu.hooks.oss_hook import OSSHook
from dayu.hooks.hive_server_hook import HiveServerHook
from dayu.hooks.hive_cli_hook import HiveCliHook

def split_table_name(datain):
    """Strip the qualifier prefix from qualified column names.

    Every column label of the form ``<qualifier>.<column>`` is renamed to the
    part after the last dot, so ``t.price`` and ``db.t.price`` both become
    ``price``.

    Parameters
    ----------
    datain : pandas.DataFrame
        Frame whose column labels may carry a dotted prefix.

    Returns
    -------
    pandas.DataFrame
        A renamed copy when *all* labels are dotted strings. As soon as one
        label has no dot (or is not a string) the input frame is returned
        unchanged -- the renaming is all-or-nothing.

    Notes
    -----
    Two qualified columns that share the same trailing name (``a.id`` and
    ``b.id``) end up as duplicate labels; no de-duplication is attempted.
    """
    renamed = {}
    for column in datain.columns:
        # One unqualified (or non-string) label means the frame is left as-is.
        if not isinstance(column, str) or '.' not in column:
            return datain
        # rsplit keeps only the trailing name; a plain two-way unpack of
        # split('.') raised ValueError on labels with more than one dot.
        renamed[column] = column.rsplit('.', 1)[1]
    return datain.rename(columns=renamed)

def read_from_hive2(output_file_name,insql,dtype):
    """Dump a Hive query to a CSV file under ``curr_dir`` and load it with pandas.

    Parameters
    ----------
    output_file_name : str
        File name appended directly to the module-level ``curr_dir`` string.
    insql : str
        Query text handed to ``HiveServerHook.to_csv`` (presumably executed
        against the "warehouse_hive" connection -- confirm against the hook).
    dtype : dict or type
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV content, first row used as header.
    """
    csv_path = curr_dir + output_file_name
    HiveServerHook("warehouse_hive").to_csv(
        insql, csv_path, delimiter=',', lineterminator='\n', output_header=True
    )
    # Column labels are returned exactly as written by the hook; stripping a
    # table-name prefix via split_table_name() is intentionally not applied.
    return pd.read_csv(csv_path, header=0, dtype=dtype)

## Compute the difference between two timestamps, in whole days
def date_time_sub(startTime,endTime,date_format):
    """Return the number of days from ``startTime`` to ``endTime``.

    Parameters
    ----------
    startTime, endTime : str
        Timestamps written in ``date_format``.
    date_format : str
        ``datetime.strptime`` format shared by both timestamps.

    Returns
    -------
    int or None
        ``(endTime - startTime).days`` (negative when ``endTime`` is earlier;
        partial days are truncated by ``timedelta.days``). ``None`` when
        either value cannot be parsed -- wrong format, or not a string at all
        (e.g. ``None`` / NaN).
    """
    try:
        start = datetime.datetime.strptime(startTime, date_format)
        end = datetime.datetime.strptime(endTime, date_format)
    except (TypeError, ValueError):
        # Only parse failures are treated as "no result"; anything else
        # (KeyboardInterrupt, programming errors) is allowed to surface.
        return None
    return (end - start).days

# Compute the vehicle's current value-retention rate
def computer_with_license_month(tar):
    """Interpolate a row's value-retention rate from its ``license_month``.

    Parameters
    ----------
    tar : mapping
        Anything indexable by string key (a dict, or presumably a DataFrame
        row passed via ``apply(axis=1)`` -- confirm against the caller) that
        holds ``license_month`` and the yearly rates ``year_1`` .. ``year_16``.

    Returns
    -------
    number
        * ``license_month <= 12``: ``tar['year_1']`` as stored (not rounded).
        * otherwise: linear interpolation between the rate of the completed
          year ``license_month // 12`` and the following year, rounded to
          4 decimals.
        * ``tar['year_16']`` whenever the lookup or arithmetic fails with a
          KeyError / TypeError / ValueError / OverflowError, e.g. the next
          year's column does not exist, or ``license_month`` is missing,
          ``None`` or NaN.
    """
    try:
        license_month = tar['license_month']
        if license_month <= 12:
            return tar['year_1']

        year = int(license_month // 12)
        # Retention rate at the end of the completed year ...
        keep_max = tar["year_" + str(year)]
        # ... and at the end of the following year.
        keep_min = tar["year_" + str(year + 1)]

        # Months elapsed since the last completed year.
        mon = license_month - 12 * year
        monthly_drop = (keep_max - keep_min) / 12

        return round(keep_max - monthly_drop * mon, 4)
    except (KeyError, TypeError, ValueError, OverflowError):
        # Deliberate fallback to the last available year. Only data-shaped
        # failures are caught, so unrelated errors are no longer hidden.
        return tar["year_16"]

class Logger:
    """Small wrapper that sends INFO messages of a named logger to a file.

    ``logging.getLogger(name)`` returns the same logger object for the same
    name, so constructing this class again (for instance by re-running the
    notebook cell) used to stack one more ``FileHandler`` on it and every
    message was then written several times. A handler for a given file is now
    attached at most once per logger.
    """

    def __init__(self, logName, logFile):
        """Bind to logger ``logName`` and make sure it writes to ``logFile``.

        Parameters
        ----------
        logName : str
            Name passed to ``logging.getLogger``.
        logFile : str
            Path of the log file; its directory must already exist.
        """
        import os  # local import: only needed to normalise the log path

        self._logger = logging.getLogger(logName)

        # FileHandler stores the absolute path in ``baseFilename``; compare
        # against that to detect a handler left over from an earlier instance.
        target = os.path.abspath(logFile)
        already_attached = any(
            isinstance(existing, logging.FileHandler) and existing.baseFilename == target
            for existing in self._logger.handlers
        )
        if not already_attached:
            handler = logging.FileHandler(logFile)
            formatter = logging.Formatter('%(asctime)s ********* %(message)s')
            handler.setFormatter(formatter)
            self._logger.addHandler(handler)

        self._logger.setLevel(logging.INFO)

    def log(self, msg):
        """Write ``msg`` at INFO level."""
        if self._logger is not None:
            self._logger.info(msg)


# ---- Notebook-wide configuration and shared handles ----

# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Working directory for every input/output file. Paths are built by plain
# string concatenation, so the trailing slash is required.
curr_dir = '/home/souche/qiongjiu/hgc/'

# Today's local date as 'YYYY-MM-DD'.
curr_date = datetime.datetime.now().date().isoformat()

hive_cli = HiveCliHook("warehouse_hive")

# File logger for this run; the ./log directory must already exist.
logger = Logger('model_service', './log/accurate_valuation_cyp_run_log.log')
logger.log("程序启动.............")


DAYU_HOME : /home/souche/projects/datacenter-etl-v2
[2021-02-03 17:06:46,080] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt
[2021-02-03 17:06:46,107] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt
[2021-02-03 17:06:46,452] {<ipython-input-2-5aad934e0497>:84} INFO - 程序启动.............


In [5]:
# Column names of the car-model feature table: descriptive attributes first,
# followed by the sixteen yearly columns year_1 .. year_16.
car_data_columns = [
    "country_id", "manufacturer", "battery_quality_mile", "battery_quality_time",
    "brand_code", "brand_name", "car_body", "cylinder_number", "driving_mode",
    "engine_volume_l", "fuel_form", "gear_box_type", "guide_price", "height",
    "import_type", "intake_type", "length", "level", "max_power", "max_torque",
    "model_code", "model_name", "quality_mile", "quality_time", "rate",
    "rate_count", "seat_number_top", "series_code", "series_level", "series_name",
    "wheel_base", "width", "year", "down_market_time", "production_time",
    "shutdown_time", "time_to_market",
] + ["year_" + str(year_no) for year_no in range(1, 17)]

# Tab-separated dump of all car models, read from the working directory.
model = pd.read_csv(curr_dir + 'all_car_model_sep_word_to_sql', sep='\t')

# Keep only the code/name pairs for model, series and brand.
ods_car_model_model = model[[
    'model_code', 'model_name',
    'series_code', 'series_name',
    'brand_code', 'brand_name',
]]


In [6]:
# Load the two per-level spreadsheets and tag every row with its level label.
enterprise2_BBB_level = pd.read_excel(
    curr_dir + "enterprise2_BBB_level_retail_median_res.xlsx"
).assign(level='B')
enterprise2_AAA_level = pd.read_excel(
    curr_dir + "enterprise2_AAA_level_retail_median_res.xlsx"
).assign(level='A')

# Stack both levels, then left-join onto the full model list so that every
# model is present even when it has no row in either spreadsheet.
enterprise2_median_res = ods_car_model_model.merge(
    pd.concat([enterprise2_BBB_level, enterprise2_AAA_level], axis=0),
    on='model_code',
    how='left'
)


In [7]:
# Mean `median_res` per series, stored under the name `series_median_res`.
enterprise2_median_res_series = (
    enterprise2_median_res
    .groupby('series_code')[['median_res']]
    .mean()
    .reset_index()
    .rename(columns={'median_res': 'series_median_res'})
)

# Attach the series-level mean to every model row.
enterprise2_median_res = enterprise2_median_res.merge(
    enterprise2_median_res_series, on='series_code', how='left'
)

# Models without their own `median_res` take their series mean when one exists.
# The right-hand side is a Series; pandas aligns it on the row index.
enterprise2_median_res.loc[
    enterprise2_median_res['median_res'].isnull()
    & enterprise2_median_res['series_median_res'].notnull(),
    'median_res'
] = enterprise2_median_res['series_median_res']


In [8]:
# Mean `median_res` over the rows labelled with each level.
level_A_mean = enterprise2_median_res.loc[
    enterprise2_median_res['level'].eq('A'), 'median_res'
].mean()
level_B_mean = enterprise2_median_res.loc[
    enterprise2_median_res['level'].eq('B'), 'median_res'
].mean()

# Rows still lacking `median_res` take the mean of their own level.
for _level, _level_mean in (('A', level_A_mean), ('B', level_B_mean)):
    enterprise2_median_res.loc[
        enterprise2_median_res['median_res'].isnull()
        & enterprise2_median_res['level'].eq(_level),
        'median_res'
    ] = _level_mean

# Whatever is left (rows with no level label) gets the overall mean.
enterprise2_median_res.loc[
    enterprise2_median_res['median_res'].isnull(), 'median_res'
] = enterprise2_median_res['median_res'].mean()



In [10]:
# Write the identifying columns plus `median_res` to CSV in the working
# directory. The row index is written too (to_csv default).
enterprise2_median_res.to_csv(
    curr_dir + "enterprise2_retail_median_res.csv",
    columns=[
        'model_code', 'model_name',
        'series_code', 'series_name',
        'brand_code', 'brand_name',
        'median_res',
    ]
)


In [11]:
## Upload the exported statistics CSV to OSS, under a folder named after today's date.
## (Original note: "export the median, mean and standard deviation of the data";
##  the columns written to this file by the export cell end with `median_res` only.)
ph = curr_dir+"enterprise2_retail_median_res.csv"
# OSSHook is imported once in the notebook's import cell; the duplicate
# mid-notebook import that used to sit here was redundant and has been dropped.
oss = OSSHook("oss_algorithm")
oss.put_file("algorithm/qiongjiu/valuation/全网数据-优化模型/"+curr_date+"/enterprise2_retail_median_res.csv", ph)


[2021-02-03 17:07:12,466] {oss_hook:28} INFO - Done. Loaded the key algorithm/qiongjiu/valuation/全网数据-优化模型/2021-02-03/enterprise2_retail_median_res.csv .
