In [1]:
import re
import os
import json
import jieba
#import cn2an
import requests
import platform
import numpy as np
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine
from chinese_to_numbers import * 
pd.options.mode.chained_assignment = None

IS_DEV = True if platform.platform().find('Linux') == -1 else False

# online
table_name_1 = "all_car_model_sep_word_to_sql"
table_name_2 = "tfidf_to_sql"
table_name_3 = "i_enterprise_cleaned_all_model_code_to_sql"

ctn = chinese_to_number()

def _print(text):
    print("==>{} {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text))

class DB(object):
    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = object.__new__(cls)
            host = "rm-bp151507j1na5atv3go.mysql.rds.aliyuncs.com" if IS_DEV else "rm-bp151507j1na5atv3.mysql.rds.aliyuncs.com"
            cls._instance.i_find_con = create_engine("mysql+pymysql://ifind_w:NkM4gO9cxJrGOWVt9uLB9e6LdgcEJu@{}:3306/ifind?charset=utf8mb4".format(host))
            cls._instance.enterprise_con = create_engine("mysql+pymysql://cyp_enterprise_w:h7wWszUTUHgh54DB2rwiEXajhpF5k9@{}:3306/cyp_enterprise?charset=utf8mb4".format(host))
        return cls._instance

    def i_find_query(self, sql):
        return pd.read_sql(sql=sql, con=self.i_find_con)

    def enterprise_query(self, sql):
        return pd.read_sql(sql=sql, con=self.enterprise_con)

    def close_con(self):
        self.i_find_con.dispose()
        self.enterprise_con.dispose()

def merge_data(df1, df2, anchor):
    cols_to_use = df2.columns.difference(df1.columns)
    df2_reindex = df2.set_index(anchor)[cols_to_use].reset_index()
    return df1.merge(df2_reindex, on=anchor, how='outer')

def mapping(data, param, dictionary, Param_dict):
    dict_temp = dictionary[dictionary['field'] == Param_dict][['key', 'value']].set_index('key')
    return data[param].map(dict_temp['value'], na_action='ignore').fillna(data[param])

def replace_with_nan(data):
    for column in data.select_dtypes(object):
        data[column] = data[column].replace(['', ' ', 'none', 'None', '-', '未知'], '缺失')
        data[column] = data[column].fillna('缺失')
    for column in data.select_dtypes(float):
        data[column] = data[column].replace(['', ' ', 'none', 'None', '-', '未知'], np.nan)
        data[column] = data[column].fillna(0)
    return data

def impute_battery_quality_time(df):
    if pd.isnull(df):
        return 8
    if df in ['', ' ', '-']:
        return 8
    elif '个月' in df:
        N = df.split('个月')[0]
        #return float(cn2an.cn2an(N.split('.')[0], 'smart')) / 12
        return float(ctn.compose_decimal(N.split('.')[0])) / 12
    elif '年' in df:
        N = df.split('年')[0]
        if N in ['不限']:
            return 10000
        else:
            #return float(cn2an.cn2an(N.split('.')[0], 'smart'))
            return float(ctn.compose_decimal(N.split('.')[0]))
    else:
        return 10000

def impute_battery_quality_mile(df):
    if pd.isnull(df):
        return 15
    if df in ['', ' ', '-']:
        return 15
    elif '/' in df:
        N = df.split('公里')[0].split('/')[1].split('万')[0]
        if N in ['不限', '不限里程']:
            return 10000
        else:
            #return float(cn2an.cn2an(N.split('.')[0], 'smart'))
            return float(ctn.compose_decimal(N.split('.')[0]))
    elif '或' in df:
        N = df.split('公里')[0].split('或')[1].split('万')[0]
        if N in ['不限', '不限里程']:
            return 10000
        else:
            #return float(cn2an.cn2an(N.split('.')[0], 'smart'))
            return float(ctn.compose_decimal(N.split('.')[0]))
    else:
        return 10000

def impute_quality_time(df):
    try:
        if pd.isnull(df):
            return 3
        if df in ['', ' ', '-']:
            return 3
        elif '个月' in df:
            N = df.split('个月')[0]
            #return float(cn2an.cn2an(N.split('.')[0], 'smart')) / 12
            return float(ctn.compose_decimal(N.split('.')[0])) / 12
        elif '年' in df:
            N = df.split('年')[0]
            if N in ['不限']:
                return 10000
            else:
                #return float(cn2an.cn2an(N.split('.')[0], 'smart'))
                return float(ctn.compose_decimal(N.split('.')[0]))
        else:
            return 10000
    except:
        print(df)
        return 10000

def impute_quality_mile(df):
    try:
        if pd.isnull(df):
            return 6
        if df in ['', ' ', '-']:
            return 6
        elif '/' in df:
            N = df.split('公里')[0].split('/')[1].split('万')[0]
            if N in ['不限', '不限里程']:
                return 10000
            else:
                #return float(cn2an.cn2an(N.split('.')[0], 'smart'))
                return float(ctn.compose_decimal(N.split('.')[0]))
        elif '或' in df:
            N = df.split('公里')[0].split('或')[1].split('万')[0]
            if N in ['不限', '不限里程']:
                return 10000
            else:
                #return float(cn2an.cn2an(N.split('.')[0], 'smart'))
                return float(ctn.compose_decimal(N.split('.')[0]))
        else:
            return 10000
    except:
        print(df)
        return 100000
def split_electric_engine(data):
    # 纯电动 max_power如果缺失，从engine里面找
    for i in range(0, data.shape[0]):
        if data['max_power'].iloc[i] == 0.0:
            if data['engine'].iloc[i] != '缺失':
                if len(data['engine'].iloc[i].split(' ')) == 2:
                    data['max_power'].iloc[i] = float(
                        data['engine'].iloc[i].split(' ')[1].strip('马力')) / 1.36
    return data

def split_engine(data):
    for i in range(0, data.shape[0]):
        if data['max_power'].iloc[i] == 0.0:
            if data['engine'].iloc[i] != '缺失':
                if len([x for x in data['engine'].iloc[i].split(' ') if x]) == 3:
                    data['max_power'].iloc[i] = float(data['engine'].iloc[i].split(' ')[1].strip('马力')) / 1.36
        if data['engine_volume_l'].iloc[i] == 0.0:
            if data['engine'].iloc[i] != '缺失':
                if len([x for x in data['engine'].iloc[i].split(' ') if x]) == 3:
                    data['engine_volume_l'].iloc[i] = float(re.sub(r'[^\d.]+', '', data['engine'].iloc[i].split(' ')[0]))
        if data['cylinder_number'].iloc[i] == 0.0:
            if data['engine'].iloc[i] != '缺失':
                if len([x for x in data['engine'].iloc[i].split(' ') if x]) == 3:
                    data['cylinder_number'].iloc[i] = float(re.sub(r'[^\d.]+', '', data['engine'].iloc[i].split(' ')[-1]))
    return data

def impute_gearbox(df):
    if df in ['手自一体变速箱(AT)']:
        return '自动变速箱(AT)'
    else:
        return df

def to_Julian_date(df):
    # 将日期改成Julian Date
    for i, x in enumerate(df):
        if not pd.isnull(x):
            df.iloc[i] = x.to_julian_date()
    return df

def impute_cylinder_number(df):
    if df == 122.0:
        return 12.0
    else:
        return df

def strip_model_name(df):
    for sep in ['版', '型', '款', '(', ')', '（', '）', '【', '】', '-', '/', '——', '+']:
        df = [x.replace(sep, '') for x in df]
    return df

def strip_manufacturer(df):
    for sep in ['(', ')', '（', '）', '-', '·']:
        df = [x.replace(sep, '') for x in df]
    return df

def sep_manufacturer(df):
    df_temp = []
    for item in df:
        item = jieba.lcut(item)
        item = [x for x in item if x != " "]
        item = " ".join(item)
        df_temp.append(item)
    return df_temp

class label_regularize():
    def __init__(self, df, column):
        self.df = df
        self.column = column

    def count_label(self):
        temp = pd.DataFrame({'label': self.df[self.column].value_counts().index,
                             'number': self.df[self.column].value_counts().values})
        return temp[temp['number'] <= 5]['label'].tolist()

    def label_impute(self):
        self.label = self.count_label()
        self.df.loc[self.df[self.column].isin(self.label), self.column] = '其他'
        return self.df

def jieba_cut(df):
    df['manufacturer'] = strip_manufacturer(df['manufacturer'])
    df['manufacturer'] = sep_manufacturer(df['manufacturer'])
    df['model_name'] = strip_model_name(df['model_name'])
    df['model_name'] = [x.split(' ') for x in df['model_name']]
    df_temp = []
    for i, item in enumerate(df['model_name']):
        item = [x for x in item if x != ""]
        item = item[2:]
        item = " ".join(item)
        # jieba 分词 只分车型名称后缀
        item = jieba.lcut(item)
        item.insert(0, df['series_name'].iloc[i])
        item.insert(0, df['manufacturer'].iloc[i])
        item.insert(0, str(int(df['year'].iloc[i])))
        item = [x for x in item if x != " "]
        item = " ".join(item)
        df_temp.append(item)
    df['model_name'] = df_temp
    return df

def send_dd_message(message):
    dd_url = 'https://oapi.dingtalk.com/robot/send?access_token=946b2fe7c4637185ca8a6886f4bc934c0c19d71ce0d216ef0d9df3d5ab07fb75'  # 史上最强
    headers = {
        "Content-Type": "application/json",
        "charset": "utf-8"
    }
    data = {
        "msgtype": "markdown",
        "markdown": {
            "title": "script_report",
            "text": message
        }
    }
    _print(requests.post(url=dd_url, headers=headers, data=json.dumps(data)).text)

def parse_group(df, column):
    df[column] = df[column].map(lambda x:str(x))
    condition_none = (df[column] == "缺失") | (df[column] == 0)  # 缺失条件
    if True in condition_none.tolist():  # 判断是否缺失
        unique_set = set(df[~condition_none][column].tolist())  # 去重正常字段
        if len(unique_set) == 1:  # 判断数量
            df[column] = list(unique_set)[0]
    return df

# 处理保值率表
def clean_dataset(df):
    df.drop_duplicates(keep='last', inplace=True)  # 剔除完全重复的数据；
    df.sort_values(by='date_update', axis=0, ascending=True, inplace=True)
    df.drop_duplicates(subset='series_code', keep='last', inplace=True)  # 保留最新的数据
    return df

def drop_per(df):
    for i in range(1, 17):
        name = "year_" + str(int(i))
        df[name] = df[name].split('%')[0]
    return df


In [2]:
import logging
import datetime
from logging.handlers import TimedRotatingFileHandler
from logging.handlers import RotatingFileHandler
from sklearn.externals import joblib
from collections import OrderedDict
import sys
from dayu.hooks.oss_hook import OSSHook

from dayu.hooks.hive_server_hook import HiveServerHook
from dayu.hooks.hive_cli_hook import HiveCliHook

import pandas as pd 
import numpy as np
from fastFM import sgd
from scipy import sparse
from scipy.sparse import csc_matrix
import time
from sklearn.metrics import auc,accuracy_score,roc_curve,roc_auc_score
from sklearn.metrics import confusion_matrix,recall_score
from sklearn import  metrics
from sklearn.linear_model import LogisticRegression

def split_table_name(datain):
    new_cols = []
    for column in datain.columns:
        if(len(column.split('.'))<2):
            return datain
        tb_name, col_name = column.split('.')
        new_cols.append((column, col_name))
    datain = datain.rename(columns=dict(new_cols))
    return datain

def read_from_hive2(output_file_name,insql,dtype):
    filename = output_file_name
    filepath = curr_dir+filename
    hive = HiveServerHook("warehouse_hive")
    hive.to_csv(insql,filepath , delimiter=',',lineterminator='\n', output_header=True)
    outdata = pd.read_csv(filepath, header=0,dtype=dtype)
    # 去除列名中带有的表名
    outdata = split_table_name(outdata)
    return outdata

curr_date = str(datetime.datetime.now())[0:10]
logging.info('当前日期: %s'%(curr_date))
curr_dir = '/home/souche/qiongjiu/hgc/'
curr_date=str(datetime.datetime.now())[0:10]
# hive_cli = HiveCliHook("warehouse_hive")

DAYU_HOME : /home/souche/projects/datacenter-etl-v2


  """)


[2021-02-02 15:13:50,731] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt
[2021-02-02 15:13:50,760] {driver:120} INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt
[2021-02-02 15:13:51,344] {<ipython-input-2-443fa120637a>:45} INFO - 当前日期: 2021-02-02


In [3]:
hive_cli = HiveCliHook("warehouse_hive")

In [4]:
# 获取车型库字典表
sql='''
select * from db_data.ods_car_model_dictionary 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
car_dictionary= read_from_hive2('ods_car_model_dictionary',sql,dtype)
print('获取车型库字典表...ok')
# 获取车型指导价
sql='''
select * from db_data.ods_car_model_model_price 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
model_price= read_from_hive2('ods_car_model_model_price',sql,dtype)
print('获取车型指导价...ok')
# 获取车型Parameter_by_cyp
sql='''
select model_code,
       series_level,
       Down_market_time,
       Production_time,
       Shutdown_time,
       Time_to_market 
from db_data.ods_cyp_enterprise_model_parameter_new 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
model_parameter_cyp= read_from_hive2('ods_cyp_enterprise_model_parameter_new',sql,dtype)
print('获取车型Parameter_by_cyp...ok')
#获取合并车型表
sql='''
select * 
from db_data.ods_car_model_model_bak 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
model_bak= read_from_hive2('ods_car_model_model_bak',sql,dtype)
print('获取合并车型表...ok')

#获取车型Configuration
sql='''
select * 
from db_data.ods_car_model_model_configuration 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
model_configuration= read_from_hive2('ods_car_model_model_configuration',sql,dtype)
print('获取车型Configuration...ok')
#获取车型Parameter
sql='''
select * 
from db_data.ods_car_model_model_parameter  
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
model_parameter= read_from_hive2('ods_car_model_model_parameter',sql,dtype)
print('获取车型Parameter...ok')
#获取车型列表
sql='''
select * 
from db_data.ods_car_model_model   
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
model = read_from_hive2('ods_car_model_model',sql,dtype)
print('获取车型列表...ok')
#获取车型列表
sql='''
select * 
from db_data.ods_ifind_ienterprise_dict_to_table    
where ds = '2021-01-01'
'''
dtype={'uid':str}
dict_to_table = read_from_hive2('ods_ifind_ienterprise_dict_to_table',sql,dtype)
print('获取车型列表...ok')
#获取原有指导价和填充后的指导价
sql='''
select * 
from db_data.ods_ifind_ienterprise_hi_or_ai     
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
guide_price = read_from_hive2('ods_ifind_ienterprise_hi_or_ai',sql,dtype)
print('获取原有指导价和填充后的指导价...ok')
#获取车系表
sql='''
select series_code,
       level,
       manufacturer as manufacturer_id 
from db_data.ods_car_model_series 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
series = read_from_hive2('ods_car_model_series',sql,dtype)
print('获取车系表...ok')
#获取厂商
sql='''
select code as manufacturer_id, 
       name as Manufacturer 
from db_data.ods_car_model_manufacturer 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
manufacturer = read_from_hive2('ods_car_model_manufacturer',sql,dtype)
print('获取厂商...ok')
#获取品牌
sql='''
select brand_code,
       country as Country_Id 
from db_cmp.ods_oth_car_model_brand_dd 
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
Brand = read_from_hive2('ods_oth_car_model_brand_dd',sql,dtype)
print('获取品牌...ok')
#获取保值率表
sql='''
select * from db_data.ods_scrapy_che168_keep_value
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
Keep_Value = read_from_hive2('ods_scrapy_che168_keep_value',sql,dtype)
print('获取保值率表...ok')
#获取历史的评分
sql='''
select model_code,
       rate, 
       rate_count 
from db_data.mid_car_car_evaluation_score
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
Rate_history = read_from_hive2('mid_car_car_evaluation_score',sql,dtype)
print('获取历史的保值率表,不再更新...ok')
#获取评分数据
sql='''
select model_code as autohome_id,
       rate, 
       rate_count 
from db_data.ods_che168_score_info
where ds = date_sub("'''+curr_date+'''",1)
'''
dtype={'uid':str}
Rate = read_from_hive2('ods_che168_score_info',sql,dtype)
print('获取新的保值率表,不再更新...ok')


[2021-02-02 15:14:38,397] {hiveserver2:138} INFO - Using database default as default
[2021-02-02 15:14:38,541] {hive_server_hook:112} INFO - Running query: 
select * from db_data.ods_car_model_dictionary 
where ds = date_sub("2021-02-02",1)

[2021-02-02 15:14:39,346] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-02 15:14:39,595] {hive_server_hook:162} INFO - Written 14271 rows so far.
[2021-02-02 15:14:39,600] {hiveserver2:265} INFO - Closing active operation
[2021-02-02 15:14:39,613] {hive_server_hook:163} INFO - Done. Loaded a total of 14271 rows.
获取车型库字典表...ok
[2021-02-02 15:14:39,687] {hiveserver2:138} INFO - Using database default as default
[2021-02-02 15:14:39,824] {hive_server_hook:112} INFO - Running query: 
select * from db_data.ods_car_model_model_price 
where ds = date_sub("2021-02-02",1)

[2021-02-02 15:14:40,928] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-02 15:14:41,779] {hive_server_hook:162} INFO - Written 20000 rows so far.
[

  if self.run_code(code, result):


[2021-02-02 15:14:46,672] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-02 15:14:46,981] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-02-02 15:14:47,289] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-02-02 15:14:47,575] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-02-02 15:14:47,840] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-02-02 15:14:48,124] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-02-02 15:14:48,214] {hive_server_hook:162} INFO - Written 63469 rows so far.
[2021-02-02 15:14:48,219] {hiveserver2:265} INFO - Closing active operation
[2021-02-02 15:14:48,232] {hive_server_hook:163} INFO - Done. Loaded a total of 63469 rows.
获取车型Parameter_by_cyp...ok
[2021-02-02 15:14:48,358] {hiveserver2:138} INFO - Using database default as default
[2021-02-02 15:14:48,505] {hive_server_hook:112} INFO - Running query: 
select * 
from db_data.ods_car_model_model_bak 
where ds = date_sub(

  if self.run_code(code, result):


获取车型Configuration...ok
[2021-02-02 15:15:23,334] {hiveserver2:138} INFO - Using database default as default
[2021-02-02 15:15:23,450] {hive_server_hook:112} INFO - Running query: 
select * 
from db_data.ods_car_model_model_parameter  
where ds = date_sub("2021-02-02",1)

[2021-02-02 15:15:27,753] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-02 15:15:31,617] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-02-02 15:15:35,231] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-02-02 15:15:38,757] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-02-02 15:15:42,147] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-02-02 15:15:45,755] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-02-02 15:15:49,129] {hive_server_hook:162} INFO - Written 70000 rows so far.
[2021-02-02 15:15:50,157] {hive_server_hook:162} INFO - Written 73634 rows so far.
[2021-02-02 15:15:50,172] {hiveserver2:265} INFO - Closing activ

  if self.run_code(code, result):


[2021-02-02 15:15:51,332] {hive_server_hook:112} INFO - Running query: 
select * 
from db_data.ods_car_model_model   
where ds = date_sub("2021-02-02",1)

[2021-02-02 15:15:53,166] {hive_server_hook:162} INFO - Written 10000 rows so far.
[2021-02-02 15:15:54,769] {hive_server_hook:162} INFO - Written 20000 rows so far.
[2021-02-02 15:15:56,378] {hive_server_hook:162} INFO - Written 30000 rows so far.
[2021-02-02 15:15:57,946] {hive_server_hook:162} INFO - Written 40000 rows so far.
[2021-02-02 15:15:59,369] {hive_server_hook:162} INFO - Written 50000 rows so far.
[2021-02-02 15:16:00,904] {hive_server_hook:162} INFO - Written 60000 rows so far.
[2021-02-02 15:16:02,458] {hive_server_hook:162} INFO - Written 70000 rows so far.
[2021-02-02 15:16:02,913] {hive_server_hook:162} INFO - Written 73431 rows so far.
[2021-02-02 15:16:02,921] {hiveserver2:265} INFO - Closing active operation
[2021-02-02 15:16:02,938] {hive_server_hook:163} INFO - Done. Loaded a total of 73431 rows.
获取车型列表...ok
[

  if self.run_code(code, result):


[2021-02-02 15:16:03,734] {hive_server_hook:112} INFO - Running query: 
select * 
from db_data.ods_ifind_ienterprise_dict_to_table    
where ds = '2021-01-01'

[2021-02-02 15:16:03,879] {hive_server_hook:162} INFO - Written 183 rows so far.
[2021-02-02 15:16:03,882] {hiveserver2:265} INFO - Closing active operation
[2021-02-02 15:16:03,894] {hive_server_hook:163} INFO - Done. Loaded a total of 183 rows.
获取车型列表...ok
[2021-02-02 15:16:03,917] {hiveserver2:138} INFO - Using database default as default
[2021-02-02 15:16:04,035] {hive_server_hook:112} INFO - Running query: 
select * 
from db_data.ods_ifind_ienterprise_hi_or_ai     
where ds = date_sub("2021-02-02",1)

[2021-02-02 15:16:04,142] {hiveserver2:265} INFO - Closing active operation
[2021-02-02 15:16:04,153] {hive_server_hook:163} INFO - Done. Loaded a total of 0 rows.
获取原有指导价和填充后的指导价...ok
[2021-02-02 15:16:04,186] {hiveserver2:138} INFO - Using database default as default
[2021-02-02 15:16:04,324] {hive_server_hook:112} INFO - Ru

In [5]:
print("-*- 2 -*-")
# Merge 'model' and 'series' and 'brand' and 'manufacturer'
model['series_code'] = model['series_code'].map(lambda x:str(x))
series['series_code'] = series['series_code'].map(lambda x:str(x))
model = model.merge(series, on='series_code', how='left')

model['manufacturer_id'] = model['manufacturer_id'].map(lambda x:str(x))
manufacturer['manufacturer_id'] = manufacturer['manufacturer_id'].map(lambda x:str(x))
model = model.merge(manufacturer, on='manufacturer_id', how='left').drop(['manufacturer_id'], axis=1)

model['brand_code'] = model['brand_code'].map(lambda x:str(x))
Brand['brand_code'] = Brand['brand_code'].map(lambda x:str(x))
model = model.merge(Brand, on='brand_code', how='left')

# Merge 'model' and 'rate' and 'rate_history'
model['autohome_id'] = model['autohome_id'].map(lambda x:str(x))
Rate['autohome_id'] = Rate['autohome_id'].map(lambda x:str(x))
model = model.merge(Rate, on='autohome_id', how='left')
Rate_history.drop_duplicates(keep='last', inplace=True)
Rate_history.drop_duplicates(subset = 'model_code',keep='last', inplace=True)

model['model_code'] = model['model_code'].map(lambda x:str(x))
Rate_history['model_code'] = Rate_history['model_code'].map(lambda x:str(x))
model_temp = model.drop(['rate', 'rate_count'], axis=1).merge(Rate_history, on='model_code', how='left')
model = model.combine_first(model_temp)
del model_temp


-*- 2 -*-


In [59]:
# Merge 'model' and 'model_parameter_cyp' and 'model_configuration' and 'model_parameter'
model_parameter_cyp['model_code'] = model_parameter_cyp['model_code'].map(lambda x:str(x))
model_configuration['model_code'] = model_configuration['model_code'].map(lambda x:str(x))
model_parameter['model_code'] = model_parameter['model_code'].map(lambda x:str(x))
model_price['model_code'] = model_price['model_code'].map(lambda x:str(x))
model_combined = merge_data(model, model_parameter_cyp, 'model_code')
model_combined = merge_data(model_combined, model_configuration, 'model_code')
model_combined = merge_data(model_combined, model_parameter, 'model_code')
model_combined = merge_data(model_combined, model_price[['model_code', 'guide_price']], 'model_code')

# 填充指导价
guide_price['guide_price'] = 10000 * guide_price['guide_price']
model_combined_temp = model_combined.drop('guide_price', 1).merge(guide_price[['model_code', 'guide_price']], 'left', on='model_code').set_index(model_combined.index)
model_combined = model_combined.combine_first(model_combined_temp)
del model_combined_temp

# 删除指导价为0的车型
model_combined = model_combined[(model_combined['guide_price'].notna()) & (model_combined['guide_price'] != 0)] 

model_combined_usable = model_combined.copy()


In [7]:
print("-*- 3 -*-")
# 车型库字典到数据表中字段的映射
# 'Name_in_dict' 为common的表明对应的字段是 1 or 0.
# 如果mapping_key不为空，其value就要被映射到对应的field里的key里
field_list = car_dictionary['field'].unique()
new_car_dictionary = pd.DataFrame()
for field in field_list:
    temp = car_dictionary[car_dictionary['field'] == field]
    temp = temp.reset_index(drop=True)
    for i, key in enumerate(temp['mapping_key']):
        if not pd.isnull(key):
            temp.loc[i, 'value'] = temp[temp['key'] == key]['value'].tolist()
    new_car_dictionary = pd.concat([new_car_dictionary, temp], axis=0)

new_car_dictionary = new_car_dictionary.reset_index(drop=True)

for col in dict_to_table[dict_to_table['name_in_dict'] != 'common']['name_in_tables'].unique():
    print(col)
    col_dic = dict_to_table[dict_to_table['name_in_tables'] == col]['name_in_dict'].iloc[0]
    model_combined_usable[col] = mapping(model_combined_usable, col, new_car_dictionary, col_dic)


-*- 3 -*-
level
gear_box
quality_assurance
energy_type
car_body
intake_type
cylinder_arrangement_type
admission_gear
fuel_form
fuel_number
oil_supply_mode
cylinder_head_material
cylinder_body_material
environmental_standards
electric_type
battery_type
short_name
gear_box_type
driving_mode
four_wheel_drive_mode
center_differential_structure
front_suspension_type
behind_suspension_type
assist_type
body_structure
front_brake_type
behind_brake_type
parking_brake_type
spare_tyre_size
car_body_material
outside_pedal
behind_door_open_way
engine_layout_set
four_wheel_transform_mode
driver_seat_air_bag
side_air_bag
head_air_bag
variable_suspension
front_differential_lock
rear_differential_lock
sliding_door
derma_steering_wheel
steering_wheel_adjust
park_radar
chair_material
driver_seat_adjust
third_chair
chair_warm
chair_ventilate
chair_massage
rear_chair_fold_style
central_armrest
output_audio_interface
multimedia
near_light
far_light
electric_window
rearview_anti_glare
air_conditioner_control

In [46]:
print("-*- 4 -*-")
model_combined_usable_cleaned = model_combined_usable.drop(['sub_series_code',
                                                            'sub_series_name',
                                                            'vague_key_word',
                                                            'lead_pic',
                                                            'source',
                                                            'short_name',
                                                            'vague_key_word',
                                                            'model_type',
                                                            'model_type_data',
                                                            'model_type_cheniu',
                                                            'sell_status',
                                                            'exclusive',
                                                            'is_hidden',
                                                            'souche_display',
                                                            'ch168_display',
                                                            'date_create',
                                                            'date_update',
                                                            'alias_name',
                                                            'exact_key_word',
                                                            'display_tag',
                                                            'license_car_type',
                                                            'gb_hostling_weight',
                                                            'gb_code',
                                                            'cheyipai_model_name',
                                                            'biz_tag',
                                                            'body_formid',
                                                            'car_body_decorate',
                                                            'car_bottom_lamp',
                                                            'chair_memory',
                                                            'pedal_damping_adjust',
                                                            'tempered_glass',
                                                            'vehicle_theft_track',
                                                            'container_size',
                                                            'four_wheel_transform_mode',
                                                            'secondary_oil_box_capacity',
                                                            'third_head_space',
                                                            'third_leg_space',
                                                            'third_shoulder_space'], axis=1)

# 合并被合并掉的model_code
model_merge_original = model_combined_usable_cleaned[model_combined_usable_cleaned['model_code'].isin(model_bak['mapping_code'])]
model_bak['mapping_code'] = model_bak['mapping_code'].map(lambda x:str(x))
model_need_merge = model_bak[model_bak['mapping_code'].isin(model_merge_original['model_code'])][['model_code', 'mapping_code']]
model_need_merge.columns = ['model_code1', 'model_code']
model_merge_after = model_merge_original.merge(model_need_merge, on='model_code', how='left')
# del model_merge_original



-*- 4 -*-


In [48]:
print("-*- 5 -*-")
model_merge_after = model_merge_after.drop('model_code', axis=1)
model_merge_after.rename(columns={'model_code1': 'model_code'}, inplace=True)
model_combined_usable_cleaned = pd.concat([model_combined_usable_cleaned, model_merge_after], axis=0, sort=False)
model_combined_usable_cleaned = model_combined_usable_cleaned.reset_index(drop=True)
model_combined_usable_cleaned = model_combined_usable_cleaned.dropna(subset=['model_name'])


-*- 5 -*-


In [53]:
print("-*- 6 -*-")
# 存一个全的表最后用
all_model = model_combined_usable_cleaned.copy()

# 剔除'微卡', '中客', '大客', '重型货车', '货车', '房车', '-'等， 具体逻辑看代码
modellist = model_combined_usable_cleaned[
    (model_combined_usable_cleaned['car_body'].isin(['货车', '-']))
    | (model_combined_usable_cleaned['level'].isin(['微卡', '中客', '大客', '重型货车', '货车', '房车', '-']))
    | (model_combined_usable_cleaned['seat_number_top'] >= 10)
    ]['model_code']

model_combined_usable_cleaned = model_combined_usable_cleaned[~model_combined_usable_cleaned['model_code'].isin(modellist)]
print("车型数量：",model_combined_usable_cleaned.shape)
# print(model_combined_usable_cleaned.loc[model_combined_usable_cleaned['model_code'] == '38440'])
# 剔除更多的商用车，测试车型
model_combined_usable_cleaned = model_combined_usable_cleaned[~model_combined_usable_cleaned['series_name']
    .isin(['test-zzsouche', 'test-z', '其它', '庆铃 五十铃载货车', 'series-dasouche', '欧马可', '多利卡', 'TESTSERIES001', 'series-dasouche-1'])]

# 将quality_assurance拆分成 quality_mile 和 quality_time两个字段
model_combined_usable_cleaned['quality_time'] = model_combined_usable_cleaned['quality_assurance']
model_combined_usable_cleaned['quality_mile'] = model_combined_usable_cleaned['quality_assurance']
model_combined_usable_cleaned['quality_time'] = model_combined_usable_cleaned['quality_time'].apply(impute_quality_time)
model_combined_usable_cleaned['quality_mile'] = model_combined_usable_cleaned['quality_mile'].apply(impute_quality_mile)

model_combined_usable_cleaned = model_combined_usable_cleaned.drop('quality_assurance', axis=1)
# print(model_combined_usable_cleaned.loc[model_combined_usable_cleaned['model_code'] == '38440'])
for column in ['is_import',
               #'import_type',
               'import_type_id',
               'brand_Name',
               'body_form',
               'gear_box',
               'energy_type',
               'manufacture',
               'engine_layout_set',
               'engine_model',
               'engine_specific_technique',
               'is_replacement',
               'generation']:
    if column in model_combined_usable_cleaned.columns.tolist():
        model_combined_usable_cleaned = model_combined_usable_cleaned.drop([column], axis=1)

# 分传统能源和新能源车数据
Electric_Hybrid_car_model = model_combined_usable_cleaned[model_combined_usable_cleaned['fuel_form'].isin(['油电混合', '纯电动', '油气混合', '插电式混合动力', '增程式'])]
Traditional_car_model = model_combined_usable_cleaned[~model_combined_usable_cleaned['fuel_form'].isin(['油电混合', '纯电动', '油气混合', '插电式混合动力', '增程式'])]

# 将数字型变量空值转换成np.nan，字符串型变量空值转换成缺失



-*- 6 -*-
车型数量： (60745, 293)
可选:三年或15万公里/五年或10万公里/七年或7万公里
可选:三年或15万公里/五年或10万公里/七年或7万公里
可选:三年或15万公里/五年或10万公里/七年或7万公里
可选:三年或15万公里/五年或10万公里/七年或7万公里
可选:三年或15万公里/五年或10万公里/七年或7万公里
可选:三年或15万公里/五年或10万公里/七年或7万公里
可选:三年或15万公里/五年或10万公里/七年或7万公里


In [14]:
print("-*- 7 -*-")
Electric_Hybrid_car_model_for_class = Electric_Hybrid_car_model[['model_code',
                                                                 'series_code',
                                                                 'brand_code',
                                                                 'model_name',
                                                                 'brand_name',
                                                                 'series_name',
                                                                 'driving_mode',
                                                                 'gear_box_type',
                                                                 'year',
                                                                 'country_id',
                                                                 'import_type',
                                                                 'manufacturer',
                                                                 'series_level',
                                                                 'wheel_base',
                                                                 'length',
                                                                 'height',
                                                                 'width',
                                                                 'max_torque',
                                                                 'max_power',
                                                                 'electric_total_power',
                                                                 'electric_total_torque',
                                                                 'guide_price',
                                                                 'engine',
                                                                 'engine_volume_l',
                                                                 'cylinder_number',
                                                                 'intake_type',
                                                                 'fuel_form',
                                                                 'seat_number_top',
                                                                 'battery_assurance',
                                                                 'quality_time',
                                                                 'quality_mile',
                                                                 'car_body',
                                                                 'level',
                                                                 'rate',
                                                                 'rate_count'
                                                                 ]]
Electric_Hybrid_car_model_for_class = replace_with_nan(Electric_Hybrid_car_model_for_class)

# 增程式的车用electric_total_power，torque 替代 max_power，max_torque
fuel_form_condition = Electric_Hybrid_car_model_for_class['fuel_form'] == '增程式'
Electric_Hybrid_car_model_for_class.loc[fuel_form_condition, 'max_power'] = Electric_Hybrid_car_model_for_class[fuel_form_condition]['electric_total_power']
Electric_Hybrid_car_model_for_class.loc[fuel_form_condition, 'max_torque'] = Electric_Hybrid_car_model_for_class[fuel_form_condition]['electric_total_torque']

# 将battery_assurance拆分成 battery_quality_mile 和 battery_quality_time两个字段
Electric_Hybrid_car_model_for_class['battery_quality_time'] = Electric_Hybrid_car_model_for_class['battery_assurance']
Electric_Hybrid_car_model_for_class['battery_quality_mile'] = Electric_Hybrid_car_model_for_class['battery_assurance']
Electric_Hybrid_car_model_for_class['battery_quality_time'] = Electric_Hybrid_car_model_for_class['battery_quality_time'].apply(impute_battery_quality_time)
Electric_Hybrid_car_model_for_class['battery_quality_mile'] = Electric_Hybrid_car_model_for_class['battery_quality_mile'].apply(impute_battery_quality_mile)
Electric_Hybrid_car_model_for_class = Electric_Hybrid_car_model_for_class.drop('battery_assurance', axis=1)
Electric_Hybrid_car_model_for_class = Electric_Hybrid_car_model_for_class.drop(['electric_total_power', 'electric_total_torque'], axis=1)

# 纯电车没有进气方式
Electric_Hybrid_car_model_for_class.loc[Electric_Hybrid_car_model_for_class['fuel_form'] == '纯电动', 'intake_type'] = '无'
# engine_volume_l 对电动车置为0
Electric_Hybrid_car_model_for_class.loc[Electric_Hybrid_car_model_for_class['fuel_form'] == '纯电动', 'engine_volume_l'] = 0
# cylinder_number 对电动车置为0
Electric_Hybrid_car_model_for_class.loc[Electric_Hybrid_car_model_for_class['fuel_form'] == '纯电动', 'cylinder_number'] = 0
Electric_Hybrid_car_model_for_class[['max_power', 'engine']] = split_electric_engine(Electric_Hybrid_car_model_for_class[['max_power', 'engine']])


-*- 7 -*-


In [15]:
print("-*- 8 -*-")
# 处理传统能源车+新能源车
Traditional_car_model_for_class = Traditional_car_model[['model_code',
                                                         'series_code',
                                                         'brand_code',
                                                         'model_name',
                                                         'brand_name',
                                                         'series_name',
                                                         'driving_mode',
                                                         'gear_box_type',
                                                         'year',
                                                         'country_id',
                                                         'import_type',
                                                         'manufacturer',
                                                         'series_level',
                                                         'wheel_base',
                                                         'length',
                                                         'height',
                                                         'width',
                                                         'max_torque',
                                                         'max_power',
                                                         'guide_price',
                                                         'engine',
                                                         'engine_volume_l',
                                                         'cylinder_number',
                                                         'intake_type',
                                                         'fuel_form',
                                                         'seat_number_top',
                                                         'quality_time',
                                                         'quality_mile',
                                                         'car_body',
                                                         'level',
                                                         'rate', 'rate_count'
                                                         ]]

Traditional_car_model_for_class = pd.concat([Electric_Hybrid_car_model_for_class, Traditional_car_model_for_class], sort=True, axis=0)
# 根据reference_feature（比如series_code）来fill缺失值,或统一数值
Traditional_car_model_for_class = replace_with_nan(Traditional_car_model_for_class)
Traditional_car_model_for_class['engine_volume_l'] = Traditional_car_model_for_class['engine_volume_l'].astype(float)
list_1 = ['max_power', 'engine', 'engine_volume_l', 'cylinder_number']
Traditional_car_model_for_class[list_1] = split_engine(Traditional_car_model_for_class[list_1])
Traditional_car_model_for_class = Traditional_car_model_for_class.drop('engine', axis=1)
Traditional_car_model_for_class.loc[Traditional_car_model_for_class['intake_type'] == '高压共轨', 'intake_type'] = '涡轮增压'
list_time = ['down_market_time', 'production_time', 'shutdown_time', 'time_to_market']
Traditional_car_model_for_class = pd.concat([Traditional_car_model_for_class, Traditional_car_model[list_time]], axis=1)
for column in list_time:
    Traditional_car_model_for_class[column] = pd.to_datetime(Traditional_car_model_for_class[column]).apply(lambda x: x.to_julian_date().astype(int) if pd.notnull(x) else 0)



-*- 8 -*-


In [16]:
Traditional_car_model_for_class.shape

(60741, 37)

In [17]:
print("-*- 9 -*-")

Traditional_car_model_for_class.loc[Traditional_car_model_for_class['series_name'] == '迈凯伦675LT', 'gear_box_type'] = '双离合变速箱(DCT)'
Traditional_car_model_for_class.loc[Traditional_car_model_for_class['series_name'] == '欧陆', 'driving_mode'] = '前置四驱'
Traditional_car_model_for_class.loc[Traditional_car_model_for_class['series_name'] == 'Mustang', 'driving_mode'] = '前置后驱'
Traditional_car_model_for_class.loc[(Traditional_car_model_for_class['series_name'] == '欧陆') & (Traditional_car_model_for_class['gear_box_type'] == '无级变速箱(CVT)'), 'gear_box_type'] = '自动变速箱(AT)'

Traditional_car_model_for_class['gear_box_type'] = Traditional_car_model_for_class['gear_box_type'].apply(impute_gearbox)
Traditional_car_model_for_class['cylinder_number'] = Traditional_car_model_for_class['cylinder_number'].apply(impute_cylinder_number)


-*- 9 -*-


In [18]:
Traditional_car_model_for_class.shape

(60741, 37)

In [19]:
print("-*- 10 -*-")
# 填补缺失值
column_list = ['manufacturer',
               'country_id',
               'driving_mode',
               'gear_box_type',
               'import_type',
               'series_level',
               'intake_type',
               'fuel_form',
               'cylinder_number',
               'down_market_time',
               'production_time',
               'shutdown_time',
               'time_to_market',
               'max_power',
               'max_torque',
               'wheel_base',
               'length',
               'height',
               'width',
               ]

for column in column_list:
    for dimension in [["series_name", "year"], ["series_name"], ["brand_name"]]:
        print(column,dimension)
        df = Traditional_car_model_for_class[dimension + [column]]
        df = df.groupby(dimension).apply(lambda data: parse_group(data, column))  # 分组
        Traditional_car_model_for_class[dimension + [column]] = df


-*- 10 -*-
manufacturer ['series_name', 'year']
manufacturer ['series_name']
manufacturer ['brand_name']
country_id ['series_name', 'year']
country_id ['series_name']
country_id ['brand_name']
driving_mode ['series_name', 'year']
driving_mode ['series_name']
driving_mode ['brand_name']
gear_box_type ['series_name', 'year']
gear_box_type ['series_name']
gear_box_type ['brand_name']
import_type ['series_name', 'year']
import_type ['series_name']
import_type ['brand_name']
series_level ['series_name', 'year']
series_level ['series_name']
series_level ['brand_name']
intake_type ['series_name', 'year']
intake_type ['series_name']
intake_type ['brand_name']
fuel_form ['series_name', 'year']
fuel_form ['series_name']
fuel_form ['brand_name']
cylinder_number ['series_name', 'year']
cylinder_number ['series_name']
cylinder_number ['brand_name']
down_market_time ['series_name', 'year']
down_market_time ['series_name']
down_market_time ['brand_name']
production_time ['series_name', 'year']
produc

In [25]:
print("-*- 11 -*-")
# 将枚举特别少的合并，统一变成其他
column_list = ['country_id',
               'driving_mode',
               'gear_box_type',
               'import_type',
               'series_level',
               'intake_type',
               'fuel_form']
for column in column_list:
    Traditional_car_model_for_class = label_regularize(Traditional_car_model_for_class, column).label_impute()


-*- 11 -*-


In [26]:
Traditional_car_model_for_class.shape

(60741, 37)

In [27]:
print("-*- 12 -*-")
# 处理保值率表
columns_tmp = {
    'first_year_keepvalue': "year_1",
    'second_year_keepvalue': "year_2",
    'third_year_keepvalue': "year_3",
    'fourth_year_keepvalue': "year_4",
    'fifth_year_keepvalue': "year_5",
    'sixth_year_keepvalue': "year_6",
    'seventh_year_keepvalue': "year_7",
    'eighth_year_keepvalue': "year_8",
    'ninth_year_keepvalue': "year_9",
    'tenth_year_keepvalue': "year_10",
    'eleventh_year_keepvalue': "year_11",
    'twelfth_year_keepvalue': "year_12",
    'thirteenth_year_keepvalue': "year_13",
    'fourteenth_year_keepvalue': "year_14",
    'fifteenth_year_keepvalue': "year_15",
    'sixteenth_year_keepvalue': "year_16"
}

Keep_Value = Keep_Value.rename(columns=columns_tmp)
Keep_Value = clean_dataset(Keep_Value)
Keep_Value = Keep_Value[['brand_code',
                         'series_code',
                         'year_1',
                         'year_2',
                         'year_3',
                         'year_4',
                         'year_5',
                         'year_6',
                         'year_7',
                         'year_8',
                         'year_9',
                         'year_10',
                         'year_11',
                         'year_12',
                         'year_13',
                         'year_14',
                         'year_15',
                         'year_16']]

Keep_Value = Keep_Value.apply(drop_per, axis=1)
Keep_Value[Keep_Value.columns[2:]] = Keep_Value[Keep_Value.columns[2:]].astype(float) / 100

# 填充保值率表
series_brand_list = Traditional_car_model_for_class[['series_code', 'brand_code']].drop_duplicates(subset='series_code', keep='last')

Keep_Value = series_brand_list.merge(Keep_Value.drop('brand_code', axis=1), on='series_code', how='left')


-*- 12 -*-


In [28]:
Keep_Value.shape

(2589, 18)

In [29]:
print("-*- 13 -*-")
# 缺失值使用品牌维度中位数填充
gp = Keep_Value.groupby(['brand_code'])[['year_1',
                                         'year_2',
                                         'year_3',
                                         'year_4',
                                         'year_5',
                                         'year_6',
                                         'year_7',
                                         'year_8',
                                         'year_9',
                                         'year_10',
                                         'year_11',
                                         'year_12',
                                         'year_13',
                                         'year_14',
                                         'year_15',
                                         'year_16']].median()

gp['brand_code'] = list(gp.index)
gp = gp.reset_index(drop=True)

Keep_Value_brand = series_brand_list.merge(gp, on='brand_code', how='left')
Keep_Value = Keep_Value.combine_first(Keep_Value_brand)

del Keep_Value_brand, series_brand_list, gp

# 用全局中位数填充剩下的缺失值
Keep_Value = Keep_Value.fillna(Keep_Value.median())
Keep_Value = Keep_Value.drop('brand_code', axis=1)
Keep_Value['series_code'] = Keep_Value['series_code'].map(lambda x:str(x))
Traditional_car_model_for_class = Traditional_car_model_for_class.merge(Keep_Value, on='series_code', how='left')

# 载入词库, 分词
#static_file_path = '{}{}brand_words.txt'.format(os.path.dirname(os.path.abspath(__file__)), os.path.sep)
static_file_path = 'brand_words.txt'
jieba.load_userdict(static_file_path)
jieba_cut(Traditional_car_model_for_class)


Building prefix dict from the default dictionary ...


-*- 13 -*-
[2021-02-02 15:27:52,827] {__init__:111} DEBUG - Building prefix dict from the default dictionary ...


Loading model from cache /tmp/jieba.cache


[2021-02-02 15:27:52,831] {__init__:131} DEBUG - Loading model from cache /tmp/jieba.cache


Loading model cost 0.863 seconds.


[2021-02-02 15:27:53,692] {__init__:163} DEBUG - Loading model cost 0.863 seconds.


Prefix dict has been built succesfully.


[2021-02-02 15:27:53,694] {__init__:164} DEBUG - Prefix dict has been built succesfully.


Unnamed: 0,battery_quality_mile,battery_quality_time,brand_code,brand_name,car_body,country_id,cylinder_number,driving_mode,engine_volume_l,fuel_form,...,year_7,year_8,year_9,year_10,year_11,year_12,year_13,year_14,year_15,year_16
0,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置四驱,2.0,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
1,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置后驱,2.0,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
2,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置后驱,1.6,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
3,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置后驱,2.0,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
4,0.0,0.0,brand-20,宝马,三厢车,GERMANY,6.0,前置四驱,3.0,汽油,...,0.3168,0.2821,0.2303,0.2036,0.1626,0.1442,0.1126,0.0989,0.0772,0.0679
5,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置后驱,1.6,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
6,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置后驱,2.0,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
7,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置后驱,2.0,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
8,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置四驱,2.0,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
9,0.0,0.0,brand-25,奔驰,三厢车,GERMANY,4.0,前置后驱,2.0,汽油,...,0.3928,0.3242,0.2934,0.2379,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805


In [30]:
columns= ['battery_quality_mile', 'battery_quality_time', 'brand_code',
       'brand_name', 'car_body', 'country_id', 'cylinder_number',
       'driving_mode', 'engine_volume_l', 'fuel_form', 'gear_box_type',
       'guide_price', 'height', 'import_type', 'intake_type', 'length',
       'level', 'manufacturer', 'max_power', 'max_torque', 'model_code',
       'model_name', 'quality_mile', 'quality_time', 'rate', 'rate_count',
       'seat_number_top', 'series_code', 'series_level', 'series_name',
       'wheel_base', 'width', 'year', 'down_market_time', 'production_time',
       'shutdown_time', 'time_to_market', 'year_1', 'year_2', 'year_3',
       'year_4', 'year_5', 'year_6', 'year_7', 'year_8', 'year_9', 'year_10',
       'year_11', 'year_12', 'year_13', 'year_14', 'year_15', 'year_16']
Traditional_car_model_for_class[['year_11', 'year_12', 'year_13', 'year_14', 'year_15', 'year_16']].head()


Unnamed: 0,year_11,year_12,year_13,year_14,year_15,year_16
0,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
1,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
2,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
3,0.2139,0.1681,0.1504,0.1174,0.1031,0.0805
4,0.1626,0.1442,0.1126,0.0989,0.0772,0.0679


In [31]:
car_data_columns= ["country_id","manufacturer","battery_quality_mile","battery_quality_time","brand_code","brand_name","car_body",
"cylinder_number","driving_mode","engine_volume_l","fuel_form","gear_box_type","guide_price","height","import_type",
"intake_type","length","level","max_power","max_torque","model_code","model_name","quality_mile","quality_time","rate",
"rate_count","seat_number_top","series_code","series_level","series_name","wheel_base","width","year","down_market_time",
"production_time","shutdown_time","time_to_market","year_1","year_2","year_3","year_4","year_5","year_6","year_7","year_8",
"year_9","year_10","year_11","year_12","year_13","year_14","year_15","year_16"]

Traditional_car_model_for_class.to_csv(curr_dir+'all_car_model_sep_word_to_sql',columns = car_data_columns,index=False,sep='\t',quotechar='#')


# outfile = open(curr_dir+'all_car_model_sep_word_to_sql','w',encoding='utf-8')
# outfile.write(Traditional_car_model_for_class.to_csv(columns = car_data_columns,header=False,index=False,sep='\t',quotechar='#'))
# outfile.close()
# logging.info('###################完成用户历史浏览数据输出')

# outputfile = curr_dir + 'all_car_model_sep_word_to_sql'
# mysql = MySqlHook("datacenter_base_service")
# mysql.bulk_load("all_car_model_sep_word_to_sql", outputfile,replace=True)


In [32]:
print("-*- 14 -*-")
# 计算tf-idf
model_code = Traditional_car_model_for_class['model_code']
model_name_df = Traditional_car_model_for_class['model_name']
model_name = model_name_df.str.split(' ').tolist()  # 车型数
original_word_list = list(model_name_df.str.split(' ', expand=True).stack())  # 原始词语总数量
word_list = list(set(original_word_list))  # 去重之后数量
# todo 耗时1分半
word_count_dict = {w: np.log10(len(model_name_df) / original_word_list.count(w)) for w in word_list}  # 计算每个词语的占比


-*- 14 -*-


In [33]:
print("-*- 15 -*-")
model_name_list = []
value = []  # 最终的元素列表
for doc in model_name:  # 车型词语列表
    model_name_list.append(json.dumps(doc, ensure_ascii=False))
    raw_value = 1.0 / float(len(doc))  # 每个车型的单词数量
    value.append(json.dumps([raw_value * word_count_dict[word] for word in doc], ensure_ascii=False))

result = pd.DataFrame({
    'model_code': model_code,
    'model_name': model_name_list,
    'value': value
})
# 存储数据
#result.to_sql(name=table_name_2, con=db.i_find_con, if_exists='replace', index=False)


-*- 15 -*-


In [35]:
result.shape

(60741, 3)

In [36]:
car_data_columns= ["model_code","model_name","value"]
result.to_csv(curr_dir+'tfidf_to_sql',columns = car_data_columns,header=False,index=False,sep='\t',quotechar='#')

# outfile = open(curr_dir+'tfidf_to_sql','w',encoding='utf-8')
# outfile.write(result.to_csv(columns = car_data_columns,header=False,index=False,sep='\t',quotechar='#'))
# outfile.close()
# logging.info('###################完成用户历史浏览数据输出')

# outputfile = curr_dir + 'tfidf_to_sql'
# mysql = MySqlHook("datacenter_base_service")
# mysql.bulk_load("tfidf_to_sql", curr_dir+'tfidf_to_sql',replace=True)


In [37]:
print("-*- 16 -*-")

# 存一张正常的表
for column in list_time:
    try:
        tmp_df = Traditional_car_model_for_class[Traditional_car_model_for_class[column] != 0][column]
        Traditional_car_model_for_class[column] = pd.to_datetime(tmp_df, errors='coerce', unit='D', origin='julian').dt.date
    except Exception as e:
        print(column)
        print(e)
Traditional_car_model_for_class = Traditional_car_model_for_class.drop('model_name', axis=1)
Traditional_car_model_for_class = Traditional_car_model_for_class.merge(all_model[['model_code', 'model_name']], on='model_code', how='left')
#Traditional_car_model_for_class.to_sql(name=table_name_3, con=db.i_find_con, if_exists='replace', index=False)
del all_model


-*- 16 -*-
down_market_time
incompatible 'arg' type for given 'origin'='julian'
production_time
incompatible 'arg' type for given 'origin'='julian'
shutdown_time
incompatible 'arg' type for given 'origin'='julian'
time_to_market
incompatible 'arg' type for given 'origin'='julian'


In [38]:
columns= ["country_id","manufacturer","battery_quality_mile","battery_quality_time","brand_code","brand_name",
"car_body","cylinder_number","driving_mode","engine_volume_l","fuel_form","gear_box_type","guide_price","height",
"import_type","intake_type","length","level","max_power","max_torque","model_code","quality_mile","quality_time",
"rate","rate_count","seat_number_top","series_code","series_level","series_name","wheel_base","width","year",
"down_market_time","production_time","shutdown_time","time_to_market","year_1","year_2","year_3","year_4","year_5",
"year_6","year_7","year_8","year_9","year_10","year_11","year_12","year_13","year_14","year_15","year_16","model_name"]
result.to_csv(curr_dir+'i_enterprise_cleaned_all_model_code_to_sql',columns = columns,header=False,index=False,sep='\t',quotechar='#')

# outfile = open(curr_dir+'i_enterprise_cleaned_all_model_code_to_sql','w',encoding='utf-8')
# outfile.write(result.to_csv(columns = columns,header=False,index=False,sep='\t',quotechar='#'))
# outfile.close()
# logging.info('###################完成用户历史浏览数据输出')

# outputfile = curr_dir + 'i_enterprise_cleaned_all_model_code_to_sql'
# mysql = MySqlHook("datacenter_base_service")
# mysql.bulk_load("i_enterprise_cleaned_all_model_code_to_sql", outputfile,replace=True)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
