### 贝叶斯平滑
代码参考：
---
> * 平滑的对象有各个广告的转化率
> * 组合特征的转化率

-----
经过贝叶斯平滑后的特征的转化率有：
> * item_id
> * item_category_list
> * item_property_list
> * item_brand_id
> * user_id
> * item_city_id
> * shop_id

2元组合特征的平滑特征有
----------------------
广告级别
> * item_id,user_id
> * item_id,item_category_list
> * item_id,item_property_list
> * item_id,item_brand_id
> * item_id,item_city_id
> * item_id,item_collected_level
> * item_id,item_pv_level
> * item_id,item_price_level (补)
> * item_id,user_gender_id
> * item_id,user_occupation_id
> * item_id,user_age_level
> * item_id,user_star_level
> * item_id,shop_id
> * item_id,shop_star_level
> * item_id,category_0
> * item_id,category_1
> * item_id,category_2
> * item_id,hour
> * item_id,shop_review_num_level

----------------------
用户级
> * user_gender_id,user_age_level
> * user_gender_id,category_0
> * user_gender_id,category_1
> * user_gender_id,item_category_list
> * user_gender_id,item_property_list
> * user_age_level,category_0
> * user_age_level,category_1
> * user_age_level,item_category_list
> * user_age_level,item_property_list
> * user_occupation_id,category_0
> * user_occupation_id,category_1
> * user_occupation_id,item_category_list
> * user_occupation_id,item_property_list
> * user_gender_id,hour
> * user_age_level,hour
> * user_occupation_id,hour
> * user_gender_id,shop_id
> * user_age_level,shop_id
> * user_occupation_id,shop_id
---
> * user_age_level,item_price_level
> * user_age_level,item_sales_level
> * user_age_level,item_city_id
> * user_age_level,item_brand_id
> * user_age_level,item_collected_level
> * user_age_level,item_pv_level

> * user_gender_id,item_price_level
> * user_gender_id,item_sales_level
> * user_gender_id,item_city_id
> * user_gender_id,item_brand_id
> * user_gender_id,item_collected_level
> * user_gender_id,item_pv_level

> * user_occupation_id,item_price_level
> * user_occupation_id,item_sales_level
> * user_occupation_id,item_city_id
> * user_occupation_id,item_brand_id
> * user_occupation_id,item_collected_level
> * user_occupation_id,item_pv_level
----
shop_id
----
> * shop_id,user_age
> * shop_id,user_age_level
> * shop_id,user_occupation_id
> * shop_id,item_category_list
> * shop_id,item_property_list
> * shop_id,category_0
> * shop_id,category_1

> * shop_id,item_price_level
> * shop_id,item_sales_level
> * shop_id,item_city_id
> * shop_id,item_brand_id
> * shop_id,item_collected_level
> * shop_id,item_pv_level
---
> * shop_star_level,item_price_level
> * shop_star_level,item_sales_level
> * shop_star_level,item_city_id
> * shop_star_level,item_brand_id
> * shop_star_level,item_collected_level
> * shop_star_level,item_pv_level

----
> * shop_review_num_level,item_price_level
> * shop_review_num_level,item_sales_level
> * shop_review_num_level,item_city_id
> * shop_review_num_level,item_brand_id
> * shop_review_num_level,item_collected_level
> * shop_review_num_level,item_pv_level
----
> * shop_star_level,user_age_level
> * shop_star_level,user_gender_id
> * shop_star_level,user_occupation_id

> * shop_review_num_level,user_age_level
> * shop_review_num_level,user_gender_id
> * shop_review_num_level,user_occupation_id


In [23]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency #方差检验
import seaborn as sns
import copy 
import Ipynb_importer
from smooth import BayesianSmoothing,HyperParam
from file_path import create_path
import pickle
%matplotlib inline
plt.style.use("ggplot")

In [24]:
def gen_day_conversion_cnt():
    '''
    Build cumulative click / conversion counts per day for each single feature.

    For every feature listed below and every ``day`` in 17..24, counts the
    rows (clicks) and sums ``is_trade`` (conversions) over all training rows
    whose ``day`` is strictly earlier than that day, so a day's statistics
    never include the day itself. Feature values with no earlier rows get 0
    for both counts. One table per feature is written to
    ``../数据/temp_path/_4_<feature>_trade_cnt.pickle``; a feature whose table
    already exists is skipped.
    '''
    # The frame is only read (filtered / grouped), never modified, so no
    # defensive copy of the training set is needed.
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    for feat_1 in ['item_id','item_category_list','item_property_list','item_brand_id'
                  ,"item_city_id","user_id","shop_id"]:
        # The existence check must look at the very path that is written
        # below; checking a differently named file means the cache never hits
        # and every run recomputes all tables.
        save_path = "../数据/temp_path/_4_{0}_trade_cnt.pickle".format(feat_1)
        if os.path.exists(save_path):
            print("{0} 已经存在".format(save_path))
            continue

        click_col = "{0}_click_cnt".format(feat_1)
        trade_col = "{0}_trade_cnt".format(feat_1)
        # Distinct feature values; the per-day counts are left-joined onto these.
        ids = train[[feat_1]].drop_duplicates()
        daily_tables = []
        for day in range(17,25):
            history = train[train["day"]<day]  # all rows strictly before this day

            # Number of rows per feature value (total clicks).
            clicks = history.groupby(feat_1,as_index=False)["is_trade"].count()
            clicks = clicks.rename(columns={"is_trade":click_col})
            temp = pd.merge(ids,clicks,how="left",on=feat_1)

            # Sum of is_trade per feature value (conversions).
            trades = history.groupby(feat_1,as_index=False)["is_trade"].sum()
            trades = trades.rename(columns={"is_trade":trade_col})
            temp = pd.merge(temp,trades,how="left",on=feat_1)

            temp = temp.fillna(0)  # values unseen before this day get zero counts
            temp["day"] = day
            daily_tables.append(temp)

        # Concatenate once instead of re-copying a growing frame every day.
        res = pd.concat(daily_tables,ignore_index=True,axis=0)
        res.to_pickle(save_path)
        print("保存到 {0}".format(save_path))

In [25]:
def gen_combine_feature_conversion_cnt():
    '''
    Build cumulative click / conversion counts per day for feature pairs.

    For every (feat_1, feat_2) pair listed below and every ``day`` in 17..24,
    counts the rows (clicks) and sums ``is_trade`` (conversions) over all
    training rows whose ``day`` is strictly earlier than that day, grouped by
    the pair. Pairs with no earlier rows get 0 for both counts. One table per
    pair is written to
    ``../数据/temp_path/_4_<feat_1>_and_<feat_2>_trade_cnt.pickle``; a pair
    whose table already exists is skipped.
    '''
    feature_pairs = [
        # item-level pairs
        ("item_id","user_id"),
        ("item_id","item_category_list"),
        ("item_id","item_property_list"),
        ("item_id","item_brand_id"),
        ("item_id","item_city_id"),
        ("item_id","item_collected_level"),
        ("item_id","item_pv_level"),
        ("item_id","user_gender_id"),
        ("item_id","user_occupation_id"),
        ("item_id","user_age_level"),
        ("item_id","user_star_level"),
        ("item_id","shop_id"),
        ("item_id","shop_star_level"),
        ("item_id","category_0"),
        ("item_id","category_1"),
        ("item_id","hour"),
        ("item_id","shop_review_num_level"),

        # user-level pairs
        ("user_gender_id","user_age_level"),
        ("user_gender_id","category_0"),
        ("user_gender_id","category_1"),
        ("user_gender_id","item_category_list"),
        ("user_gender_id","item_property_list"),
        ("user_age_level","category_0"),
        ("user_age_level","category_1"),
        ("user_age_level","item_category_list"),
        ("user_age_level","item_property_list"),
        ("user_occupation_id","category_0"),
        ("user_occupation_id","category_1"),
        ("user_occupation_id","item_category_list"),
        ("user_occupation_id","item_property_list"),
        ("user_gender_id","hour"),
        ("user_age_level","hour"),
        ("user_occupation_id","hour"),
        ("user_gender_id","shop_id"),
        ("user_age_level","shop_id"),
        ("user_occupation_id","shop_id"),

        # user attribute x item attribute
        ("user_age_level","item_price_level"),
        ("user_age_level","item_sales_level"),
        ("user_age_level","item_city_id"),
        ("user_age_level","item_brand_id"),
        ("user_age_level","item_collected_level"),
        ("user_age_level","item_pv_level"),
        ("user_gender_id","item_price_level"),
        ("user_gender_id","item_sales_level"),
        ("user_gender_id","item_city_id"),
        ("user_gender_id","item_brand_id"),
        ("user_gender_id","item_collected_level"),
        ("user_gender_id","item_pv_level"),
        ("user_occupation_id","item_price_level"),
        ("user_occupation_id","item_sales_level"),
        ("user_occupation_id","item_city_id"),
        ("user_occupation_id","item_brand_id"),
        ("user_occupation_id","item_collected_level"),
        ("user_occupation_id","item_pv_level"),

        # shop-level pairs
        ("shop_id","user_age_level"),
        ("shop_id","user_occupation_id"),
        ("shop_id","item_category_list"),
        ("shop_id","item_property_list"),
        ("shop_id","category_0"),
        ("shop_id","category_1"),

        ("shop_id","item_price_level"),
        ("shop_id","item_sales_level"),
        ("shop_id","item_city_id"),
        ("shop_id","item_brand_id"),
        ("shop_id","item_collected_level"),
        ("shop_id","item_pv_level"),
        ("shop_star_level","item_price_level"),
        ("shop_star_level","item_sales_level"),
        ("shop_star_level","item_city_id"),
        ("shop_star_level","item_brand_id"),
        ("shop_star_level","item_collected_level"),
        ("shop_star_level","item_pv_level"),
        ("shop_review_num_level","item_price_level"),
        ("shop_review_num_level","item_sales_level"),
        ("shop_review_num_level","item_city_id"),
        ("shop_review_num_level","item_brand_id"),
        ("shop_review_num_level","item_collected_level"),
        ("shop_review_num_level","item_pv_level"),
        ("shop_star_level","user_age_level"),
        ("shop_star_level","user_gender_id"),
        ("shop_star_level","user_occupation_id"),
        ("shop_review_num_level","user_age_level"),
        ("shop_review_num_level","user_gender_id"),
        ("shop_review_num_level","user_occupation_id"),
    ]

    # The frame is only read (filtered / grouped), never modified, so no
    # defensive deep copy of the whole training set is needed.
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    for feat_1,feat_2 in feature_pairs:
        save_path = "../数据/temp_path/_4_{0}_and_{1}_trade_cnt.pickle".format(feat_1,feat_2)
        if os.path.exists(save_path):
            print("{0} 已存在!!!!".format(save_path))
            continue

        keys = [feat_1,feat_2]
        click_col = "{0}_{1}_click_cnt".format(feat_1,feat_2)
        trade_col = "{0}_{1}_trade_cnt".format(feat_1,feat_2)
        # Distinct value pairs; the per-day counts are left-joined onto these.
        ids = train[keys].drop_duplicates()
        daily_tables = []
        for day in range(17,25):
            history = train[train["day"]<day]  # all rows strictly before this day

            # Number of rows per value pair (total clicks).
            clicks = history.groupby(keys,as_index=False)["is_trade"].count()
            clicks = clicks.rename(columns={"is_trade":click_col})
            temp = pd.merge(ids,clicks,how="left",on=keys)

            # Sum of is_trade per value pair (conversions).
            trades = history.groupby(keys,as_index=False)["is_trade"].sum()
            trades = trades.rename(columns={"is_trade":trade_col})
            temp = pd.merge(temp,trades,how="left",on=keys)

            temp = temp.fillna(0)  # pairs unseen before this day get zero counts
            temp["day"] = day
            daily_tables.append(temp)

        # Concatenate once instead of re-copying a growing frame every day.
        res = pd.concat(daily_tables,ignore_index=True,axis=0)
        res.to_pickle(save_path)
        print("保存到 {0}".format(save_path))

In [26]:
def _smooth_one_table(readPath, savePath, prefix, label, windowDay):
    '''
    Smooth one count table and save it; return the prior mean alpha/(alpha+beta).

    Reads the count table at ``readPath``, keeps rows whose ``day`` lies in
    the inclusive window ``windowDay[0]..windowDay[1]``, estimates
    ``alpha`` / ``beta`` with ``HyperParam.update_from_data_by_moment`` from
    the ``<prefix>_click_cnt`` and ``<prefix>_trade_cnt`` columns, adds the
    column ``<prefix>_smooth = (trade + alpha) / (click + alpha + beta)`` and
    pickles the result to ``savePath``. ``label`` is only used in the progress
    message.
    '''
    temp = pd.read_pickle(readPath)
    in_window = (temp["day"]>=windowDay[0])&(temp["day"]<=windowDay[1])
    # Copy the filtered slice so the column assignment below writes to an
    # independent frame rather than to a slice of the loaded table.
    temp = temp[in_window].copy()
    print(temp.shape)

    # HyperParam comes from the project's `smooth` module; after the update
    # call it exposes the fitted `alpha` and `beta` used below.
    HP = HyperParam(1, 1)
    HP.update_from_data_by_moment(temp[prefix + '_click_cnt'].values, temp[prefix + '_trade_cnt'].values)
    temp[prefix + '_smooth'] = (temp[prefix + '_trade_cnt'] + HP.alpha) / (temp[prefix + '_click_cnt'] + HP.alpha + HP.beta)
    temp.to_pickle(savePath)

    print("***************************")
    print("{0}对应的alpha为{1}，beta为{2}".format(label,HP.alpha,HP.beta))
    print("****************************")
    print("保存{0}".format(savePath))
    return HP.alpha/(HP.alpha+HP.beta)


def Bayes_smooth_pcvr(windowDay=[22,24]):
    '''
    Add Bayesian-smoothed conversion rates to the single and pairwise count tables.

    For each single feature and each feature pair, loads the count table from
    ``../数据/temp_path``, restricts it to days ``windowDay[0]..windowDay[1]``
    (inclusive), computes the smoothed rate and stores the table under
    ``../数据/smooth_cvr``. Tables that already exist for this window are
    skipped. The prior mean ``alpha / (alpha + beta)`` of every table computed
    in this run is recorded under the key ``<prefix>_smooth`` in
    ``../数据/smooth_cvr/_4_smooth_cvr_log.pickle`` so it can later be used to
    fill missing values.

    windowDay: two-element sequence ``[first_day, last_day]``. The default
        list is only read, never mutated.
    '''
    logPath = "../数据/smooth_cvr/_4_smooth_cvr_log.pickle"
    # Start from the previously saved log: entries are only produced for
    # tables computed in this run, so starting from an empty dict would wipe
    # the stored prior means of every table that is skipped as already
    # existing. Keys do not encode the window, so a kept entry may stem from
    # an earlier run with a different windowDay.
    # NOTE: pickle.load is only safe because this file is written by this
    # pipeline itself; never point it at untrusted data.
    smooth_cvr_log = {}
    if os.path.exists(logPath):
        with open(logPath,"rb") as f0:
            smooth_cvr_log = pickle.load(f0)

    # Single features
    for feature in ['item_id','item_category_list','item_property_list','item_brand_id'
                  ,"item_city_id","user_id","shop_id"]:
        savePath = "../数据/smooth_cvr/_4_smooth_cvr_{0}_{1}_{2}.pickle".format(feature,windowDay[0],windowDay[1])
        if os.path.exists(savePath):
            print(savePath,"已经存在！！！")
            continue
        readPath = "../数据/temp_path/_4_{0}_trade_cnt.pickle".format(feature)
        smooth_cvr_log[feature + '_smooth'] = _smooth_one_table(
            readPath, savePath, feature, feature, windowDay)

    print("---------单变量的贝叶斯平滑结束，开始多变量的贝叶斯平滑-----------------")
    # Feature pairs
    for feat_1,feat_2 in[("item_id","user_id"),
                         ("item_id","item_category_list"),
                         ("item_id","item_property_list"),
                         ("item_id","item_brand_id"),
                         ("item_id","item_city_id"),
                         ("item_id","item_collected_level"),
                         ("item_id","item_pv_level"),
                         ("item_id","user_gender_id"),
                         ("item_id","user_occupation_id"),
                         ("item_id","user_age_level"),
                         ("item_id","user_star_level"),
                         ("item_id","shop_id"),
                         ("item_id","shop_star_level"),
                         ("item_id","category_0"),
                         ("item_id","category_1"),
                         ("item_id","hour"),
                         ("item_id","shop_review_num_level"),

                        ("user_gender_id","user_age_level"),
                        ("user_gender_id","category_0"),
                        ("user_gender_id","category_1"),
                        ("user_gender_id","item_category_list"),
                        ("user_gender_id","item_property_list"),
                        ("user_age_level","category_0"),
                        ("user_age_level","category_1"),
                        ("user_age_level","item_category_list"),
                        ("user_age_level","item_property_list"),
                        ("user_occupation_id","category_0"),
                        ("user_occupation_id","category_1"),
                        ("user_occupation_id","item_category_list"),
                        ("user_occupation_id","item_property_list"),
                        ("user_gender_id","hour"),
                        ("user_age_level","hour"),
                        ("user_occupation_id","hour"),
                        ("user_gender_id","shop_id"),
                        ("user_age_level","shop_id"),
                        ("user_occupation_id","shop_id"),

                        ("user_age_level","item_price_level"),
                        ("user_age_level","item_sales_level"),
                        ("user_age_level","item_city_id"),
                        ("user_age_level","item_brand_id"),
                        ("user_age_level","item_collected_level"),
                        ("user_age_level","item_pv_level"),
                        ("user_gender_id","item_price_level"),
                        ("user_gender_id","item_sales_level"),
                        ("user_gender_id","item_city_id"),
                        ("user_gender_id","item_brand_id"),
                        ("user_gender_id","item_collected_level"),
                        ("user_gender_id","item_pv_level"),
                        ("user_occupation_id","item_price_level"),
                        ("user_occupation_id","item_sales_level"),
                        ("user_occupation_id","item_city_id"),
                        ("user_occupation_id","item_brand_id"),
                        ("user_occupation_id","item_collected_level"),
                        ("user_occupation_id","item_pv_level"),

                        ("shop_id","user_age_level"),
                        ("shop_id","user_occupation_id"),
                        ("shop_id","item_category_list"),
                        ("shop_id","item_property_list"),
                        ("shop_id","category_0"),
                        ("shop_id","category_1"),

                        ("shop_id","item_price_level"),
                        ("shop_id","item_sales_level"),
                        ("shop_id","item_city_id"),
                        ("shop_id","item_brand_id"),
                        ("shop_id","item_collected_level"),
                        ("shop_id","item_pv_level"),
                        ("shop_star_level","item_price_level"),
                        ("shop_star_level","item_sales_level"),
                        ("shop_star_level","item_city_id"),
                        ("shop_star_level","item_brand_id"),
                        ("shop_star_level","item_collected_level"),
                        ("shop_star_level","item_pv_level"),
                        ("shop_review_num_level","item_price_level"),
                        ("shop_review_num_level","item_sales_level"),
                        ("shop_review_num_level","item_city_id"),
                        ("shop_review_num_level","item_brand_id"),
                        ("shop_review_num_level","item_collected_level"),
                        ("shop_review_num_level","item_pv_level"),
                        ("shop_star_level","user_age_level"),
                        ("shop_star_level","user_gender_id"),
                        ("shop_star_level","user_occupation_id"),
                        ("shop_review_num_level","user_age_level"),
                        ("shop_review_num_level","user_gender_id"),
                        ("shop_review_num_level","user_occupation_id")]:
        savePath = "../数据/smooth_cvr/_4_smooth_cvr_{0}_and_{1}_{2}_{3}.pickle".format(feat_1,feat_2,windowDay[0],windowDay[1])
        if os.path.exists(savePath):
            print(savePath,"已经存在！！！")
            continue
        readPath = "../数据/temp_path/_4_{0}_and_{1}_trade_cnt.pickle".format(feat_1,feat_2)
        prefix = feat_1+"_"+feat_2
        # Record the prior mean of the pair as its fallback value.
        smooth_cvr_log[prefix + '_smooth'] = _smooth_one_table(
            readPath, savePath, prefix, "{0}和{1}组合特征".format(feat_1,feat_2), windowDay)

    # Persist each feature's prior mean; it is used later to fill missing values.
    print("保存每个特征的经验初始值，用于之后的填充空值")
    with open(logPath,"wb") as f1:
        pickle.dump(smooth_cvr_log,f1)

In [27]:
%%time
if __name__ == "__main__":
    gen_day_conversion_cnt() # count per-day clicks / conversions for each single feature
    gen_combine_feature_conversion_cnt() # count per-day clicks / conversions for each feature pair
    Bayes_smooth_pcvr(windowDay=[21,24]) # Bayesian smoothing over days 21..24; reads the tables written by the two calls above

保存到 ../数据/temp_path/_4_item_id_trade_cnt.pickle
保存到 ../数据/temp_path/_4_item_category_list_trade_cnt.pickle
保存到 ../数据/temp_path/_4_item_property_list_trade_cnt.pickle
保存到 ../数据/temp_path/_4_item_brand_id_trade_cnt.pickle
保存到 ../数据/temp_path/_4_item_city_id_trade_cnt.pickle
保存到 ../数据/temp_path/_4_user_id_trade_cnt.pickle
保存到 ../数据/temp_path/_4_shop_id_trade_cnt.pickle
../数据/temp_path/_4_item_id_and_user_id_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_item_category_list_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_item_property_list_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_item_brand_id_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_item_city_id_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_item_collected_level_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_item_pv_level_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_user_gender_id_trade_cnt.pickle 已存在!!!!
../数据/temp_path/_4_item_id_and_user_occupation_id_trade_cnt.