# 组合所有的特征

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency #方差检验
import seaborn as sns
import copy 
import Ipynb_importer
from smooth import BayesianSmoothing,HyperParam
from file_path import create_path
%matplotlib inline
plt.style.use("ggplot")

importing Jupyter notebook from smooth.ipynb
importing Jupyter notebook from file_path.ipynb


In [2]:
def merge_click_data(timeWindow = [21,24]):
    '''
    Assemble the click-behaviour feature tables for the train and test sets.

    Rows whose "day" lies in [timeWindow[0], timeWindow[1]] (both ends
    inclusive) are kept, trimmed to the id / ad / context columns, and then
    left-joined with the click statistics cached under ../数据/temp_path.
    Remaining NaNs are replaced by 0 and both frames are pickled under
    ../数据/merge_data. When both output files already exist the function
    only reports that and returns.

    Parameters
    ----------
    timeWindow : sequence of two day numbers, [first_day, last_day].

    Returns
    -------
    None. The results are written to disk.
    '''
    day_from, day_to = timeWindow[0], timeWindow[1]
    save_path_train = "../数据/merge_data/_5_merge_data_train_basic_click_{0}_{1}.pickle".format(day_from,day_to)
    save_path_test = "../数据/merge_data/_5_merge_data_test_basic_click_{0}_{1}.pickle".format(day_from,day_to)

    # Guard clause: nothing to do when both cached outputs are present.
    if os.path.exists(save_path_train) and os.path.exists(save_path_test):
        print("{0} already exists".format(save_path_train))
        print("{0} already exists".format(save_path_test))
        return

    def attach(frame, extra, keys, dedupe=True):
        # Left join keeps every row of `frame`; optionally drop exact duplicate rows.
        joined = pd.merge(frame, extra, how="left", on=keys)
        if dedupe:
            joined = joined.drop_duplicates()
        return joined

    # Columns carried through the joins.
    basic_feature = ["instance_id","user_id","context_timestamp","date","hour"]
    ads_feature = ['item_id','item_category_list','item_property_list'
                   ,'item_brand_id','item_city_id',"category_0","category_1","category_2"]
    context_feature = ['shop_id','context_page_id'] # context columns
    stats_feature = ads_feature + context_feature
    keep_cols = basic_feature + ads_feature + context_feature

    # ----- restrict both sets to the requested days, then to the needed columns -----
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle") # training set
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")  # test set
    train = train[train["day"].between(day_from, day_to)][keep_cols]
    test = test[test["day"].between(day_from, day_to)][keep_cols]

    # Basic per-user click counts (this join is not followed by a de-duplication).
    click_stats = pd.read_pickle("../数据/temp_path/_2_get_click_stats.pickle")
    train = attach(train, click_stats, "user_id", dedupe=False)
    test = attach(test, click_stats, "user_id", dedupe=False)
    del click_stats

    # Gap between each user's first and last click.
    time_dist = pd.read_pickle("../数据/temp_path/_2_first_time_to_last_margin.pickle")
    first_last = time_dist[["user_id","first_2_last_dist"]]
    train = attach(train, first_last, ["user_id"])
    test = attach(test, first_last, ["user_id"])
    del first_last

    # Gap from the current click to the user's first and last click.
    now_gaps = time_dist[["user_id","context_timestamp","now_2_first_dist","now_2_last_dist"]]
    train = attach(train, now_gaps, ["user_id","context_timestamp"])
    test = attach(test, now_gaps, ["user_id","context_timestamp"])
    del now_gaps, time_dist

    # Per-user tables keyed on user_id only: the gap between the last and the
    # second-to-last click, then the sliding-window statistics.
    for per_user_path in ("../数据/temp_path/_2_last_1_2_dist.pickle",
                          "../数据/temp_path/_2_slide_windows_stats.pickle"):
        per_user = pd.read_pickle(per_user_path)
        train = attach(train, per_user, ["user_id"])
        test = attach(test, per_user, ["user_id"])
        del per_user

    # Click counts per (user, date[, hour], feature value).
    count_tables = [
        ("以天为时间粒度的用户点击", "../数据/temp_path/_2_user_date_click_{0}", ["user_id","date"]),
        ("以小时为为时间粒度的用户点击", "../数据/temp_path/_2_user_date_hour_click_{0}", ["user_id","date","hour"]),
    ]
    for banner, template, base_keys in count_tables:
        print(banner)
        for feature in stats_feature:
            print(feature)
            counts = pd.read_pickle(template.format(feature)) # load the cached table
            join_keys = base_keys + [feature]
            train = attach(train, counts, join_keys)
            test = attach(test, counts, join_keys)
            del counts

    # Per-user summary statistics of the daily and hourly click counts.
    print("以天和小时为时间粒度的用户点击行为的统计")
    for template in ("../数据/temp_path/_2_user_date_click_stats_{0}.pickle",
                     "../数据/temp_path/_2_user_date_hour_click_stats_{0}.pickle"):
        for feature in stats_feature:
            print(feature)
            summary = pd.read_pickle(template.format(feature))
            train = attach(train, summary, ["user_id"])
            test = attach(test, summary, ["user_id"])
            del summary

    print("save the train and test files !!!")
    train = train.fillna(0)
    test = test.fillna(0)
    train.to_pickle(save_path_train)
    test.to_pickle(save_path_test)
    print("save the files on {0}".format(save_path_train))
    print("save the files on {0}".format(save_path_test))
    

In [3]:
def merge_ID_global_sum(timeWindow=[21,24]):
    '''
    Attach the global per-ID count features to the train and test rows.

    For every column in ``feature_ls`` the cached table
    ../数据/temp_path/_3_ID_global_sum_cnt_<column>.pickle is left-joined on
    that column. The raw ID columns are then dropped, so the saved frames hold
    ``instance_id`` plus whatever columns the count tables contributed.
    Nothing is recomputed when both output files already exist.

    Parameters
    ----------
    timeWindow : sequence of two day numbers, [first_day, last_day]; rows
        whose "day" falls outside this inclusive range are discarded.

    Returns
    -------
    None. The results are pickled under ../数据/merge_data.
    '''
    feature_ls = ["item_id","item_category_list","category_0","category_1"
                  ,"category_2","item_property_list","item_brand_id"
                   ,"item_city_id","user_id","user_gender_id"
                   ,"user_occupation_id","shop_id"]
    save_path_train = "../数据/merge_data/_5_merge_data_train_ID_global_sum_{0}_{1}.pickle".format(timeWindow[0],timeWindow[1])
    save_path_test = "../数据/merge_data/_5_merge_data_test_ID_global_sum_{0}_{1}.pickle".format(timeWindow[0],timeWindow[1])
    if os.path.exists(save_path_train) and os.path.exists(save_path_test):
        print("{0} 已经存在了".format(save_path_train))
        # Report the test file here; the train path used to be printed twice.
        print("{0} 已经存在了".format(save_path_test))
    else:
        train = pd.read_pickle("../数据/preprocess_path/train_1.pickle") # training set
        test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")  # test set
        # Keep only the rows inside the requested day window.
        train = train[(train["day"]>=timeWindow[0])&(train["day"]<=timeWindow[1])]
        test = test[(test["day"]>=timeWindow[0])&(test["day"]<=timeWindow[1])]
        cols = feature_ls + ["instance_id"]
        train = train[cols]
        test = test[cols]

        # Join the count table of each ID column onto both sets.
        for i in feature_ls:
            path = "../数据/temp_path/_3_ID_global_sum_cnt_{0}.pickle".format(i)
            temp = pd.read_pickle(path)
            train = pd.merge(train,temp,how="left",on = [i]).drop_duplicates()
            test = pd.merge(test,temp,how="left",on = [i]).drop_duplicates()
            del temp
        # The raw IDs were only needed as join keys.
        train = train.drop(feature_ls,axis=1)
        test = test.drop(feature_ls,axis=1)

        train.to_pickle(save_path_train) # save
        test.to_pickle(save_path_test) # save
        print("保存在{0}!!".format(save_path_train))
        print("保存在{0}！！".format(save_path_test))
        del train
        del test

In [55]:
# %%time
# merge_ID_global_sum()

保存在../数据/merge_data/_5_merge_data_train_ID_global_sum_21_24.pickle!!
保存在../数据/merge_data/_5_merge_data_test_ID_global_sum_21_24.pickle！！
CPU times: user 8.17 s, sys: 404 ms, total: 8.58 s
Wall time: 11.1 s


In [56]:
# %%time
# merge_click_data()

以天为时间粒度的用户点击
item_id
item_category_list
item_property_list
item_brand_id
item_city_id
category_0
category_1
category_2
shop_id
context_page_id
以小时为为时间粒度的用户点击
item_id
item_category_list
item_property_list
item_brand_id
item_city_id
category_0
category_1
category_2
shop_id
context_page_id
以天和小时为时间粒度的用户点击行为的统计
item_id
item_category_list
item_property_list
item_brand_id
item_city_id
category_0
category_1
category_2
shop_id
context_page_id
item_id
item_category_list
item_property_list
item_brand_id
item_city_id
category_0
category_1
category_2
shop_id
context_page_id
save the train and test files !!!
save the files on ../数据/merge_data/_5_merge_data_train_basic_click_21_24.pickle
save the files on ../数据/merge_data/_5_merge_data_test_basic_click_21_24.pickle
CPU times: user 1min 11s, sys: 15.3 s, total: 1min 26s
Wall time: 1min 35s


In [4]:
def merge_smooth_cvr(timeWindow=[21,24]):
    '''
    Merge the smoothed conversion-rate (CVR) features onto train and test.

    For each single ID column in ``feature_1`` and each column pair in
    ``combine_feature`` the matching table under ../数据/smooth_cvr is
    left-joined on the key column(s) plus "day". Missing rates are filled with
    the per-feature value stored in _4_smooth_cvr_log.pickle (the source
    describes it as the prior alpha / (alpha + beta)). Every per-feature
    result is written to an intermediate pickle first (memory is limited) and
    the intermediates are finally joined on ``instance_id``.

    The work is skipped only when BOTH final outputs exist, so a missing test
    file is regenerated, and the prior log is read only when work is needed.

    Parameters
    ----------
    timeWindow : sequence of two day numbers, [first_day, last_day]
        (inclusive); also part of the input and output file names.

    Returns
    -------
    None. The results are pickled under ../数据/merge_data.
    '''
    save_path_train = "../数据/merge_data/_5_merge_data_train_smooth_cvr_{0}_{1}.pickle".format(timeWindow[0],timeWindow[1])
    save_path_test = "../数据/merge_data/_5_merge_data_test_smooth_cvr_{0}_{1}.pickle".format(timeWindow[0],timeWindow[1])
    
    # Both outputs are written together, so both must exist for a cache hit.
    if os.path.exists(save_path_train) and os.path.exists(save_path_test):
        print("{0} already exists".format(save_path_train))
        print("{0} already exists".format(save_path_test))
    else:
        # Per-feature fill value for missing rates (prior alpha / (alpha + beta)).
        smooth_cvr_log = pd.read_pickle("../数据/smooth_cvr/_4_smooth_cvr_log.pickle")

        #----------------- select the train / test rows in the window -----------------
        train = pd.read_pickle("../数据/preprocess_path/train_1.pickle") # training set
        test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")  # test set
        train = train[(train["day"]>=timeWindow[0])&(train["day"]<=timeWindow[1])]
        test = test[(test["day"]>=timeWindow[0])&(test["day"]<=timeWindow[1])]
        
        #--------- single-ID smoothed CVR features ---------
        feature_1 = ['item_id','item_category_list','item_property_list','item_brand_id'
                  ,"item_city_id","user_id","shop_id"]
        # feature_2 / feature_3 only widen the column selection so that every
        # column used as a join key below is present.
        feature_2 = ['item_property_list',
                     'item_pv_level',
                     'user_star_level',
                     'shop_star_level',
                     'item_price_level',
                     'item_brand_id',
                     'item_sales_level',
                     'shop_id',
                     'category_0',
                     'user_id',
                     'user_age_level',
                     'item_city_id',
                     'category_1',
                     'item_category_list',
                     'user_occupation_id',
                     'item_id',
                     'user_gender_id',
                     'shop_review_num_level',
                     'hour',
                     'item_collected_level']
        
        feature_3 = ["day"]
        combine_feature = [("item_id","user_id"),
                         ("item_id","item_category_list"),
                         ("item_id","item_property_list"),
                         ("item_id","item_brand_id"),
                         ("item_id","item_city_id"),
                         ("item_id","item_collected_level"),
                         ("item_id","item_pv_level"),
                         ("item_id","user_gender_id"),
                         ("item_id","user_occupation_id"),
                         ("item_id","user_age_level"),
                         ("item_id","user_star_level"),
                         ("item_id","shop_id"),
                         ("item_id","shop_star_level"),
                         ("item_id","category_0"),
                         ("item_id","category_1"),
                         ("item_id","hour"),
                         ("item_id","shop_review_num_level"),
                         
                        ("user_gender_id","user_age_level"),
                        ("user_gender_id","category_0"),
                        ("user_gender_id","category_1"),
                        ("user_gender_id","item_category_list"),
                        ("user_gender_id","item_property_list"),
                        ("user_age_level","category_0"),
                        ("user_age_level","category_1"),
                        ("user_age_level","item_category_list"),
                        ("user_age_level","item_property_list"),
                        ("user_occupation_id","category_0"),
                        ("user_occupation_id","category_1"),
                        ("user_occupation_id","item_category_list"),
                        ("user_occupation_id","item_property_list"),
                        ("user_gender_id","hour"),
                        ("user_age_level","hour"),
                        ("user_occupation_id","hour"),
                        ("user_gender_id","shop_id"),
                        ("user_age_level","shop_id"),
                        ("user_occupation_id","shop_id"),
                                                 
                        ("user_age_level","item_price_level"),
                        ("user_age_level","item_sales_level"),
                        ("user_age_level","item_city_id"),
                        ("user_age_level","item_brand_id"),
                        ("user_age_level","item_collected_level"),
                        ("user_age_level","item_pv_level"),
                        ("user_gender_id","item_price_level"),
                        ("user_gender_id","item_sales_level"),
                        ("user_gender_id","item_city_id"),
                        ("user_gender_id","item_brand_id"),
                        ("user_gender_id","item_collected_level"),
                        ("user_gender_id","item_pv_level"),
                        ("user_occupation_id","item_price_level"),
                        ("user_occupation_id","item_sales_level"),
                        ("user_occupation_id","item_city_id"),
                        ("user_occupation_id","item_brand_id"),
                        ("user_occupation_id","item_collected_level"),
                        ("user_occupation_id","item_pv_level"),
                         
                        ("shop_id","user_age_level"),
                        ("shop_id","user_occupation_id"),
                        ("shop_id","item_category_list"),
                        ("shop_id","item_property_list"),
                        ("shop_id","category_0"),
                        ("shop_id","category_1"),
                        
                        ("shop_id","item_price_level"),
                        ("shop_id","item_sales_level"),
                        ("shop_id","item_city_id"),
                        ("shop_id","item_brand_id"),
                        ("shop_id","item_collected_level"),
                        ("shop_id","item_pv_level"),
                        ("shop_star_level","item_price_level"),
                        ("shop_star_level","item_sales_level"),
                        ("shop_star_level","item_city_id"),
                        ("shop_star_level","item_brand_id"),
                        ("shop_star_level","item_collected_level"),
                        ("shop_star_level","item_pv_level"),
                        ("shop_review_num_level","item_price_level"),
                        ("shop_review_num_level","item_sales_level"),
                        ("shop_review_num_level","item_city_id"),
                        ("shop_review_num_level","item_brand_id"),
                        ("shop_review_num_level","item_collected_level"),
                        ("shop_review_num_level","item_pv_level"),
                        ("shop_star_level","user_age_level"),
                        ("shop_star_level","user_gender_id"),
                        ("shop_star_level","user_occupation_id"),
                        ("shop_review_num_level","user_age_level"),
                        ("shop_review_num_level","user_gender_id"),
                        ("shop_review_num_level","user_occupation_id")]
        cols = ["instance_id"]
        cols = list(set(cols + feature_1 + feature_2 + feature_3))
        train = train[cols].drop_duplicates()
        test = test[cols].drop_duplicates()
        
        # Memory is limited, so each per-feature result is written to disk first.
        for i in feature_1:
            # Join the smoothed CVR of this single feature on (feature, day).
            savePath = "../数据/smooth_cvr/_4_smooth_cvr_{0}_{1}_{2}.pickle".format(i,timeWindow[0],timeWindow[1])
            temp = pd.read_pickle(savePath).drop_duplicates()
            temp_tr = pd.merge(train[["instance_id",i,"day"]],temp,how="left",on=[i,"day"]).drop_duplicates()
            temp_ts = pd.merge(test[["instance_id",i,"day"]],temp,how="left",on=[i,"day"]).drop_duplicates()
            temp_tr = temp_tr[["instance_id",i + '_smooth']]
            temp_ts = temp_ts[["instance_id",i + '_smooth']]
            
            # Fill rates that had no match with the logged value for this feature.
            temp_tr = temp_tr.fillna(smooth_cvr_log[i + '_smooth'])
            temp_ts = temp_ts.fillna(smooth_cvr_log[i + '_smooth'])
            # Save the intermediate result.
            temp_tr.to_pickle("../数据/merge_data/_5_smooth_cvr_train_{0}_merge_0".format(i))
            temp_ts.to_pickle("../数据/merge_data/_5_smooth_cvr_test_{0}_merge_0".format(i))
            print("保存在 ../数据/merge_data/_5_smooth_cvr_train_{0}_merge_0".format(i))
            print("保存在 ../数据/merge_data/_5_smooth_cvr_test_{0}_merge_0".format(i))
            del temp
            
        # Smoothed CVR of the two-feature combinations, joined on (day, feat_1, feat_2).
        # The "meger" spelling in the intermediate file names is kept as-is so
        # that files already on disk keep their names.
        for feat_1,feat_2 in combine_feature:
            savePath = "../数据/smooth_cvr/_4_smooth_cvr_{0}_and_{1}_{2}_{3}.pickle".format(feat_1,feat_2,timeWindow[0],timeWindow[1])
            temp = pd.read_pickle(savePath)
            temp_tr = pd.merge(train[["instance_id","day",feat_1,feat_2]],temp,on=["day",feat_1,feat_2],how="left")
            temp_ts = pd.merge(test[["instance_id","day",feat_1,feat_2]],temp,on=["day",feat_1,feat_2],how="left")
            temp_tr = temp_tr[["instance_id",feat_1+"_"+feat_2 + '_smooth']]
            temp_ts = temp_ts[["instance_id",feat_1+"_"+feat_2 + '_smooth']]
            
            #------ fill missing rates ------
            temp_tr = temp_tr.fillna(smooth_cvr_log[feat_1+"_"+feat_2 + '_smooth'])
            temp_ts = temp_ts.fillna(smooth_cvr_log[feat_1+"_"+feat_2 + '_smooth'])
            #------ save the intermediate result ------
            temp_tr.to_pickle("../数据/merge_data/_5_smooth_cvr_train_{0}_and_{1}_meger_0".format(feat_1,feat_2))
            temp_ts.to_pickle("../数据/merge_data/_5_smooth_cvr_test_{0}_and_{1}_meger_0".format(feat_1,feat_2))
            print("保存在 ../数据/merge_data/_5_smooth_cvr_train_{0}_and_{1}_meger_0".format(feat_1,feat_2))
            print("保存在 ../数据/merge_data/_5_smooth_cvr_test_{0}_and_{1}_meger_0".format(feat_1,feat_2))
        
        
        # Stitch all intermediate files together on instance_id.
        smooth_cvr_tr = train[["instance_id"]]
        smooth_cvr_ts = test[["instance_id"]]
        
        # Single-feature rates.
        for i in feature_1:
            temp_tr = pd.read_pickle("../数据/merge_data/_5_smooth_cvr_train_{0}_merge_0".format(i))
            temp_ts = pd.read_pickle("../数据/merge_data/_5_smooth_cvr_test_{0}_merge_0".format(i))
            smooth_cvr_tr = pd.merge(smooth_cvr_tr,temp_tr,on=["instance_id"],how="left")
            smooth_cvr_ts = pd.merge(smooth_cvr_ts,temp_ts,on=["instance_id"],how="left")
        
        # Two-feature combination rates.
        for feat_1,feat_2 in combine_feature:
            temp_tr = pd.read_pickle("../数据/merge_data/_5_smooth_cvr_train_{0}_and_{1}_meger_0".format(feat_1,feat_2))
            temp_ts = pd.read_pickle("../数据/merge_data/_5_smooth_cvr_test_{0}_and_{1}_meger_0".format(feat_1,feat_2))
            smooth_cvr_tr = pd.merge(smooth_cvr_tr,temp_tr,on=["instance_id"],how="left")
            smooth_cvr_ts = pd.merge(smooth_cvr_ts,temp_ts,on=["instance_id"],how="left")
        smooth_cvr_tr.to_pickle(save_path_train)
        smooth_cvr_ts.to_pickle(save_path_test)
        print("保存在 ",save_path_train)
        print("保存在 ",save_path_test)

In [65]:
# %%time
# merge_smooth_cvr(timeWindow=[21,24])

保存在 ../数据/merge_data/_5_smooth_cvr_train_item_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_item_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_train_item_category_list_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_item_category_list_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_train_item_property_list_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_item_property_list_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_train_item_brand_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_item_brand_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_train_item_city_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_item_city_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_train_user_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_user_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_train_shop_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_shop_id_merge_0
保存在 ../数据/merge_data/_5_smooth_cvr_train_item_id_and_user_id_meger_0
保存在 ../数据/merge_data/_5_smooth_cvr_test_item_id_and_user_id_meger_0
保存在 ..

In [18]:
def merge_origin_feature(timeWindow=[21,24]):
    '''
    Export the features taken directly from the preprocessed tables.

    The level / category columns in ``cols_1`` are one-hot encoded; the shop
    score columns in ``cols_2`` are passed through unchanged. Train and test
    are concatenated before encoding so that both end up with the same dummy
    columns, and are then split again via the ``is_train`` marker.

    Unlike the other merge_* functions in this notebook, this one has no
    "already exists" guard and rewrites its outputs on every call.

    Parameters
    ----------
    timeWindow : sequence of two day numbers, [first_day, last_day]; rows
        whose "day" falls outside this inclusive range are discarded.

    Returns
    -------
    None. The results are pickled under ../数据/merge_data.
    '''
    # Select the columns to export.
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    cols_1 = ["item_price_level","item_sales_level","item_collected_level",
           "user_star_level","item_pv_level","user_gender_id","user_age_level",
           "user_occupation_id","shop_review_num_level","shop_star_level",
              "category_0","category_1","category_2"]
    cols_2 = ["shop_review_positive_rate","shop_score_service","shop_score_delivery",
              "shop_score_description"]
    id_col = ["instance_id"]
    cols = id_col + cols_1 + cols_2 
    
    #------- restrict to the day window, tag the origin, and stack train + test -------
    train = train[(train["day"]>=timeWindow[0])&(train["day"]<=timeWindow[1])]
    test = test[(test["day"]>=timeWindow[0])&(test["day"]<=timeWindow[1])]
    # .assign() builds a new frame, so the marker is not written into a slice
    # of the loaded table (which triggers pandas' SettingWithCopyWarning).
    train = train[cols].assign(is_train=1)
    test = test[cols].assign(is_train=0)
    data = pd.concat([train,test],axis=0,ignore_index = True)
    
    # One-hot encode on the stacked frame so train and test share dummy columns.
    data = pd.get_dummies(data,columns=cols_1)
    # NOTE(review): the is_train marker stays in the saved frames, exactly as
    # before; confirm that downstream consumers drop or ignore it.
    train = data[data["is_train"]==1]
    test = data[data["is_train"]==0]
    
    save_path_train = "../数据/merge_data/_5_merge_data_train_originated_{0}_{1}".format(timeWindow[0],timeWindow[1])
    save_path_test = "../数据/merge_data/_5_merge_data_test_originated_{0}_{1}".format(timeWindow[0],timeWindow[1])
    train.to_pickle(save_path_train) # write to file
    test.to_pickle(save_path_test)  # write to file
    print("保存在 {0}".format(save_path_train))
    print("保存在 {0}".format(save_path_test))

In [102]:
%%time
# Build the one-hot encoded original-feature tables for the default day window [21, 24].
merge_origin_feature()

保存在 ../数据/merge_data/_5_merge_data_train_originated_21_24
保存在 ../数据/merge_data/_5_merge_data_test_originated_21_24
CPU times: user 1.17 s, sys: 196 ms, total: 1.37 s
Wall time: 1.61 s


In [7]:
def merge_data():
    '''
    Join all intermediate feature tables into the wide modelling tables.

    The labelled rows of days 21-24 are left-joined on ``instance_id`` with
    the click, ID-count, smoothed-CVR and original-feature tables written for
    the 21_24 window. Days 21 and 22 become the training set and day 23 the
    validation set; the "day" column is dropped from both. The test set
    receives the same joins, starting from every ``instance_id`` of the
    preprocessed test table. Nothing is recomputed when all three output files
    already exist.

    Returns
    -------
    None. The three tables are pickled under ../数据/merge_data.
    '''
    save_path_train = "../数据/merge_data/_5_merge_data_train.pickle" # training set
    save_path_test = "../数据/merge_data/_5_merge_data_test.pickle" # test set
    save_path_val = "../数据/merge_data/_5_merge_data_val.pickle" # validation set

    outputs = (save_path_train, save_path_test, save_path_val)
    if all(os.path.exists(path) for path in outputs):
        for path in outputs:
            print("{0} 已存在".format(path))
        return

    # ID-like columns of the click table that must not enter the wide table.
    id_like_cols = ['user_id', 'context_timestamp', 'date', 'hour', 'item_id',
                    'item_category_list', 'item_property_list', 'item_brand_id',
                    'item_city_id', 'category_0', 'category_1', 'category_2', 'shop_id',
                    'context_page_id']
    # (log label, path template with the split name as {0}, columns to drop)
    feature_tables = [
        ("basic_click", "../数据/merge_data/_5_merge_data_{0}_basic_click_21_24.pickle", id_like_cols),
        ("id", "../数据/merge_data/_5_merge_data_{0}_ID_global_sum_21_24.pickle", None),
        ("smooth_cvr", "../数据/merge_data/_5_merge_data_{0}_smooth_cvr_21_24.pickle", None),
        ("origin", "../数据/merge_data/_5_merge_data_{0}_originated_21_24", None),
    ]

    def widen(base, split, verbose):
        # Left-join every feature table of `split` onto `base`, in the listed order.
        wide = base
        for label, template, unwanted in feature_tables:
            table = pd.read_pickle(template.format(split))
            if unwanted is not None:
                table = table.drop(unwanted,axis=1)
            wide = pd.merge(wide,table,how="left",on=["instance_id"]).drop_duplicates()
            del table
            if verbose:
                print("合并 {0}_tr 完毕！！！！".format(label))
        return wide

    print("开始合并训练集")
    labelled = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    in_window = labelled["day"].between(21, 24)
    labelled = labelled[in_window][["instance_id","day","is_trade"]]
    labelled = widen(labelled, "train", True)

    print("拆分训练集和验证集！！！")
    # Day 23 is held out for validation; days 21-22 are used for training.
    val = labelled[labelled["day"]==23].drop(["day"],axis=1)
    train = labelled[labelled["day"].isin([21,22])].drop(["day"],axis=1)
    train.to_pickle(save_path_train)
    val.to_pickle(save_path_val)
    print("训练集已经保存在 {0}".format(save_path_train))
    print("验证集已经保存在 {0}".format(save_path_val))

    # Release the training frames before the test set is built.
    del labelled, train, val

    print("开始合并测试集")
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")[["instance_id"]]
    test = widen(test, "test", False)
    test.to_pickle(save_path_test)
    print("测试集保存在 {0}".format(save_path_test))

In [8]:
%%time
# Join the intermediate feature tables into the final train / validation / test tables.
merge_data()

开始合并训练集
合并 basic_click_tr 完毕！！！！
合并 id_tr 完毕！！！！
合并 smooth_cvr_tr 完毕！！！！
合并 origin_tr 完毕！！！！
拆分训练集和验证集！！！
训练集已经保存在 ../数据/merge_data/_5_merge_data_train.pickle
验证集已经保存在 ../数据/merge_data/_5_merge_data_val.pickle
开始合并测试集
测试集保存在 ../数据/merge_data/_5_merge_data_test.pickle
CPU times: user 13.9 s, sys: 2.45 s, total: 16.3 s
Wall time: 19.2 s


In [9]:
%%time
# Run the whole merge pipeline; merge_data() reads the files written by the four steps before it.
if __name__ =="__main__":
    merge_ID_global_sum() # global per-ID count features
    merge_click_data() # click-count features
    merge_smooth_cvr(timeWindow=[21,24]) # smoothed conversion-rate features
    merge_origin_feature()  # one-hot encoded original features
    merge_data() # join all intermediate tables