## Build User Click Features
---
阿里妈妈为我们提供了用户点击广告的日志；我们先将广告的展示时间当作用户点击该广告的时间。参考实现：
https://github.com/shenweichen/Tencent_Social_Ads2017_Mobile_App_pCVR/blob/master/code/_2_1_gen_user_click_features.py
---
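本节代码主要生成以下特征：

1. 用户点击的基本统计量：总点击数，按天/小时/分钟粒度的点击次数的均值、最大值、最小值、中位数，以及相邻两次点击时间间隔的统计值；
2. 用户第一次点击与最后一次点击的时间差，以及当前点击距第一次、最后一次点击的时间差；
3. 用户最后三次点击两两之间的时间差；
4. 滑动时间窗口（默认 3 天）内的点击次数；
5. 用户与各类 ID（商品、类目、属性、品牌、城市、店铺、页面）按天、按小时的点击次数及其统计值。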


In [1]:
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency #方差检验
import seaborn as sns
import copy 
%matplotlib inline
plt.style.use("ggplot")

In [2]:
# Load the cleaned train/test frames from the pickle paths that
# preprocessing_data is called with further down in this notebook.
train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")

In [12]:
# Earliest training timestamp after moving every timestamp back by 16 hours.
pd.DatetimeIndex(train["context_timestamp"]).shift(-16,freq="H").min()

Timestamp('2018-09-17 00:00:01')

In [3]:
def parse_time(df,time_col):
    '''
    Parse a time column and derive calendar fields from it.

    The work is done on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``time_col``.
    time_col : str
        Name of the column to parse. Numeric values are interpreted as
        epoch seconds (``unit="s"``); values that are already datetimes are
        kept as they are.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``time_col`` holds datetimes, with the extra
        columns ``date``, ``month``, ``day`` (day of month), ``weekday``
        (English day name), ``hour`` and ``minute``.
    '''
    data = df.copy()
    data[time_col] = pd.to_datetime(data[time_col],unit="s")
    stamp = data[time_col].dt
    data["date"] = stamp.date
    data["month"] = stamp.month
    data["day"] = stamp.day
    # ``.dt.weekday_name`` was removed in pandas 1.0; ``day_name()`` returns
    # the same labels. Fall back to the old attribute on installs that
    # predate ``day_name()``.
    if hasattr(stamp,"day_name"):
        data["weekday"] = stamp.day_name()
    else:
        data["weekday"] = stamp.weekday_name
    data["hour"] = stamp.hour
    data["minute"] = stamp.minute
    return data

In [13]:
def _clean_raw_log(readpath,savepath):
    '''Read the raw space-separated log, add category/time columns and pickle it to ``savepath``.'''
    df = pd.read_csv(readpath,sep=" ")
    # Split "a;b;c" category lists into one column per level.
    category = df["item_category_list"].str.split(";",expand=True)
    category.columns = ["category_0","category_1","category_2"]
    df = pd.concat([df,category],axis=1)
    # Epoch seconds -> datetime, then move every timestamp back by 16 hours.
    # Plain Timedelta arithmetic gives the same values as
    # DatetimeIndex.shift(-16, freq="H") without the deprecated "H" alias.
    df["context_timestamp"] = pd.to_datetime(df["context_timestamp"],unit="s") - pd.Timedelta(hours=16)
    df = parse_time(df,"context_timestamp")  # adds date/month/day/weekday/hour/minute
    df.to_pickle(savepath)
    print("保存在文件路径{0}".format(savepath))


def preprocessing_data(readpath,savepath,update=False):
    '''
    Clean a raw log file and cache the result as a pickle.

    Parameters
    ----------
    readpath : str
        Path of the raw, space-separated log file.
    savepath : str
        Path of the pickle that stores the cleaned frame.
    update : bool, default False
        When ``savepath`` already exists, ``True`` re-runs the cleaning and
        overwrites the pickle; ``False`` leaves the existing file alone.
        Ignored when ``savepath`` does not exist yet (the data is always
        cleaned in that case).

    Returns
    -------
    None
        The cleaned frame is written to ``savepath``; progress is printed.
    '''
    if not os.path.exists(savepath):
        _clean_raw_log(readpath,savepath)
        return

    print("{0}存在！".format(savepath))
    if update:
        print("重新清洗数据")
        _clean_raw_log(readpath,savepath)
    else:
        print("不重新清洗数据")

In [14]:
# Re-clean the raw training log and overwrite the cached pickle (update=True).
preprocessing_data("../数据/raw_path/round1_ijcai_18_train_20180301.txt","../数据/preprocess_path/train_1.pickle",True)

../数据/preprocess_path/train_1.pickle存在！
重新清洗数据
保存在文件路径../数据/preprocess_path/train_1.pickle


In [15]:
# Re-clean the raw test log and overwrite the cached pickle (update=True).
preprocessing_data("../数据/raw_path/round1_ijcai_18_test_a_20180301.txt","../数据/preprocess_path/test_1.pickle",True)

../数据/preprocess_path/test_1.pickle存在！
重新清洗数据
保存在文件路径../数据/preprocess_path/test_1.pickle


In [16]:
# Reload the cleaned test set and show the time range it covers.
test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
test["context_timestamp"].min(),test["context_timestamp"].max()

(Timestamp('2018-09-24 00:00:02'), Timestamp('2018-09-24 23:59:25'))

In [18]:
# Distinct day-of-month values present in the test set.
test["day"].unique()

array([24])

In [20]:
# Reload the cleaned training set and show the time range it covers.
train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
train["context_timestamp"].min(),train["context_timestamp"].max()

(Timestamp('2018-09-17 00:00:01'), Timestamp('2018-09-23 23:59:47'))

In [22]:
# Distinct day-of-month values present in the training set (in row order).
train["day"].unique()

array([17, 20, 18, 19, 21, 22, 23])

## 1.创建用户点击广告的特征

#### 1.1 统计用户点击行为的基本统计值

#### 转换时间格式后可以看到，现有的广告时间都在 2018-09-17 到 2018-09-24 之间（训练集为 09-17 至 09-23，测试集为 09-24）

In [23]:
def getMask(data,startDate,endDate):
    '''
    Build a boolean mask selecting the rows inside a date window.

    The window covers ``startDate`` 00:00:00 up to, but not including,
    00:00:00 of the day after ``endDate``, so both calendar days are fully
    included and nothing from the following day leaks in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``context_timestamp`` column.
    startDate, endDate : str or datetime-like
        First and last calendar day of the window; anything accepted by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``data``.
    '''
    window_start = pd.to_datetime(startDate)
    # One past the last day: used as an exclusive upper bound below.
    window_end = pd.to_datetime(endDate) + pd.Timedelta("1 days")
    stamps = data["context_timestamp"]  # read-only access, no copy needed
    return (stamps >= window_start) & (stamps < window_end)

In [25]:
def gen_Click_stats():
    '''
    Build per-user click statistics over train + test and pickle them.

    Nothing is recomputed when the output pickle already exists. Otherwise
    one row per ``user_id`` is written with these columns:

    * ``adClick`` - total number of log rows of the user
    * ``dayClick_currence`` - rows on the latest date present in the data
    * ``dayClick_{mean,max,min,median}`` - statistics of the daily counts
      (days without rows count as 0)
    * ``hourClick_{mean,max,min,median}`` - same per (date, hour) slot
      (empty slots count as 0)
    * ``minuteClick_{mean,max,min,median}`` - same per (date, hour, minute),
      but only over the minutes in which the user has at least one row
    * ``dist_{max,min,mean,median,std}`` - statistics of the gap in seconds
      between consecutive rows of the user

    Missing values (e.g. gap statistics of users with one row) become 0.

    Returns
    -------
    None
    '''
    clickData_path = "../数据/temp_path/_2_get_click_stats.pickle"
    if os.path.exists(clickData_path):
        print("../数据/temp_path/_2_get_click_stats.pickle 已经存在！！！")
        return

    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    train = train.drop(["is_trade"],axis=1)
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([train,test],ignore_index=True)

    adClick = df.groupby("user_id")["context_timestamp"].count()

    # Daily counts: one column per date, 0 where the user has no rows.
    dayClick = df.pivot_table(index="user_id",columns="date",values="context_timestamp",aggfunc="count")
    dayClick = dayClick.fillna(0)

    # Hourly counts: one column per (date, hour) slot.
    df["date_hour"] = df["date"].map(str) +"_" + df["hour"].map(str)
    hourClick = df.pivot_table(index="user_id",columns="date_hour",values="context_timestamp",aggfunc="count")
    hourClick = hourClick.fillna(0)

    # Per-minute counts, kept in long form (only minutes with activity).
    df["date_hour_minute"] = df["date"].map(str) +"_" + df["hour"].map(str) + "_" + df["minute"].map(str)
    minuteClick = df.groupby(["user_id","date_hour_minute"])["context_timestamp"].count().reset_index()
    per_minute = minuteClick.groupby("user_id")["context_timestamp"]

    # Gap in seconds between each row and the user's previous row.
    df = df.sort_values(by="context_timestamp")
    df["lastClick"] = df.groupby("user_id")["context_timestamp"].shift(1)
    df["clickDist"] = (df["context_timestamp"] - df["lastClick"]).dt.total_seconds()
    gaps = df.groupby("user_id")["clickDist"]

    # (output column name, per-user Series) in output column order.
    # dist_max and dist_min are now distinct: previously the minimum
    # overwrote the maximum and was stored under both names.
    features = [
        ("adClick",adClick),
        ("dayClick_currence",dayClick.iloc[:,-1]),
        ("dayClick_mean",dayClick.mean(axis=1)),
        ("dayClick_max",dayClick.max(axis=1)),
        ("dayClick_min",dayClick.min(axis=1)),
        ("dayClick_median",dayClick.median(axis=1)),
        ("hourClick_mean",hourClick.mean(axis=1)),
        ("hourClick_max",hourClick.max(axis=1)),
        ("hourClick_min",hourClick.min(axis=1)),
        ("hourClick_median",hourClick.median(axis=1)),
        ("minuteClick_mean",per_minute.mean()),
        ("minuteClick_max",per_minute.max()),
        ("minuteClick_min",per_minute.min()),
        ("minuteClick_median",per_minute.median()),
        ("dist_max",gaps.max()),
        ("dist_min",gaps.min()),
        ("dist_mean",gaps.mean()),
        ("dist_median",gaps.median()),
        ("dist_std",gaps.std()),
    ]

    clickData = df[["user_id"]].drop_duplicates()
    for name,var in features:
        var = var.reset_index()
        var.columns = ["user_id",name]
        clickData = pd.merge(clickData,var,how="left",on="user_id")

    clickData = clickData.fillna(0)
    clickData.to_pickle(clickData_path)
    print("保存在.... ",clickData_path)

In [26]:
%%time
# Build (or reuse) the per-user click statistics pickle.
gen_Click_stats()

保存在....  ../数据/temp_path/_2_get_click_stats.pickle
CPU times: user 16 s, sys: 628 ms, total: 16.6 s
Wall time: 16.8 s


In [27]:
def gen_first_time_to_last_margin():
    '''
    Build first/last-click distance features over train + test and pickle them.

    Nothing is recomputed when the output pickle already exists. Otherwise
    one row per distinct (``user_id``, ``context_timestamp``) pair is
    written with, in seconds:

    * ``first_2_last_dist`` - user's last timestamp minus first timestamp
    * ``now_2_first_dist`` - this row's timestamp minus the user's first
    * ``now_2_last_dist`` - the user's last timestamp minus this row's

    Returns
    -------
    None
    '''
    path = "../数据/temp_path/_2_first_time_to_last_margin.pickle"
    if os.path.exists(path):
        print("../数据/temp_path/_2_first_time_to_last_margin.pickle 已经存在")
        return

    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    train = train.drop(["is_trade"],axis=1)
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([train,test],ignore_index=True)

    # Fresh 0..n-1 index, matching what the former merge-based code produced.
    temp = df[["user_id","context_timestamp"]].drop_duplicates().reset_index(drop=True)

    # transform broadcasts each user's first/last timestamp back onto the
    # user's rows, replacing the former groupby + merge round trip.
    per_user = temp.groupby("user_id")["context_timestamp"]
    earlist_date = per_user.transform("min")
    latest_date = per_user.transform("max")

    temp["first_2_last_dist"] = (latest_date - earlist_date).dt.total_seconds()
    temp["now_2_first_dist"] = (temp["context_timestamp"] - earlist_date).dt.total_seconds()
    temp["now_2_last_dist"] = (latest_date - temp["context_timestamp"]).dt.total_seconds()

    temp = temp.fillna(0)
    temp = temp[["user_id","context_timestamp","first_2_last_dist","now_2_first_dist","now_2_last_dist"]]
    temp.to_pickle(path)
    print(path)

In [28]:
%%time
# Build (or reuse) the first/last-click distance pickle.
gen_first_time_to_last_margin()

../数据/temp_path/_2_first_time_to_last_margin.pickle
CPU times: user 26.7 s, sys: 156 ms, total: 26.8 s
Wall time: 26.8 s


In [29]:
def get_last_2(x):
    '''
    Return the second-to-last element of ``x`` by position.

    Parameters
    ----------
    x : pandas.Series (or any object with positional ``.iloc``)

    Returns
    -------
    object
        ``x.iloc[-2]``, or ``np.nan`` when ``x`` has fewer than 2 elements.
    '''
    try:
        return x.iloc[-2]
    except IndexError:
        # Too short: only this case is expected, anything else should surface.
        return np.nan

def get_last_3(x):
    '''
    Return the third-to-last element of ``x`` by position.

    Parameters
    ----------
    x : pandas.Series (or any object with positional ``.iloc``)

    Returns
    -------
    object
        ``x.iloc[-3]``, or ``np.nan`` when ``x`` has fewer than 3 elements.
    '''
    try:
        return x.iloc[-3]
    except IndexError:
        # Too short: only this case is expected, anything else should surface.
        return np.nan

def get_last_4(x):
    '''
    Return the fourth-to-last element of ``x`` by position.

    Parameters
    ----------
    x : pandas.Series (or any object with positional ``.iloc``)

    Returns
    -------
    object
        ``x.iloc[-4]``, or ``np.nan`` when ``x`` has fewer than 4 elements.
    '''
    try:
        return x.iloc[-4]
    except IndexError:
        # Too short: only this case is expected, anything else should surface.
        return np.nan

def gen_last_1_2_dist():
    '''
    Build distances between each user's last three click times and pickle them.

    Nothing is recomputed when the output pickle already exists. Otherwise
    the output has the columns ``user_id`` plus, in seconds:

    * ``last_click_1_2_dist`` - last minus second-to-last timestamp
    * ``last_click_2_3_dist`` - second-to-last minus third-to-last timestamp
    * ``last_click_1_3_dist`` - last minus third-to-last timestamp

    "Last", "second-to-last" and "third-to-last" refer to the user's
    distinct timestamps in chronological order. Distances that cannot be
    computed (user has too few timestamps) become 0.

    Returns
    -------
    None
    '''
    path = "../数据/temp_path/_2_last_1_2_dist.pickle"
    if os.path.exists(path):
        print("../数据/temp_path/_2_last_1_2_dist.pickle 已经存在")
        return

    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    train = train.drop(["is_trade"],axis=1)
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([train,test],ignore_index=True)

    temp = df[["user_id","context_timestamp"]].drop_duplicates()
    temp = temp.sort_values(by=["context_timestamp"])

    # get_last_2/get_last_3 pick by position, so they must run on the
    # time-sorted rows; running them on the unsorted log (as before) returned
    # whatever row happened to sit at that position in the file.
    by_user = temp.groupby("user_id")["context_timestamp"]
    click_times = [
        ("lastClick_time",by_user.max()),
        ("last2Click_time",by_user.agg(get_last_2)),
        ("last3Click_time",by_user.agg(get_last_3)),
    ]

    for name,var in click_times:
        # Coerce to datetime so missing entries (np.nan) become NaT and the
        # subtractions below stay timedelta-typed.
        var = pd.to_datetime(var).reset_index()
        var.columns = ["user_id",name]
        temp = pd.merge(temp,var,how="left",on="user_id")

    temp["last_click_1_2_dist"] = (temp["lastClick_time"] - temp["last2Click_time"]).dt.total_seconds()
    temp["last_click_2_3_dist"] = (temp["last2Click_time"] - temp["last3Click_time"]).dt.total_seconds()
    temp["last_click_1_3_dist"] = (temp["lastClick_time"] - temp["last3Click_time"]).dt.total_seconds()

    # NOTE(review): after dropping context_timestamp each user still has one
    # (identical) row per distinct timestamp; kept as-is because consumers of
    # this pickle are not visible here - confirm whether they de-duplicate.
    temp = temp.drop(["context_timestamp","lastClick_time","last2Click_time","last3Click_time"],axis=1)
    temp = temp.fillna(0)

    print("保存在... ",path)
    temp.to_pickle(path)


In [30]:
%%time
# Build (or reuse) the last-three-clicks distance pickle.
gen_last_1_2_dist()

保存在...  ../数据/temp_path/_2_last_1_2_dist.pickle
CPU times: user 41.6 s, sys: 60 ms, total: 41.7 s
Wall time: 41.5 s


In [31]:
def slideWindows(n=3,start=17,end=24):
    '''
    Count each user's log rows in sliding windows of ``n`` days and pickle them.

    Windows are selected on the ``day`` (day-of-month) column: the first one
    covers days ``start .. start+n-1`` and the last one ends on ``end``.
    Each window adds a column named ``"{first}_{last}_click_cnt"``; users
    without rows in a window get 0. The pickle is always rewritten.

    Parameters
    ----------
    n : int, default 3
        Window length in days.
    start : int, default 17
        Day of month on which the first window starts.
    end : int, default 24
        Day of month on which the last window ends.

    Returns
    -------
    None
    '''
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    train = train.drop(["is_trade"],axis=1)
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([train,test],ignore_index=True)

    slide_fea = df[["user_id"]].drop_duplicates()
    # The last window starts on end-n+1, hence the exclusive bound end-n+2.
    for first_day in range(start,end-n+2):
        in_window = df[(df["day"]>=first_day)&(df["day"]<first_day+n)]
        count_col = '{0}_{1}_{2}'.format(first_day,first_day+n-1,"click_cnt")
        stats = in_window.groupby("user_id")["context_timestamp"].count().rename(count_col).reset_index()
        slide_fea = pd.merge(slide_fea,stats,how="left",on="user_id")

    slide_fea = slide_fea.drop_duplicates().fillna(0)
    path = "../数据/temp_path/_2_slide_windows_stats.pickle"
    slide_fea.to_pickle(path)
    print("保存在",path)

In [32]:
%%time
# Rebuild the sliding-window click counts with the default 3-day window.
slideWindows()

保存在 ../数据/temp_path/_2_slide_windows_stats.pickle
CPU times: user 2.22 s, sys: 104 ms, total: 2.32 s
Wall time: 2.38 s


#### 1.2 用户ID结合各类型的ID的点击行为

In [35]:
def gen_user_day_click_count():
    '''
    Count log rows per (user, date, feature value) and pickle one file per feature.

    For every feature column in the ad and context lists below, the rows of
    train + test are grouped by ``['user_id', 'date', feature]`` and the
    group sizes are stored in a column named ``user_<feature>_click_day``.
    Each result is written to
    ``../数据/temp_path/_2_user_date_click_<feature>`` (always rewritten).

    Returns
    -------
    None
    '''
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    train = train.drop(["is_trade"],axis=1)
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([train,test],ignore_index=True)

    ads_feature = ['item_id','item_category_list','item_property_list'
                   ,'item_brand_id','item_city_id',"category_0","category_1","category_2"]
    context_feature = ['shop_id','context_page_id']
    stats_feature = ads_feature+context_feature

    for feature in stats_feature:
        temp_path = "../数据/temp_path/_2_user_date_click_{0}".format(feature)
        count_col = 'user_'+feature+'_click_day'
        user_feature_click_day = df.groupby(['user_id','date',feature]).size().reset_index(name=count_col)
        user_feature_click_day.to_pickle(temp_path)
        print("保存在{0}".format(temp_path))

In [36]:
%%time
# Write the per-user, per-day click counts for every feature column.
gen_user_day_click_count()

保存在../数据/temp_path/_2_user_date_click_item_id
保存在../数据/temp_path/_2_user_date_click_item_category_list
保存在../数据/temp_path/_2_user_date_click_item_property_list
保存在../数据/temp_path/_2_user_date_click_item_brand_id
保存在../数据/temp_path/_2_user_date_click_item_city_id
保存在../数据/temp_path/_2_user_date_click_category_0
保存在../数据/temp_path/_2_user_date_click_category_1
保存在../数据/temp_path/_2_user_date_click_category_2
保存在../数据/temp_path/_2_user_date_click_shop_id
保存在../数据/temp_path/_2_user_date_click_context_page_id
CPU times: user 4.7 s, sys: 276 ms, total: 4.98 s
Wall time: 5.51 s


In [37]:
def gen_user_date_hour_click_count():
    '''
    Count log rows per (user, date, hour, feature value) and pickle one file per feature.

    For every feature column in the ad and context lists below, the rows of
    train + test are grouped by ``['user_id', 'date', 'hour', feature]`` and
    the group sizes are stored in a column named
    ``user_<feature>_click_date_hour``. Each result is written to
    ``../数据/temp_path/_2_user_date_hour_click_<feature>`` (always rewritten).

    Returns
    -------
    None
    '''
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    train = train.drop(["is_trade"],axis=1)
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([train,test],ignore_index=True)

    ads_feature = ['item_id','item_category_list','item_property_list'
                   ,'item_brand_id','item_city_id',"category_0","category_1","category_2"]
    context_feature = ['shop_id','context_page_id']
    stats_feature = ads_feature+context_feature

    for feature in stats_feature:
        temp_path = "../数据/temp_path/_2_user_date_hour_click_{0}".format(feature)
        count_col = 'user_'+feature+'_click_date_hour'
        user_feature_click_date_hour = df.groupby(['user_id','date',"hour",feature]).size().reset_index(name=count_col)
        user_feature_click_date_hour.to_pickle(temp_path)
        print("保存在{0}".format(temp_path))

In [38]:
# Write the per-user, per-day-and-hour click counts for every feature column.
gen_user_date_hour_click_count()

保存在../数据/temp_path/_2_user_date_hour_click_item_id
保存在../数据/temp_path/_2_user_date_hour_click_item_category_list
保存在../数据/temp_path/_2_user_date_hour_click_item_property_list
保存在../数据/temp_path/_2_user_date_hour_click_item_brand_id
保存在../数据/temp_path/_2_user_date_hour_click_item_city_id
保存在../数据/temp_path/_2_user_date_hour_click_category_0
保存在../数据/temp_path/_2_user_date_hour_click_category_1
保存在../数据/temp_path/_2_user_date_hour_click_category_2
保存在../数据/temp_path/_2_user_date_hour_click_shop_id
保存在../数据/temp_path/_2_user_date_hour_click_context_page_id


In [39]:
def gen_click_stats(data,col):
    '''
    Summarise, per user, how often each value of ``col`` occurs.

    The rows of ``data`` are counted per (``user_id``, value of ``col``);
    for every user the mean, max and min of those counts are returned.
    ``data`` is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``user_id``, ``context_timestamp`` and ``col``. In this
        notebook it is the click log joined with a count column written by
        gen_user_day_click_count / gen_user_date_hour_click_count.
    col : str
        Column whose per-user value frequencies are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``user_id`` (first-appearance order) with the
        columns ``user_id``, ``<col>_mean``, ``<col>_max``, ``<col>_min``.
        Users with no non-null ``col`` value get NaN statistics.
    '''
    # Non-null context_timestamp rows per (user, value of col).
    per_value = data.groupby(['user_id', col])['context_timestamp'].count()
    per_value = per_value.rename(col+'_m').reset_index()

    # One pass over the counts yields all three statistics.
    stats = per_value.groupby('user_id')[col+'_m'].agg(['mean','max','min'])
    stats.columns = [col+'_mean',col+'_max',col+'_min']
    stats = stats.reset_index()

    # Only the user ids are needed from ``data``, so no deep copy is taken.
    users = data[["user_id"]].drop_duplicates()
    return pd.merge(users,stats,how='left',on='user_id')

In [40]:
def gen_date_and_hour_click_stats():
    '''
    Turn the per-day and per-hour click counts into per-user statistics.

    For every feature column, the count pickle written by
    gen_user_day_click_count (then gen_user_date_hour_click_count) is joined
    back onto the train + test log, passed through gen_click_stats and the
    result is pickled to ``_2_user_date_click_stats_<feature>.pickle``
    (respectively ``_2_user_date_hour_click_stats_<feature>.pickle``) under
    ``../数据/temp_path/``. All day-level files are produced before the
    hour-level ones. The count pickles must already exist.

    Returns
    -------
    None
    '''
    train = pd.read_pickle("../数据/preprocess_path/train_1.pickle")
    train = train.drop(["is_trade"],axis=1)
    test = pd.read_pickle("../数据/preprocess_path/test_1.pickle")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([train,test],ignore_index=True)

    ads_feature = ['item_id','item_category_list','item_property_list'
                   ,'item_brand_id','item_city_id',"category_0","category_1","category_2"]
    context_feature = ['shop_id','context_page_id']
    stats_feature = ads_feature+context_feature

    # (time keys, count pickle, stats pickle, count column) per granularity.
    granularities = [
        (["user_id","date"],
         "../数据/temp_path/_2_user_date_click_{0}",
         "../数据/temp_path/_2_user_date_click_stats_{0}.pickle",
         "user_{0}_click_day"),
        (["user_id","date","hour"],
         "../数据/temp_path/_2_user_date_hour_click_{0}",
         "../数据/temp_path/_2_user_date_hour_click_stats_{0}.pickle",
         "user_{0}_click_date_hour"),
    ]

    for time_keys,count_tpl,stats_tpl,col_tpl in granularities:
        for i in stats_feature:
            click_stats_save_path = stats_tpl.format(i)
            temp_day_path = count_tpl.format(i)
            print("正在读取.....",temp_day_path)
            counts = pd.read_pickle(temp_day_path)
            # Attach the count of its (user, time slot, feature value) group
            # to every log row.
            joined = pd.merge(df,counts,how="left",on=time_keys+[i])
            click_stats = gen_click_stats(joined,col_tpl.format(i))
            click_stats.to_pickle(click_stats_save_path)
            print("正在保存.....",click_stats_save_path)
        

In [41]:
%%time
# Derive per-user statistics from the day-level and hour-level count pickles.
gen_date_and_hour_click_stats()

正在读取..... ../数据/temp_path/_2_user_date_click_item_id
正在保存..... ../数据/temp_path/_2_user_date_click_stats_item_id.pickle
正在读取..... ../数据/temp_path/_2_user_date_click_item_category_list
正在保存..... ../数据/temp_path/_2_user_date_click_stats_item_category_list.pickle
正在读取..... ../数据/temp_path/_2_user_date_click_item_property_list
正在保存..... ../数据/temp_path/_2_user_date_click_stats_item_property_list.pickle
正在读取..... ../数据/temp_path/_2_user_date_click_item_brand_id
正在保存..... ../数据/temp_path/_2_user_date_click_stats_item_brand_id.pickle
正在读取..... ../数据/temp_path/_2_user_date_click_item_city_id
正在保存..... ../数据/temp_path/_2_user_date_click_stats_item_city_id.pickle
正在读取..... ../数据/temp_path/_2_user_date_click_category_0
正在保存..... ../数据/temp_path/_2_user_date_click_stats_category_0.pickle
正在读取..... ../数据/temp_path/_2_user_date_click_category_1
正在保存..... ../数据/temp_path/_2_user_date_click_stats_category_1.pickle
正在读取..... ../数据/temp_path/_2_user_date_click_category_2
正在保存..... ../数据/temp_path/_2_user

In [85]:
if __name__ == "__main__":
    # Run every feature builder in order and report the total wall time.
    pipeline = (
        gen_Click_stats,                 # per-user click statistics
        gen_first_time_to_last_margin,   # first/last click distances
        gen_last_1_2_dist,               # distances between the last three clicks
        slideWindows,                    # sliding-window click counts
        gen_user_day_click_count,        # per-user daily counts per feature
        gen_user_date_hour_click_count,  # per-user hourly counts per feature
        gen_date_and_hour_click_stats,   # statistics of the daily/hourly counts
    )
    t1 = time.time()
    for step in pipeline:
        step()
    t2 = time.time()
    print("一共花费了{0}秒".format(t2-t1))

../数据/temp_path/_2_get_click_stats.pickle 已经存在！！！
../数据/temp_path/_2_first_time_to_last_margin.pickle 已经存在
../数据/temp_path/_2_last_1_2_dist.pickle 已经存在
保存在 ../数据/temp_path/_2_slide_windows_stats.pickle
保存在../数据/temp_path/_2_user_date_click_item_id
保存在../数据/temp_path/_2_user_date_click_item_category_list
保存在../数据/temp_path/_2_user_date_click_item_property_list
保存在../数据/temp_path/_2_user_date_click_item_brand_id
保存在../数据/temp_path/_2_user_date_click_item_city_id
保存在../数据/temp_path/_2_user_date_click_category_0
保存在../数据/temp_path/_2_user_date_click_category_1
保存在../数据/temp_path/_2_user_date_click_category_2
保存在../数据/temp_path/_2_user_date_click_shop_id
保存在../数据/temp_path/_2_user_date_click_context_page_id
保存在../数据/temp_path/_2_user_date_hour_click_item_id
保存在../数据/temp_path/_2_user_date_hour_click_item_category_list
保存在../数据/temp_path/_2_user_date_hour_click_item_property_list
保存在../数据/temp_path/_2_user_date_hour_click_item_brand_id
保存在../数据/temp_path/_2_user_date_hour_click_item_city_id
