In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,MultiLabelBinarizer

In [2]:
import datetime as dt
from multiprocessing import Pool,Process
import stats as sts
from notify import send_msg
import traceback

# 数据读取

In [3]:
def _read_log(path, columns):
    """Read one headerless, tab-separated log file and label its columns with `columns`."""
    return pd.read_csv(path, sep='\t', header=None, names=columns)


# Registration, app-launch, video-creation and activity logs.
df_user_reg = _read_log(
    '../data/user_register_log.txt',
    ['user_id', 'register_day', 'register_type', 'device_type'],
)
df_app_launch = _read_log('../data/app_launch_log.txt', ['user_id', 'day'])
df_video_create = _read_log('../data/video_create_log.txt', ['user_id', 'day'])
df_user_activity = _read_log(
    '../data/user_activity_log.txt',
    ['user_id', 'day', 'page', 'video_id', 'author_id', 'action_type'],
)

In [4]:
# Count how many activity rows fall under each action_type code.
df_user_activity.action_type.value_counts()
# Action types: "play", "follow", "like", "share", "report" and "show fewer works like this"
# (presumably codes 0-5 in that order -- TODO confirm against the data description).

0    19798261
1      555671
2      206079
3       46078
5         982
4         157
Name: action_type, dtype: int64

In [5]:
# Distribution of the number of launch-log rows per user.
df_app_launch.groupby('user_id')['day'].count().describe()

count    51709.000000
mean         4.872324
std          5.543451
min          1.000000
25%          1.000000
50%          2.000000
75%          6.000000
max         30.000000
Name: day, dtype: float64

In [6]:
# Distribution of the number of video-creation rows per user.
df_video_create.groupby('user_id')['day'].count().describe()

count    7606.000000
mean        4.621483
std         8.541389
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max       236.000000
Name: day, dtype: float64

In [7]:
# Distribution of the number of activity-log rows per user.
df_user_activity.groupby('user_id')['day'].count().describe()

count    43708.000000
mean       471.474970
std        955.013699
min          1.000000
25%         23.000000
50%        118.000000
75%        476.000000
max      37882.000000
Name: day, dtype: float64

# 数据集构造

In [8]:
def get_active_users(d_start, d_end, hist_registers=None):
    """Return the set of user ids with any logged event between two days (inclusive).

    A user counts as active when they appear in at least one of the module-level
    frames ``df_app_launch``, ``df_video_create`` or ``df_user_activity`` with
    ``d_start <= day <= d_end``.

    Parameters
    ----------
    d_start, d_end : first and last day of the window, both inclusive.
    hist_registers : optional iterable of user ids. When given (even if empty),
        the result is restricted to these ids. ``None`` means "no restriction".

    Returns
    -------
    set of user ids.
    """
    actives = set()
    for log in (df_app_launch, df_video_create, df_user_activity):
        in_window = (log.day >= d_start) & (log.day <= d_end)
        actives.update(log[in_window].user_id.unique())
    # Compare against None rather than relying on truthiness: an empty list must
    # yield an empty result, and a numpy array has no unambiguous truth value.
    if hist_registers is not None:
        actives &= set(hist_registers)
    return actives

def build_train(train_weeks=((10, 16), (17, 23), (24, 30), (31, 37)), train_end=30):
    """Build the (user, week) sample table, one block of rows per week window.

    For every ``(week_start, week_end)`` window, all users in the module-level
    ``df_user_reg`` who registered strictly before ``week_start`` get one row.

    Parameters
    ----------
    train_weeks : iterable of (week_start, week_end) day pairs. The position of a
        pair in the iterable becomes its ``data_weeknum``.
    train_end : windows with ``week_start <= train_end`` are labelled from the
        logs; later windows get the placeholder label -1.

    Returns
    -------
    DataFrame with columns ``user_id``, ``data_weeknum``, ``data_weekstart``,
    ``data_weekend`` and ``label`` (1 = active in the window according to
    ``get_active_users``, 0 = not active, -1 = unlabelled). An empty DataFrame
    is returned when ``train_weeks`` is empty.
    """
    weekly_frames = []
    for week_num, (week_start, week_end) in enumerate(train_weeks):
        # Only users who registered before this window starts.
        df_user = df_user_reg[df_user_reg.register_day < week_start]
        df_tmp = pd.DataFrame(df_user['user_id']).drop_duplicates(['user_id'])
        df_tmp['data_weeknum'] = week_num
        df_tmp['data_weekstart'] = week_start
        df_tmp['data_weekend'] = week_end
        if week_start <= train_end:
            # Label from observed activity inside the window.
            active_users = get_active_users(
                week_start, week_end, hist_registers=df_tmp.user_id.unique().tolist())
            df_tmp['label'] = df_tmp.user_id.isin(active_users).astype(int)
        else:
            df_tmp['label'] = -1
        weekly_frames.append(df_tmp)
    if not weekly_frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(weekly_frames)
# Build the (user, week) sample table using build_train's default week windows.
df = build_train()

In [9]:
# (rows, columns) of the sample table.
df.shape

(125057, 5)

# 基础特征

In [10]:
# Share of positive labels among the labelled rows (label -1 marks unlabelled rows).
n_positive = (df.label == 1).sum()
n_labelled = (df.label != -1).sum()
n_positive / n_labelled

0.5062987402519497

In [11]:
# Basic user features: attach each user's registration attributes to the samples.
df = pd.merge(df, df_user_reg, how='left', on='user_id')

In [12]:
# Days between registration and the start of the sample week.
df['user_reg_days'] = df.data_weekstart.sub(df.register_day)

# 历史特征

In [13]:
# Columns available before the history features are added.
df.columns

Index(['user_id', 'data_weeknum', 'data_weekstart', 'data_weekend', 'label',
       'register_day', 'register_type', 'device_type', 'user_reg_days'],
      dtype='object')

In [14]:
# Aggregations applied to every numeric sequence summarised below.
_STAT_FUNCS = {'max': np.max, 'mean': np.mean, 'min': np.min, 'var': np.var, 'median': np.median}
# Key insertion order used for the day-based and the count-based statistics.
_DAY_STAT_ORDER = ('max', 'mean', 'min', 'var', 'median')
_COUNT_STAT_ORDER = ('max', 'min', 'mean', 'var', 'median')


def _shape_stat(func_name, values):
    """Return ``sts.<func_name>(values)``, or -1 when it cannot be computed."""
    try:
        return getattr(sts, func_name)(values)
    except Exception:
        return -1


def _run_lengths(continues_run):
    """Turn per-step "still in the same run" flags into a list of run lengths."""
    lengths = []
    current = 1
    for same_run in continues_run:
        if same_run:
            current += 1
        else:
            lengths.append(current)
            current = 1
    lengths.append(current)
    return lengths


def _put_stats(ret, tail, values, order):
    """Store max/mean/min/var/median/kurt/skew of `values` under 'user_<stat><tail>'."""
    for stat in order:
        ret['user_' + stat + tail] = _STAT_FUNCS[stat](values)
    ret['user_kurt' + tail] = _shape_stat('kurtosis', values)
    ret['user_skew' + tail] = _shape_stat('skewness', values)


def _put_defaults(ret, tail, fill, order):
    """Store placeholders for a sequence too short to summarise (var=0, kurt/skew=-1)."""
    for stat in order:
        ret['user_' + stat + tail] = 0 if stat == 'var' else fill
    ret['user_kurt' + tail] = -1
    ret['user_skew' + tail] = -1


def get_user_continuedays(x, df_name, suffix='_hist'):
    """Summarise one user's event days into a single-row feature frame.

    Parameters
    ----------
    x : DataFrame holding one user's rows; must have ``user_id`` and ``day`` columns.
    df_name : label embedded in every feature name (e.g. 'launch').
    suffix : string appended to every feature name (e.g. '_hist', '_lastweek').

    Returns
    -------
    One-row DataFrame with ``user_id`` plus:
      * distinct-day features: number of days, mean/min day, day range;
      * gaps between consecutive distinct days (``..._no_<name>_days``);
      * lengths of streaks of consecutive days (``..._continue_<name>_days``) and
        the number of streaks (``user_continue_<name>_dayslices``);
      * row-level features: number of rows, and lengths of runs of rows whose day
        differs from the previous row's day by at most 1 (``..._continue_<name>_times``);
      * rows per day (``..._<name>_daytimes``) and the mean first difference of
        that per-day count.
    When a sequence has a single element, placeholders are written instead
    (-1 for most statistics, 1 for streak lengths, 0 for variances).
    Kurtosis/skewness come from the ``sts`` module and fall back to -1 on error.
    """
    ret = {'user_id': x['user_id'].unique()[0]}

    # ---- distinct days ----
    days = sorted(x.day.unique())
    ret['user_%s_days' % df_name + suffix] = len(days)
    ret['user_mean_%s_date' % df_name + suffix] = np.mean(days)
    ret['user_min_%s_date' % df_name + suffix] = np.min(days)
    ret['user_range_%s_date' % df_name + suffix] = np.max(days) - np.min(days)

    gap_tail = '_no_%s_days' % df_name + suffix
    streak_tail = '_continue_%s_days' % df_name + suffix
    # Same key in both branches so the feature lands in a single column.
    slices_key = 'user_continue_%s_dayslices' % df_name + suffix
    if len(days) > 1:
        # Number of event-free days between consecutive distinct days.
        days_dist = np.diff(days, 1) - 1
        _put_stats(ret, gap_tail, days_dist, _DAY_STAT_ORDER)
        # A gap of 0 means the next day directly follows, extending the streak.
        continue_days = _run_lengths(days_dist == 0)
        ret[slices_key] = len(continue_days)
        _put_stats(ret, streak_tail, continue_days, _DAY_STAT_ORDER)
    else:
        ret[slices_key] = 1
        _put_defaults(ret, gap_tail, -1, _DAY_STAT_ORDER)
        _put_defaults(ret, streak_tail, 1, _DAY_STAT_ORDER)

    # ---- individual rows (a day repeats once per row) ----
    days = sorted(x.day.values)
    ret['user_%s_times' % df_name + suffix] = len(days)
    times_tail = '_continue_%s_times' % df_name + suffix
    if len(days) > 1:
        # Rows on the same or the next day belong to the same run.
        continue_times = _run_lengths(np.diff(days, 1) <= 1)
        _put_stats(ret, times_tail, continue_times, _COUNT_STAT_ORDER)
    else:
        _put_defaults(ret, times_tail, -1, _COUNT_STAT_ORDER)

    # ---- rows per day ----
    daytimes = x.groupby('day').user_id.count().tolist()
    daytimes_tail = '_%s_daytimes' % df_name + suffix
    trend_key = 'user_%s_daytimes_diff1_mean' % df_name + suffix
    if len(daytimes) > 1:
        _put_stats(ret, daytimes_tail, daytimes, _COUNT_STAT_ORDER)
        # Trend of the per-day row count: mean of its first differences.
        ret[trend_key] = np.mean(np.diff(daytimes, 1))
    else:
        _put_defaults(ret, daytimes_tail, -1, _COUNT_STAT_ORDER)
        ret[trend_key] = -1
    return pd.DataFrame([ret])

In [15]:
# Build features with a sliding window keyed on each sample week's start day.
def get_history(df):
    """Compute history-based features for every (user_id, data_weeknum) row of `df`.

    For each distinct ``data_weeknum`` in `df`, only log rows with ``day`` strictly
    before that week's start day are used (read from the module-level frames
    ``df_app_launch``, ``df_video_create`` and ``df_user_activity``). Features are
    left-merged onto the week's users one group at a time, the row count is
    asserted to be unchanged, and the per-week frames are concatenated.

    Parameters
    ----------
    df : DataFrame with at least ``user_id``, ``data_weeknum`` and
        ``data_weekstart`` columns.

    Returns
    -------
    DataFrame keyed by ``user_id`` and ``data_weeknum`` with the feature columns.
    Some column sets come from pivots, so they depend on which action types /
    pages occur in the data of a given week.
    """
    ret = pd.DataFrame()
    for w in sorted(df.data_weeknum.unique()):
        # Historical data: everything logged before this week's start day.
        # NOTE(review): indexing the sorted week starts by `w` assumes data_weeknum
        # values are 0..k-1 in the same order as the sorted data_weekstart values.
        start = sorted(df.data_weekstart.unique())[w]
        launch_hist = df_app_launch[(df_app_launch.day<start)]
        video_hist  = df_video_create[(df_video_create.day<start)]
        activity    = df_user_activity[(df_user_activity.day<start)]
        # User rows for this week.
        df_tmp = df[df.data_weeknum==w][['user_id','data_weeknum']]
        origin_shape = df_tmp.shape[0]

        # Whole-history / last-week statistics: launch days, uploaded videos, interactions.
        # "Last week" is the 7 days immediately before `start`.
        # NOTE(review): the groupby-apply results carry user_id both as an index level
        # and as a column; verify that merging on 'user_id' is accepted by the pandas
        # version in use.
        user_launch = launch_hist.groupby('user_id').apply(lambda x:get_user_continuedays(x, df_name='launch', suffix='_hist'))
        launch_hist_tmp = launch_hist.groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_hist_launchday'})
        launch_week_tmp = launch_hist[launch_hist.day>=start-7]
        user_launch_week = launch_week_tmp.groupby('user_id').apply(lambda x: get_user_continuedays(x, df_name='launch', suffix='_lastweek'))
        launch_week_tmp = launch_week_tmp.groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_lastweek_launchday'})
        df_tmp = df_tmp.merge(launch_hist_tmp, on='user_id',how='left')
        df_tmp = df_tmp.merge(launch_week_tmp, on='user_id',how='left')
        df_tmp = df_tmp.merge(user_launch, on='user_id', how='left')
        df_tmp = df_tmp.merge(user_launch_week, on='user_id', how='left')

        user_video = video_hist.groupby('user_id').apply(lambda x:get_user_continuedays(x, df_name='createvideo', suffix='_hist'))
        video_hist_tmp = video_hist.groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_hist_videocount'})
        video_week_tmp = video_hist[video_hist.day>=start-7]
        user_video_week = video_week_tmp.groupby('user_id').apply(lambda x: get_user_continuedays(x, df_name='createvideo', suffix='_lastweek'))
        video_week_tmp = video_week_tmp.groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_lastweek_videocount'})
        df_tmp = df_tmp.merge(video_hist_tmp, on='user_id',how='left')
        df_tmp = df_tmp.merge(video_week_tmp, on='user_id',how='left')
        df_tmp = df_tmp.merge(user_video, on='user_id', how='left')
        df_tmp = df_tmp.merge(user_video_week, on='user_id', how='left')

        user_active = activity.groupby(['user_id']).apply(lambda x:get_user_continuedays(x, df_name='activity', suffix='_hist'))
        active_hist_tmp = activity.groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_hist_actcount'})
        active_week = activity[activity.day>=start-7]
        user_active_week = active_week.groupby('user_id').apply(lambda x: get_user_continuedays(x, df_name='activity', suffix='_lastweek'))
        active_week_tmp = active_week.groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_lastweek_actcount'})
        df_tmp = df_tmp.merge(active_hist_tmp, on='user_id',how='left')
        df_tmp = df_tmp.merge(active_week_tmp, on='user_id',how='left')
        df_tmp = df_tmp.merge(user_active, on='user_id', how='left')
        df_tmp = df_tmp.merge(user_active_week, on='user_id', how='left')

        # Statistics over windows of the previous 2-6 days (a 1-day window is just the
        # previous day, handled per day below; a 7-day window is the last-week block above).
        for n in range(2,7):
            user_launch_n_day_win = launch_hist[launch_hist.day >= start - n].groupby('user_id').apply(lambda x:get_user_continuedays(x, df_name='launch',suffix='_%sdaywin'%n))
            user_video_n_day_win  = video_hist[video_hist.day >= start - n].groupby('user_id').apply(lambda x:get_user_continuedays(x, df_name='createvideo',suffix='_%sdaywin'%n))
            user_active_n_day_win = activity[activity.day >= start - n].groupby('user_id').apply(lambda x:get_user_continuedays(x, df_name='activity',suffix='_%sdaywin'%n))
            df_tmp = df_tmp.merge(user_launch_n_day_win, on='user_id', how='left')
            df_tmp = df_tmp.merge(user_video_n_day_win,  on='user_id', how='left')
            df_tmp = df_tmp.merge(user_active_n_day_win, on='user_id', how='left')

        # Users with no rows in a window have NaN after the left merges; set them to 0.
        df_tmp.fillna(0, inplace=True)
        # Per-day statistics for each of the previous 1-7 days.
        for n in range(1,8):
            # Whether the user launched the app
            # Flag / count of videos published
            # Flag / count of activity rows
            # Which activity types occurred, and how many of each
            # Which pages were visited, and how many times each

            user_launch_n_day_onday = launch_hist[launch_hist.day == start - n]
            user_video_n_day_onday =  video_hist[video_hist.day == start - n]
            user_active_n_day_onday = activity[activity.day == start - n]
            user_n_day_launch_onday = user_launch_n_day_onday.groupby('user_id').day.count().reset_index().rename(columns={'day': 'user_%sdaybefore_launch'%n})
            user_n_day_videocount_onday = user_video_n_day_onday.groupby('user_id').day.count().reset_index().rename(columns={'day': 'user_%sdaybefore_video_count'%n})
            user_n_day_actcount_onday = user_active_n_day_onday.groupby('user_id').day.count().reset_index().rename(columns={'day': 'user_%sdaybefore_act_count'%n})

            df_tmp = df_tmp.merge(user_n_day_launch_onday, on='user_id', how='left').fillna(0)
            df_tmp = df_tmp.merge(user_n_day_videocount_onday,  on='user_id', how='left').fillna(0)
            df_tmp = df_tmp.merge(user_n_day_actcount_onday, on='user_id', how='left').fillna(0)
            df_tmp['user_%sdaybefore_video'%n] = df_tmp['user_%sdaybefore_video_count'%n].map(lambda x:1 if x>0 else 0)
            df_tmp['user_%sdaybefore_act'%n] = df_tmp['user_%sdaybefore_act_count'%n].map(lambda x:1 if x>0 else 0)

            # One count column (and one 0/1 flag) per action_type seen on that day.
            user_n_day_activity_type_count_onday = user_active_n_day_onday.groupby(['user_id','action_type']).day.count().reset_index().pivot(index='user_id', columns='action_type', values='day').fillna(0)
            for col in user_n_day_activity_type_count_onday.columns.tolist():
                user_n_day_activity_type_count_onday.rename(columns={col: 'user_%sdaybefore_act_%s_count'%(n, col)},inplace=True)
                user_n_day_activity_type_count_onday['user_%sdaybefore_act_%s'%(n, col)] = user_n_day_activity_type_count_onday['user_%sdaybefore_act_%s_count'%(n, col)].map(lambda x: int(x>0))
            user_n_day_activity_type_count_onday.reset_index(inplace=True)

            # One count column (and one 0/1 flag) per page seen on that day.
            user_n_day_page_count_onday = user_active_n_day_onday.groupby(['user_id', 'page']).day.count().reset_index().pivot(index='user_id',columns='page',values='day').fillna(0)
            for col in user_n_day_page_count_onday.columns.tolist():
                user_n_day_page_count_onday.rename(columns={col: 'user_%sdaybefore_act_page_%s_count'%(n, col)},inplace=True)
                user_n_day_page_count_onday['user_%sdaybefore_act_page_%s'%(n, col)] = user_n_day_page_count_onday['user_%sdaybefore_act_page_%s_count'%(n, col)].map(lambda x: int(x>0))
            user_n_day_page_count_onday.reset_index(inplace=True)
            df_tmp = df_tmp.merge(user_n_day_activity_type_count_onday, on='user_id', how='left').fillna(0)
            df_tmp = df_tmp.merge(user_n_day_page_count_onday, on='user_id', how='left').fillna(0)

        # Number of actions per action type (whole history and last week).
        for act in sorted(df_user_activity.action_type.unique()):
            column_name1 = 'user_hist_act_%d_count'%act
            column_name2 = 'user_lastweek_act_%d_count'%act
            act_hist = activity[activity.action_type==act].groupby(['user_id']).day.count().reset_index().rename(columns={'day':column_name1})
            act_lastweek = active_week[active_week.action_type==act].groupby(['user_id']).day.count().reset_index().rename(columns={'day':column_name2})
            df_tmp = df_tmp.merge(act_hist, on='user_id',how='left')
            df_tmp = df_tmp.merge(act_lastweek, on='user_id',how='left')
        df_tmp.fillna(0,inplace=True)
        # Action types 0-3 are summed as "good" actions, 4-5 as "bad" actions.
        df_tmp['user_hist_goodact_count'] = df_tmp['user_hist_act_0_count'] +  df_tmp['user_hist_act_1_count'] + df_tmp['user_hist_act_2_count']+ df_tmp['user_hist_act_3_count']
        df_tmp['user_hist_badact_count'] = df_tmp['user_hist_act_4_count'] + df_tmp['user_hist_act_5_count']
        # NOTE(review): a zero good-action count gives NaN (0/0) or inf (x/0); the later
        # fillna(0) replaces only the NaN values, so inf can remain in these ratios.
        df_tmp['user_hist_badact_div_good_act'] = df_tmp['user_hist_badact_count'] / df_tmp['user_hist_goodact_count']
        df_tmp['user_lastweek_goodact_count'] = df_tmp['user_lastweek_act_0_count'] +  df_tmp['user_lastweek_act_1_count'] + df_tmp['user_lastweek_act_2_count']+ df_tmp['user_lastweek_act_3_count']
        df_tmp['user_lastweek_badact_count'] = df_tmp['user_lastweek_act_4_count'] + df_tmp['user_lastweek_act_5_count']
        df_tmp['user_lastweek_badact_div_good_act'] = df_tmp['user_lastweek_badact_count'] / df_tmp['user_lastweek_goodact_count']

        # Number of activity rows of the user on each page.
        for page in sorted(df_user_activity.page.unique()):
            column_name1 = 'user_hist_act_page_%s_count'%page
            column_name2 = 'user_lastweek_act_page_%s_count'%page
            page_hist = activity[activity.page == page].groupby(['user_id']).day.count().reset_index().rename(columns={'day':column_name1})
            page_lastweek = active_week[active_week.page == page].groupby(['user_id']).day.count().reset_index().rename(columns={'day':column_name2})
            df_tmp = df_tmp.merge(page_hist, on='user_id', how='left')
            df_tmp = df_tmp.merge(page_lastweek, on='user_id', how='left')
        # Number of distinct videos the user acted on.
        user_active_video_uniquecount = activity.drop_duplicates(['user_id','video_id']).groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_hist_act_video_uniquecount'})
        user_active_video_uniquecount_week = active_week.drop_duplicates(['user_id','video_id']).groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_lastweek_act_video_uniquecount'})
        df_tmp = df_tmp.merge(user_active_video_uniquecount, on='user_id', how='left')
        df_tmp = df_tmp.merge(user_active_video_uniquecount_week, on='user_id', how='left')
        # Number of distinct authors the user acted on.
        user_active_author_count = activity.drop_duplicates(['user_id','author_id']).groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_hist_act_author_count'})
        user_active_author_count_week = active_week.drop_duplicates(['user_id','author_id']).groupby(['user_id']).day.count().reset_index().rename(columns={'day':'user_lastweek_act_author_count'})
        df_tmp = df_tmp.merge(user_active_author_count, on='user_id', how='left')
        df_tmp = df_tmp.merge(user_active_author_count_week, on='user_id', how='left')
        # Mean and maximum number of activity rows per video for the user.
        user_video_act_counts_mean = activity.groupby(['user_id','video_id']).day.count().reset_index().groupby(['user_id']).day.mean().reset_index().rename(columns={'day':'user_hist_act_video_meancount'})
        user_video_act_counts_max = activity.groupby(['user_id','video_id']).day.count().reset_index().groupby(['user_id']).day.max().reset_index().rename(columns={'day':'user_hist_act_video_maxcount'})
        df_tmp = df_tmp.merge(user_video_act_counts_mean, on='user_id', how='left')
        df_tmp = df_tmp.merge(user_video_act_counts_max, on='user_id', how='left')
        df_tmp.fillna(0,inplace=True)
        # Days since the last launch / upload / activity (-1 when there is none).
        user_last_launch = launch_hist.groupby('user_id').day.max().reset_index().rename(columns={'day':'user_last_launch_date'})
        user_last_video = video_hist.groupby('user_id').day.max().reset_index().rename(columns={'day':'user_last_video_date'})
        user_last_active = activity.groupby('user_id').day.max().reset_index().rename(columns={'day':'user_last_act_date'})
        df_tmp = df_tmp.merge(user_last_launch, on='user_id', how='left').fillna(-1)
        df_tmp = df_tmp.merge(user_last_video, on='user_id', how='left').fillna(-1)
        df_tmp = df_tmp.merge(user_last_active, on='user_id', how='left').fillna(-1)
        df_tmp['user_last_launch_dist'] = df_tmp['user_last_launch_date'].map(lambda x: start - x if x!=-1 else -1)
        df_tmp['user_last_video_dist'] = df_tmp['user_last_video_date'].map(lambda x:start - x if x!=-1 else -1)
        df_tmp['user_last_act_dist'] = df_tmp['user_last_act_date'].map(lambda x:start - x if x!=-1 else -1)
        # Actions received on the user's own videos (grouped by author_id): history and
        # last week, with per-type counts and rates.
        # NOTE(review): merging on user_id == author_id assumes both share one id space.
        df_video_act = activity.groupby(['author_id','action_type']).day.count().reset_index().pivot(index='author_id',columns='action_type',values='day').fillna(0)
        df_video_act_week = active_week.groupby(['author_id','action_type']).day.count().reset_index().pivot(index='author_id',columns='action_type',values='day').fillna(0)
        # The good-action accumulator is 0 when the row sum is taken, so it adds nothing to it.
        df_video_act['user_hist_video_goodact_sum'] = 0
        df_video_act['user_hist_video_activity_sum'] = df_video_act.sum(axis=1)
        df_video_act_week['user_lastweek_video_activity_sum'] = df_video_act_week.sum(axis=1)
        df_video_act_week['user_lastweek_video_goodact_sum'] = 0
        for col in df_video_act.columns:
            # String-named columns are the aggregates added above; only the pivoted
            # action_type columns are renamed and turned into rates.
            if type(col) == str:
                continue
            if col < 4:
                df_video_act['user_hist_video_goodact_sum'] += df_video_act[col]
            df_video_act.rename(columns={col:'user_hist_video_activity_'+str(col)+'_count'},inplace=True)
            df_video_act['user_hist_video_activity_'+str(col)+'_rate'] = df_video_act['user_hist_video_activity_'+str(col)+'_count'] / df_video_act['user_hist_video_activity_sum']
        df_video_act.fillna(0, inplace=True)
        df_video_act['user_hist_video_goodact_rate'] = df_video_act['user_hist_video_goodact_sum'] / df_video_act['user_hist_video_activity_sum']
        for col in df_video_act_week.columns:
            if type(col) == str:
                continue
            if col < 4:
                df_video_act_week['user_lastweek_video_goodact_sum'] += df_video_act_week[col]
            df_video_act_week.rename(columns={col:'user_lastweek_video_activity_'+str(col)+'_count'}, inplace=True)
            df_video_act_week['user_lastweek_video_activity_'+str(col)+'_rate'] = df_video_act_week['user_lastweek_video_activity_'+str(col)+'_count'] / df_video_act_week['user_lastweek_video_activity_sum']
        df_video_act_week.fillna(0, inplace=True)
        df_video_act_week['user_lastweek_video_goodact_rate'] = df_video_act_week['user_lastweek_video_goodact_sum'] / df_video_act_week['user_lastweek_video_activity_sum']
        df_video_act = df_video_act.reset_index()
        df_video_act_week = df_video_act_week.reset_index()
        df_tmp = df_tmp.merge(df_video_act, left_on='user_id', right_on='author_id', how='left').fillna(0)
        del df_tmp['author_id']
        df_tmp = df_tmp.merge(df_video_act_week, left_on='user_id', right_on='author_id', how='left').fillna(0)
        del df_tmp['author_id']
        # Per page, for the user's own videos: count of unique (page, video_id) pairs,
        # then the raw count of activity rows (history and last week).
        video_unique_page_count = activity.drop_duplicates(['page','video_id']).groupby(['author_id','page']).day.count().reset_index().pivot(index='author_id',columns='page',values='day').fillna(0)
        for col in video_unique_page_count.columns:
            video_unique_page_count.rename(columns={col: 'user_video_page_'+str(col)+'_uniquecount'},inplace=True)
        video_unique_page_count = video_unique_page_count.reset_index()
        df_tmp = df_tmp.merge(video_unique_page_count, left_on='user_id', right_on='author_id', how='left').fillna(0)
        del df_tmp['author_id']
        video_page_count = activity.groupby(['author_id','page']).day.count().reset_index().pivot(index='author_id',columns='page',values='day').fillna(0)
        for col in video_page_count.columns:
            video_page_count.rename(columns={col:'user_video_page_'+str(col)+'_count'},inplace=True)
        video_page_count = video_page_count.reset_index()
        df_tmp = df_tmp.merge(video_page_count, left_on='user_id', right_on='author_id', how='left').fillna(0)
        del df_tmp['author_id']
        video_page_count_week = active_week.groupby(['author_id','page']).day.count().reset_index().pivot(index='author_id',columns='page',values='day').fillna(0)
        for col in video_page_count_week.columns:
            video_page_count_week.rename(columns={col:'user_video_page_'+str(col)+'_lastweek_count'}, inplace=True)
        video_page_count_week = video_page_count_week.reset_index()
        df_tmp = df_tmp.merge(video_page_count_week, left_on='user_id', right_on='author_id', how='left').fillna(0)
        del df_tmp['author_id']

        # Binary (0/1) features derived from the counts above, plus "how many distinct
        # types" counters that are accumulated in the loops below.
        df_tmp['user_hist_launch'] = df_tmp['user_hist_launchday'].map(lambda x: int(x>0))
        df_tmp['user_lastweek_launch'] = df_tmp['user_lastweek_launchday'].map(lambda x: int(x>0))
        df_tmp['user_hist_video'] = df_tmp['user_hist_videocount'].map(lambda x: int(x>0))
        df_tmp['user_lastweek_video'] = df_tmp['user_lastweek_videocount'].map(lambda x: int(x>0))
        df_tmp['user_hist_act'] = df_tmp['user_hist_actcount'].map(lambda x: int(x>0))
        df_tmp['user_lastweek_act'] = df_tmp['user_lastweek_actcount'].map(lambda x: int(x>0))
        df_tmp['user_hist_act_types'] = 0#todo
        df_tmp['user_lastweek_act_types'] = 0
        df_tmp['user_hist_video_activity_types'] = 0
        df_tmp['user_lastweek_video_activity_types'] = 0
        df_tmp['user_hist_page_types'] = 0
        df_tmp['user_lastweek_page_types'] = 0
        df_tmp['user_hist_video_page_types'] = 0
        df_tmp['user_lastweek_video_page_types'] = 0
        for act in sorted(df_user_activity.action_type.unique()):
            # Pivot-derived count columns may be absent, hence the membership checks.
            if 'user_hist_act_%d_count'%act in df_tmp.columns:
                df_tmp['user_hist_act_%d'%act] = df_tmp['user_hist_act_%d_count'%act].map(lambda x: int(x>0))
                df_tmp['user_hist_act_types'] += df_tmp['user_hist_act_%d'%act]
            if 'user_lastweek_act_%d_count'%act in df_tmp.columns:
                df_tmp['user_lastweek_act_%d'%act] = df_tmp['user_lastweek_act_%d_count'%act].map(lambda x: int(x>0))
                df_tmp['user_lastweek_act_types'] += df_tmp['user_lastweek_act_%d'%act]
            if 'user_hist_video_activity_%s_count'%act in df_tmp.columns:
                df_tmp['user_hist_video_activity_%s'%act] = df_tmp['user_hist_video_activity_%s_count'%act].map(lambda x: int(x>0))
                df_tmp['user_hist_video_activity_types'] += df_tmp['user_hist_video_activity_%s'%act]
            if 'user_lastweek_video_activity_%s_count'%act in df_tmp.columns:
                df_tmp['user_lastweek_video_activity_%s'%act] = df_tmp['user_lastweek_video_activity_%s_count'%act].map(lambda x: int(x>0))
                df_tmp['user_lastweek_video_activity_types'] += df_tmp['user_lastweek_video_activity_%s'%act]

        df_tmp['user_hist_badact'] = df_tmp['user_hist_badact_count'].map(lambda x:int(x>0))

        for page in sorted(df_user_activity.page.unique()):
            if 'user_hist_act_page_%s_count'%page in df_tmp.columns:
                df_tmp['user_hist_act_page_%s'%page] = df_tmp['user_hist_act_page_%s_count'%page].map(lambda x: int(x>0))
                df_tmp['user_hist_page_types'] += df_tmp['user_hist_act_page_%s'%page]
            if 'user_lastweek_act_page_%s_count'%page in df_tmp.columns:
                df_tmp['user_lastweek_act_page_%s'%page] = df_tmp['user_lastweek_act_page_%s_count'%page].map(lambda x: int(x>0))
                df_tmp['user_lastweek_page_types'] += df_tmp['user_lastweek_act_page_%s'%page]
            if 'user_video_page_%s_count'%page in df_tmp.columns:
                df_tmp['user_video_page_%s'%page] = df_tmp['user_video_page_%s_count'%page].map(lambda x: int(x>0))
                df_tmp['user_hist_video_page_types'] += df_tmp['user_video_page_%s'%page]
            if 'user_video_page_%s_lastweek_count'%page in df_tmp.columns:
                df_tmp['user_video_page_%s_lastweek'%page] = df_tmp['user_video_page_%s_lastweek_count'%page].map(lambda x: int(x>0))
                df_tmp['user_lastweek_video_page_types'] += df_tmp['user_video_page_%s_lastweek'%page]
        # Difference between the number of distinct types over the whole history and last week.
        df_tmp['user_hist_lastweek_act_types_dist'] = df_tmp['user_hist_act_types'] - df_tmp['user_lastweek_act_types']
        df_tmp['user_hist_lastweek_video_activity_types_dist'] = df_tmp['user_hist_video_activity_types'] - df_tmp['user_lastweek_video_activity_types']
        df_tmp['user_hist_lastweek_page_types_dist'] = df_tmp['user_hist_page_types'] - df_tmp['user_lastweek_page_types']
        df_tmp['user_hist_lastweek_video_page_types_dist'] = df_tmp['user_hist_video_page_types'] - df_tmp['user_lastweek_video_page_types']
        # Sanity check: the left merges must not have changed the number of rows.
        assert(df_tmp.shape[0]==origin_shape)
        # The newest week's frame is placed in front of the frames collected so far.
        ret = pd.concat([df_tmp,ret])
    return ret
# Compute the history features for every (user_id, data_weeknum) row of the sample table.
df_merge = get_history(df)


  n, total = _generalised_sum(data, lambda x: ((x-m)/s)**4)
  n, total = _generalised_sum(data, lambda x: ((x-m)/s)**3)


In [16]:
# Attach the history features to the samples; anything still missing becomes 0.
df = pd.merge(df, df_merge, how='left', on=['user_id', 'data_weeknum'])
df = df.fillna(0)

In [17]:
# Usage-frequency features: counts per day since registration ("hist") and counts
# per day over a 7-day week ("lastweek").
registered_under_week = df.user_reg_days < 7

# Launch and video-creation frequencies, plus the last-week-minus-history difference.
for topic, counter in (('launch', 'launchday'), ('video', 'videocount')):
    hist_freq_col = 'user_hist_%s_freq' % topic
    week_freq_col = 'user_lastweek_%s_freq' % topic
    df[hist_freq_col] = df['user_hist_' + counter] / df['user_reg_days']
    df[week_freq_col] = df['user_lastweek_' + counter] / 7
    # Users registered for fewer than 7 days reuse their history frequency.
    df.loc[registered_under_week, week_freq_col] = df.loc[registered_under_week, hist_freq_col]
    df['user_lastweek_hist_%s_freq_dist' % topic] = df[week_freq_col] - df[hist_freq_col]

# Activity frequencies: the overall count first, then one per action type 0-5.
act_freq_pairs = [('actcount', 'act_freq')]
act_freq_pairs += [('act_%d_count' % code, 'act_%d_freq' % code) for code in range(6)]

for counter, freq in act_freq_pairs:
    df['user_hist_' + freq] = df['user_hist_' + counter] / df['user_reg_days']

for counter, freq in act_freq_pairs:
    df['user_lastweek_' + freq] = df['user_lastweek_' + counter] / 7

# Same fallback as above for users registered for fewer than 7 days.
for counter, freq in act_freq_pairs:
    df.loc[registered_under_week, 'user_lastweek_' + freq] = df.loc[registered_under_week, 'user_hist_' + freq]

In [18]:
# 0/1 flags: is user_last_launch_dist strictly greater than the historical
# max / mean / median of the no-launch-days columns?
for stat in ('max', 'mean', 'median'):
    exceeds = df['user_last_launch_dist'] > df['user_%s_no_launch_days_hist' % stat]
    df['user_last_launch_dist_morethan_hist' + stat] = exceeds.astype(int)

In [19]:
# Last-week vs. history comparison of the "*_types" columns.
type_keys = ('act_types', 'video_activity_types', 'page_types', 'video_page_types')

# Ratio lastweek / hist; NaN results (e.g. 0 / 0) are replaced by 0.
for key in type_keys:
    ratio = df['user_lastweek_' + key] / df['user_hist_' + key]
    df['user_lastweek_%s_hist_rate' % key] = ratio.fillna(0)

# Plain difference lastweek - hist.
for key in type_keys:
    df['user_lastweek_%s_hist_dist' % key] = df['user_lastweek_' + key] - df['user_hist_' + key]

In [20]:
# Activity days and video-creation days divided by launch days, computed once
# for the historical window and once for the last week.
for period in ('hist', 'lastweek'):
    launch_days = df['user_launch_days_' + period]
    for event in ('activity', 'createvideo'):
        target = 'user_%s_div_launch_days_%s' % (event, period)
        df[target] = df['user_%s_days_%s' % (event, period)] / launch_days

In [21]:
# Difference features: for every pure "lastweek" column that has a "hist"
# counterpart, add <lastweek_hist name>_dist = lastweek - hist.
for col in df.columns:
    # Only plain last-week columns qualify; skip anything already derived
    # from history or already a difference.
    if 'lastweek' not in col or 'hist' in col or 'dist' in col:
        continue
    if col.replace('lastweek', 'hist') not in df.columns:
        continue
    colname = col.replace('lastweek', 'lastweek_hist') + '_dist'
    if colname in df.columns:
        # Already created by an earlier cell; report it and leave it as is.
        print(colname, 'exist.')
        continue
    df[colname] = df[col] - df[col.replace('lastweek', 'hist')]

user_lastweek_hist_launch_freq_dist exist.
user_lastweek_hist_video_freq_dist exist.


In [22]:
# Ratios of last-week values to their historical counterparts (days / days,
# times / times). 0 / 0 yields NaN here; the later df.fillna(-1) handles it.
df['user_lastweek_launch_days_div_hist'] = df['user_launch_days_lastweek'] / df['user_launch_days_hist']
df['user_lastweek_video_days_div_hist'] = df['user_createvideo_days_lastweek'] / df['user_createvideo_days_hist']
df['user_lastweek_video_count_div_hist'] = df['user_createvideo_times_lastweek'] / df['user_createvideo_times_hist']
# Fix: the denominator used to be user_activity_times_hist, which divided a
# number of days by a number of actions. Every sibling ratio pairs like with
# like, so activity days are now divided by historical activity days.
# NOTE(review): the column name keeps its original 'dic' spelling (likely a
# typo for 'div') so downstream consumers of the feature CSV are unaffected.
df['user_lastweek_act_days_dic_hist'] = df['user_activity_days_lastweek'] / df['user_activity_days_hist']
df['user_lastweek_act_count_div_hist'] = df['user_activity_times_lastweek'] / df['user_activity_times_hist']

In [23]:
# Distance between data_weekstart and the mean / min event date, for the
# historical window and for the last week, per event kind.
# Creation order is launch, createvideo, activity with mean before min.
for period in ('hist', 'lastweek'):
    for event in ('launch', 'createvideo', 'activity'):
        for stat in ('mean', 'min'):
            source = 'user_%s_%s_date_%s' % (stat, event, period)
            target = 'user_%s_%s_%s_date_dist' % (period, stat, event)
            df[target] = df.data_weekstart - df[source]

# Share of the registration span covered by each event kind's date range.
df['user_launch_range_percent'] = df['user_range_launch_date_hist'] / df['user_reg_days']
df['user_activity_range_percent'] = df['user_range_activity_date_hist'] / df['user_reg_days']
# Fix: this feature used to be computed from user_range_activity_date_hist,
# which made it an exact duplicate of user_activity_range_percent.
# NOTE(review): assumes get_history emits user_range_createvideo_date_hist,
# in line with the launch/activity range columns - confirm.
df['user_createvideo_range_percent'] = df['user_range_createvideo_date_hist'] / df['user_reg_days']

In [24]:
# Replace the remaining NaN values (e.g. from 0 / 0 divisions above) with -1.
# NOTE(review): fillna does not touch +/-inf, so any x / 0 results from the
# ratio features are written out unchanged.
df = df.fillna(-1)

In [25]:
# Persist the finished feature table as CSV, without the index column.
df.to_csv(path_or_buf='../features/baseline_features9.csv', index=False)