In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
df_user_reg = pd.read_csv('./data/b/user_register_log.txt',sep='\t',header=None,names=['user_id','register_day','register_type','device_type'])
df_app_launch = pd.read_csv('./data/b/app_launch_log.txt',sep='\t',header=None,names=['user_id','day'])
df_video_create = pd.read_csv('./data/b/video_create_log.txt',sep='\t',header=None,names=['user_id','day'])
df_user_activity = pd.read_csv('./data/b/user_activity_log.txt',sep='\t',header=None,names=['user_id','day','page','video_id','author_id','action_type'])

## 趋势特征

In [4]:
def get_active_users(d_start,d_end, hist_registers=None):
    actives = set()
    for df in [df_app_launch, df_video_create, df_user_activity]:
        actives.update(df[(df.day>=d_start) & (df.day<=d_end)].user_id.unique())
    if hist_registers:
        actives = actives&set(hist_registers)
    return actives

def build_train(train_weeks=[(10,16),(17,23),(24,30),(31,37)], train_end=30):
# def build_train(train_weeks=[(17,23),(24,30),(31,37)], train_end=30):
    df_train = pd.DataFrame()
    week_num = 0
    for week_start,week_end in train_weeks:
        # 选择这周之前注册的用户
        df_user = df_user_reg[df_user_reg.register_day<week_start]
        df_tmp = pd.DataFrame(df_user['user_id']).drop_duplicates(['user_id'])
        df_tmp['data_weeknum'] = week_num
        df_tmp['data_weekstart'] = week_start
        df_tmp['data_weekend'] = week_end
        if week_start <= train_end:
            # 查看用户活跃
            active_users = get_active_users(week_start, week_end, hist_registers=df_tmp.user_id.unique().tolist())
            df_tmp['label'] = df_tmp.user_id.map(lambda x:int(x in active_users))
        else:
            df_tmp['label'] = -1
        df_train = pd.concat([df_train, df_tmp])
        week_num += 1
    return df_train
df = build_train()

#### 总趋势

In [66]:
# def get_trend(x, i):
#     if i < 7:
#         day = x.data_weekstart - i
#         user_curr_count = df_user_activity[(df_user_activity.user_id == x.user_id) & (df_user_activity.day >= day) & (df_user_activity.day < x.data_weekstart)].shape[0]
#         user_his_count = df_user_activity[(df_user_activity.user_id == x.user_id) & (df_user_activity.day >= x.data_weekstart - 7) & (df_user_activity.day < day)].shape[0]
#         user_curr_count = user_curr_count / i
#         user_his_count = user_his_count / (7 - i)
#         return (user_curr_count + 1) / (user_his_count + 1)
#     else:
#         day = x.data_weekstart - i
#         user_curr_count = df_user_activity[(df_user_activity.user_id == x.user_id) & (df_user_activity.day >= day) & (df_user_activity.day < x.data_weekstart)].shape[0]
#         user_his_count = df_user_activity[(df_user_activity.user_id == x.user_id) & (df_user_activity.day >= x.data_weekstart - 14) & (df_user_activity.day < day)].shape[0]
#         return (user_curr_count + 1) / (user_his_count + 1)

In [5]:
def get_trend(x):
    tmp_list = []
    user = df_user_activity[df_user_activity.user_id == x.user_id]
    for i in range(1, 8):
        if i < 7:
            day = x.data_weekstart - i
            user_curr_count = user[(user.day >= day) & (user.day < x.data_weekstart)].shape[0]
            user_his_count = user[(user.day >= x.data_weekstart - 7) & (user.day < day)].shape[0]
            user_curr_count = user_curr_count / i
            user_his_count = user_his_count / (7 - i)
            tmp_list.append((user_curr_count + 1) / (user_his_count + 1))
        else:
            day = x.data_weekstart - i
            user_curr_count = user[(user.day >= day) & (user.day < x.data_weekstart)].shape[0]
            user_his_count = user[(user.day >= x.data_weekstart - 14) & (user.day < day)].shape[0]
            tmp_list.append((user_curr_count + 1) / (user_his_count + 1))
    return tmp_list

In [6]:
%%time
df['trend'] = df.apply(get_trend, axis=1)
df.head()

CPU times: user 1h 59min 37s, sys: 684 ms, total: 1h 59min 37s
Wall time: 1h 59min 38s


In [7]:
for i in range(1, 8)[::-1]:
    df['trend_' + str(i)] = df['trend'].apply(lambda x:x.pop())

In [8]:
df = df.drop(['trend'], axis=1)
df.head()

Unnamed: 0,user_id,data_weeknum,data_weekstart,data_weekend,label,trend_7,trend_6,trend_5,trend_4,trend_3,trend_2,trend_1
0,167777,0,10,16,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,886972,0,10,16,0,0.034483,1.0,1.0,1.0,1.0,1.0,1.0
2,921231,0,10,16,0,0.009615,1.0,1.0,1.0,1.0,1.0,1.0
3,904908,0,10,16,1,0.563025,0.32,0.389189,0.043478,0.057143,0.070423,0.083333
4,460291,0,10,16,1,2.177778,0.413793,0.392157,0.556034,0.214559,0.385714,0.847458


In [9]:
df.to_csv('./features/trend_df_b.csv', index=None)

#### 每种类型/页码趋势

In [10]:
def get_trend_act(x, j):
    tmp_list = []
    act = df_user_activity[(df_user_activity.user_id == x.user_id) & (df_user_activity.action_type == j)]
    if act.shape[0] == 0:
        tmp_list = [1, 1, 1, 1, 1, 1]
        day = x.data_weekstart - 7
        user_his_count = act[(act.day >= x.data_weekstart - 14) & (act.day < day)].shape[0]
        tmp_list.append(1 / (user_his_count + 1))
    else:
        for i in range(1, 8):
            if i < 7:
                day = x.data_weekstart - i
                user_curr_count = act[(act.day >= day) & (act.day < x.data_weekstart)].shape[0]
                user_his_count = act[(act.day >= x.data_weekstart - 7) & (act.day < day)].shape[0]
                user_curr_count = user_curr_count / i
                user_his_count = user_his_count / (7 - i)
                tmp_list.append((user_curr_count + 1) / (user_his_count + 1))
            else:
                day = x.data_weekstart - i
                user_curr_count = act[(act.day >= day) & (act.day < x.data_weekstart)].shape[0]
                user_his_count = act[(act.day >= x.data_weekstart - 14) & (act.day < day)].shape[0]
                tmp_list.append((user_curr_count + 1) / (user_his_count + 1))
    return tmp_list
    
def get_trend_page(x, j):
    tmp_list = []
    page = df_user_activity[(df_user_activity.user_id == x.user_id) & (df_user_activity.page == j)]
    if page.shape[0] == 0:
        tmp_list = [1, 1, 1, 1, 1, 1]
        day = x.data_weekstart - 7
        user_his_count = page[(page.day >= x.data_weekstart - 14) & (page.day < day)].shape[0]
        tmp_list.append(1 / (user_his_count + 1))
    else:
        for i in range(1, 8):
            if i < 7:
                day = x.data_weekstart - i
                user_curr_count = page[(page.day >= day) & (page.day < x.data_weekstart)].shape[0]
                user_his_count = page[(page.day >= x.data_weekstart - 7) & (page.day < day)].shape[0]
                user_curr_count = user_curr_count / i
                user_his_count = user_his_count / (7 - i)
                tmp_list.append((user_curr_count + 1) / (user_his_count + 1))
            else:
                day = x.data_weekstart - i
                user_curr_count = page[(page.day >= day) & (page.day < x.data_weekstart)].shape[0]
                user_his_count = page[(page.day >= x.data_weekstart - 14) & (page.day < day)].shape[0]
                tmp_list.append((user_curr_count + 1) / (user_his_count + 1))
    return tmp_list

In [None]:
for j in tqdm_notebook(df_user_activity.action_type.unique()):
    df['trend_act_' + str(j)] = df.apply(get_trend_act, args=(j,), axis=1)

for j in tqdm_notebook(df_user_activity.page.unique()):
    df['trend_page_' + str(j)] = df.apply(get_trend_page, args=(j,), axis=1)

In [19]:
%%time
for i in range(1, 8)[::-1]:
    for j in df_user_activity.action_type.unique():
        df['trend_act_' + str(j) + '_' + str(i)] = df['trend_act_' + str(j)].apply(lambda x:x.pop())
    for j in df_user_activity.page.unique():
        df['trend_page_' + str(j) + '_' + str(i)] = df['trend_page_' + str(j)].apply(lambda x:x.pop())

CPU times: user 15.5 s, sys: 193 ms, total: 15.7 s
Wall time: 21.8 s


In [22]:
df.shape
df.head()

(125057, 100)

Unnamed: 0,user_id,data_weeknum,data_weekstart,data_weekend,label,trend_7,trend_6,trend_5,trend_4,trend_3,trend_2,trend_1,trend_act_0,trend_act_2,trend_act_3,trend_act_1,trend_act_4,trend_act_5,trend_page_3,trend_page_1,trend_page_4,trend_page_0,trend_page_2,trend_act_0_7,trend_act_2_7,trend_act_3_7,trend_act_1_7,trend_act_4_7,trend_act_5_7,trend_page_3_7,trend_page_1_7,trend_page_4_7,trend_page_0_7,trend_page_2_7,trend_act_0_6,trend_act_2_6,trend_act_3_6,trend_act_1_6,trend_act_4_6,trend_act_5_6,trend_page_3_6,trend_page_1_6,trend_page_4_6,trend_page_0_6,trend_page_2_6,trend_act_0_5,trend_act_2_5,trend_act_3_5,trend_act_1_5,trend_act_4_5,trend_act_5_5,trend_page_3_5,trend_page_1_5,trend_page_4_5,trend_page_0_5,trend_page_2_5,trend_act_0_4,trend_act_2_4,trend_act_3_4,trend_act_1_4,trend_act_4_4,trend_act_5_4,trend_page_3_4,trend_page_1_4,trend_page_4_4,trend_page_0_4,trend_page_2_4,trend_act_0_3,trend_act_2_3,trend_act_3_3,trend_act_1_3,trend_act_4_3,trend_act_5_3,trend_page_3_3,trend_page_1_3,trend_page_4_3,trend_page_0_3,trend_page_2_3,trend_act_0_2,trend_act_2_2,trend_act_3_2,trend_act_1_2,trend_act_4_2,trend_act_5_2,trend_page_3_2,trend_page_1_2,trend_page_4_2,trend_page_0_2,trend_page_2_2,trend_act_0_1,trend_act_2_1,trend_act_3_1,trend_act_1_1,trend_act_4_1,trend_act_5_1,trend_page_3_1,trend_page_1_1,trend_page_4_1,trend_page_0_1,trend_page_2_1
0,744025,0,10,16,0,1.756098,12.833333,15.2,18.75,24.666667,0.246479,0.077922,[],[],[],[],[],[],[],[],[],[],[],1.775,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,2.258065,0.090909,12.666667,1.0,1.0,1.166667,1.0,1.0,1.333333,1.0,1.0,12.5,1.0,15.0,1.0,1.0,1.2,1.0,1.0,1.4,1.0,1.0,14.8,1.0,18.5,1.0,1.0,1.25,1.0,1.0,1.5,1.0,1.0,18.25,1.0,24.333333,1.0,1.0,1.333333,1.0,1.0,1.666667,1.0,1.0,24.0,1.0,0.25,1.0,1.0,0.833333,1.0,1.0,0.714286,1.0,1.0,0.253623,1.0,0.078947,1.0,1.0,0.857143,1.0,1.0,0.75,1.0,1.0,0.08,1.0
1,1270299,0,10,16,0,0.011628,1.0,1.0,1.0,1.0,1.0,1.0,[],[],[],[],[],[],[],[],[],[],[],0.012195,1.0,0.2,1.0,1.0,1.0,0.052632,0.018182,1.0,0.071429,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,571220,0,10,16,0,0.013889,1.0,1.0,1.0,1.0,1.0,1.0,[],[],[],[],[],[],[],[],[],[],[],0.014085,0.5,1.0,1.0,1.0,1.0,0.058824,1.0,1.0,0.03125,0.04,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1308501,0,10,16,1,0.743902,0.070922,0.158333,0.047619,0.0625,0.076923,0.090909,[],[],[],[],[],[],[],[],[],[],[],0.512195,1.0,2.0,19.0,1.0,1.0,0.972222,0.65,1.0,0.4,0.25,0.109195,1.0,1.166667,0.052632,1.0,1.0,0.028571,0.277778,1.0,0.5,1.0,0.24,1.0,1.2,0.1,1.0,1.0,0.055556,0.584615,1.0,0.666667,1.0,0.068182,1.0,0.75,0.142857,1.0,1.0,0.081081,0.107143,1.0,0.75,1.0,0.088889,1.0,0.8,0.181818,1.0,1.0,0.105263,0.137931,1.0,0.8,1.0,0.108696,1.0,0.833333,0.217391,1.0,1.0,0.128205,0.166667,1.0,0.833333,1.0,0.12766,1.0,0.857143,0.25,1.0,1.0,0.15,0.193548,1.0,0.857143,1.0
4,745554,0,10,16,1,4.5,5.333333,6.2,0.796875,0.866667,0.16129,0.1875,[],[],[],[],[],[],[],[],[],[],[],4.166667,2.0,1.0,2.0,1.0,1.0,1.0,0.5,1.0,9.0,3.8,5.0,1.166667,1.0,1.166667,1.0,1.0,1.0,1.0,1.0,2.333333,4.0,5.8,1.2,1.0,1.2,1.0,1.0,1.0,1.0,1.0,2.6,4.6,0.910714,0.75,1.0,0.75,1.0,1.0,1.0,1.0,1.0,3.0,0.421875,0.962963,0.8,1.0,0.8,1.0,1.0,1.0,1.0,1.0,3.666667,0.333333,0.172414,0.833333,1.0,0.833333,1.0,1.0,1.0,1.0,1.0,0.384615,0.217391,0.2,0.857143,1.0,0.857143,1.0,1.0,1.0,1.0,1.0,0.428571,0.25


In [23]:
df = df.drop(['trend_act_0','trend_act_2','trend_act_3','trend_act_1','trend_act_4','trend_act_5','trend_page_3','trend_page_1','trend_page_4','trend_page_0','trend_page_2'], axis=1)
df.head()

Unnamed: 0,user_id,data_weeknum,data_weekstart,data_weekend,label,trend_7,trend_6,trend_5,trend_4,trend_3,trend_2,trend_1,trend_act_0_7,trend_act_2_7,trend_act_3_7,trend_act_1_7,trend_act_4_7,trend_act_5_7,trend_page_3_7,trend_page_1_7,trend_page_4_7,trend_page_0_7,trend_page_2_7,trend_act_0_6,trend_act_2_6,trend_act_3_6,trend_act_1_6,trend_act_4_6,trend_act_5_6,trend_page_3_6,trend_page_1_6,trend_page_4_6,trend_page_0_6,trend_page_2_6,trend_act_0_5,trend_act_2_5,trend_act_3_5,trend_act_1_5,trend_act_4_5,trend_act_5_5,trend_page_3_5,trend_page_1_5,trend_page_4_5,trend_page_0_5,trend_page_2_5,trend_act_0_4,trend_act_2_4,trend_act_3_4,trend_act_1_4,trend_act_4_4,trend_act_5_4,trend_page_3_4,trend_page_1_4,trend_page_4_4,trend_page_0_4,trend_page_2_4,trend_act_0_3,trend_act_2_3,trend_act_3_3,trend_act_1_3,trend_act_4_3,trend_act_5_3,trend_page_3_3,trend_page_1_3,trend_page_4_3,trend_page_0_3,trend_page_2_3,trend_act_0_2,trend_act_2_2,trend_act_3_2,trend_act_1_2,trend_act_4_2,trend_act_5_2,trend_page_3_2,trend_page_1_2,trend_page_4_2,trend_page_0_2,trend_page_2_2,trend_act_0_1,trend_act_2_1,trend_act_3_1,trend_act_1_1,trend_act_4_1,trend_act_5_1,trend_page_3_1,trend_page_1_1,trend_page_4_1,trend_page_0_1,trend_page_2_1
0,744025,0,10,16,0,1.756098,12.833333,15.2,18.75,24.666667,0.246479,0.077922,1.775,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,2.258065,0.090909,12.666667,1.0,1.0,1.166667,1.0,1.0,1.333333,1.0,1.0,12.5,1.0,15.0,1.0,1.0,1.2,1.0,1.0,1.4,1.0,1.0,14.8,1.0,18.5,1.0,1.0,1.25,1.0,1.0,1.5,1.0,1.0,18.25,1.0,24.333333,1.0,1.0,1.333333,1.0,1.0,1.666667,1.0,1.0,24.0,1.0,0.25,1.0,1.0,0.833333,1.0,1.0,0.714286,1.0,1.0,0.253623,1.0,0.078947,1.0,1.0,0.857143,1.0,1.0,0.75,1.0,1.0,0.08,1.0
1,1270299,0,10,16,0,0.011628,1.0,1.0,1.0,1.0,1.0,1.0,0.012195,1.0,0.2,1.0,1.0,1.0,0.052632,0.018182,1.0,0.071429,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,571220,0,10,16,0,0.013889,1.0,1.0,1.0,1.0,1.0,1.0,0.014085,0.5,1.0,1.0,1.0,1.0,0.058824,1.0,1.0,0.03125,0.04,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1308501,0,10,16,1,0.743902,0.070922,0.158333,0.047619,0.0625,0.076923,0.090909,0.512195,1.0,2.0,19.0,1.0,1.0,0.972222,0.65,1.0,0.4,0.25,0.109195,1.0,1.166667,0.052632,1.0,1.0,0.028571,0.277778,1.0,0.5,1.0,0.24,1.0,1.2,0.1,1.0,1.0,0.055556,0.584615,1.0,0.666667,1.0,0.068182,1.0,0.75,0.142857,1.0,1.0,0.081081,0.107143,1.0,0.75,1.0,0.088889,1.0,0.8,0.181818,1.0,1.0,0.105263,0.137931,1.0,0.8,1.0,0.108696,1.0,0.833333,0.217391,1.0,1.0,0.128205,0.166667,1.0,0.833333,1.0,0.12766,1.0,0.857143,0.25,1.0,1.0,0.15,0.193548,1.0,0.857143,1.0
4,745554,0,10,16,1,4.5,5.333333,6.2,0.796875,0.866667,0.16129,0.1875,4.166667,2.0,1.0,2.0,1.0,1.0,1.0,0.5,1.0,9.0,3.8,5.0,1.166667,1.0,1.166667,1.0,1.0,1.0,1.0,1.0,2.333333,4.0,5.8,1.2,1.0,1.2,1.0,1.0,1.0,1.0,1.0,2.6,4.6,0.910714,0.75,1.0,0.75,1.0,1.0,1.0,1.0,1.0,3.0,0.421875,0.962963,0.8,1.0,0.8,1.0,1.0,1.0,1.0,1.0,3.666667,0.333333,0.172414,0.833333,1.0,0.833333,1.0,1.0,1.0,1.0,1.0,0.384615,0.217391,0.2,0.857143,1.0,0.857143,1.0,1.0,1.0,1.0,1.0,0.428571,0.25


In [24]:
df.to_csv('./features/trend_df.csv', index=None)