In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime
from deep_tools import f
from deep_tools import DataGenerator

In [2]:
# Load the four tab-separated log files; column labels are supplied through `names`.
register = pd.read_csv(
    'user_register_log.txt.gz',
    sep='\t',
    names=['user_id', 'register_day', 'register_type', 'device_type'],
)
launch = pd.read_csv(
    'app_launch_log.txt.gz',
    sep='\t',
    names=['user_id', 'launch_day'],
)
create = pd.read_csv(
    'video_create_log.txt.gz',
    sep='\t',
    names=['user_id', 'create_day'],
)
activity = pd.read_csv(
    'user_activity_log.txt.gz',
    sep='\t',
    names=['user_id', 'act_day', 'page', 'video_id', 'author_id', 'act_type'],
)

In [3]:
register.head()

Unnamed: 0,user_id,register_day,register_type,device_type
0,744025,1,1,283
1,1270299,1,1,259
2,571220,1,1,2
3,1308501,1,0,23
4,745554,1,2,0


![title](./img/1.png)

In [4]:
launch.head()

Unnamed: 0,user_id,launch_day
0,383135,1
1,330986,4
2,330986,9
3,330986,11
4,330986,12


![title](./img/2.png)

In [5]:
create.head()

Unnamed: 0,user_id,create_day
0,720497,1
1,720497,1
2,720497,1
3,1075211,6
4,1075211,12


![title](./img/3.png)

In [6]:
activity.head()

Unnamed: 0,user_id,act_day,page,video_id,author_id,act_type
0,1062323,22,3,2877472,880271,0
1,639898,17,3,740662,210200,0
2,1260200,5,3,3332414,162866,0
3,817201,22,3,1129617,530246,0
4,817201,23,3,1129617,530246,0


![title](./img/4.png)

计算序列长度，持续时间=数据总时间-注册时间

In [7]:
register['seq_length'] = 31-register['register_day']
register.head()

Unnamed: 0,user_id,register_day,register_type,device_type,seq_length
0,744025,1,1,283,30
1,1270299,1,1,259,30
2,571220,1,1,2,30
3,1308501,1,0,23,30
4,745554,1,2,0,30


构建字典，按序列长度（持续天数）对用户进行分组存储

In [8]:
user_queue = {i: [] for i in range(1, 31)}

In [9]:
# Bucket every user_id under its sequence length.
# Columns are addressed by name: integer keys such as row[-1] on a label-indexed
# Series are positional fallbacks that pandas 2.x deprecates, and iterrows() is slow.
for seq_len, uid in zip(register['seq_length'], register['user_id']):
    user_queue[seq_len].append(uid)

key表示序列长度（持续天数），value表示用户

In [10]:
user_queue

{1: [355948,
  1141243,
  885314,
  473037,
  1161976,
  1105249,
  1034711,
  444663,
  629789,
  684292,
  1220188,
  38479,
  1166260,
  739798,
  1023596,
  731282,
  759110,
  327708,
  1102438,
  466836,
  496107,
  1222898,
  166312,
  236360,
  473209,
  91356,
  393059,
  439146,
  956525,
  49020,
  1306827,
  1198470,
  857247,
  316571,
  136844,
  691785,
  1272376,
  265670,
  560000,
  757319,
  1133544,
  822853,
  979976,
  664211,
  480676,
  277894,
  919793,
  107804,
  1191738,
  1095109,
  431613,
  153984,
  398413,
  612132,
  356852,
  146009,
  78955,
  335070,
  719818,
  560879,
  541350,
  1138509,
  38011,
  304986,
  1289748,
  864223,
  642020,
  771941,
  307046,
  774769,
  1254790,
  1308445,
  1296278,
  817006,
  761905,
  663294,
  1170617,
  272899,
  238407,
  937861,
  1021300,
  1150167,
  118095,
  1218437,
  1035477,
  343686,
  720640,
  546495,
  174056,
  478759,
  1285841,
  707129,
  833944,
  499087,
  320094,
  621343,
  309432,
  5366

In [11]:
class user_seq:
    """Per-user daily feature matrix with helpers to fill it and derive labels.

    Row ``r`` of ``array`` corresponds to calendar day ``register_day + r``;
    every column holds one binary feature.
    """

    def __init__(self, register_day, seq_length, n_features):
        """Allocate an all-zero feature matrix for one user.

        Args:
            register_day: calendar day of registration; it maps to row 0.
            seq_length: number of rows (days) kept for this user.
            n_features: number of feature columns.
        """
        self.register_day = register_day
        self.seq_length = seq_length
        # seq_length x n_features matrix that the put_* methods fill in later.
        self.array = np.zeros([self.seq_length, n_features])
        self.array[0, 0] = 1  # column 0 flags the registration day itself
        self.page_rank = np.zeros([self.seq_length])
        self.pointer = 1  # not read anywhere in this class; kept for compatibility

    def _row(self, day):
        """Translate a calendar day into a row index of this user's window.

        Raises:
            IndexError: if the day falls before ``register_day`` or past the
                last tracked row. Without this check a negative offset would
                wrap around and silently write to a late row.
        """
        row = int(day) - self.register_day
        if not 0 <= row < self.seq_length:
            raise IndexError(
                'day {} is outside the window of a user registered on day {} '
                'with seq_length {}'.format(day, self.register_day, self.seq_length))
        return row

    def put_feature(self, feature_number, string):
        """Set column ``feature_number`` to 1 on every day listed in ``string``.

        Args:
            feature_number: column index to fill.
            string: comma-separated ``day:value`` pairs. Only the day is used;
                the feature is stored as a 0/1 indicator.
        """
        for item in string.split(','):
            day, _ = item.split(':')
            self.array[self._row(day), feature_number] = 1

    def put_PR(self, string):
        """Store the values of comma-separated ``day:value`` pairs in ``page_rank``."""
        for item in string.split(','):
            day, value = item.split(':')
            self.page_rank[self._row(day)] = float(value)

    def get_array(self):
        """Return the feature matrix (the same object, not a copy)."""
        return self.array

    def get_label(self):
        """Build the per-day label vector and return it.

        ``label[i]`` is 1 when any of the first 10 feature columns is non-zero
        on at least one of the 7 rows following row ``i``, else 0. The final 7
        rows have no complete look-ahead window and stay ``None``.
        """
        self.label = np.array([None]*self.seq_length)
        active = self.array[:, :10].sum(axis=1)
        for i in range(self.seq_length-7):
            # Look ahead at rows i+1 .. i+7: was the user active on any of them?
            self.label[i] = 1*(np.sum(active[i+1:i+8]) > 0)
        return self.label

In [12]:
n_features = 12
# One user_seq per registered user, keyed by user_id.
# Columns are read by name rather than through positional row[0] / row[-1] lookups,
# which are deprecated on label-indexed Series in pandas 2.x.
data = {
    uid: user_seq(register_day=reg_day, seq_length=seq_len, n_features=n_features)
    for uid, reg_day, seq_len in zip(
        register['user_id'], register['register_day'], register['seq_length'])
}

得到每个用户特征实例，组成字典

In [13]:
data

{744025: <__main__.user_seq at 0x1b1801544a8>,
 1270299: <__main__.user_seq at 0x1b180154470>,
 571220: <__main__.user_seq at 0x1b180154518>,
 1308501: <__main__.user_seq at 0x1b180154550>,
 745554: <__main__.user_seq at 0x1b180154588>,
 1031012: <__main__.user_seq at 0x1b1801545c0>,
 913297: <__main__.user_seq at 0x1b1801545f8>,
 266500: <__main__.user_seq at 0x1b180154630>,
 475120: <__main__.user_seq at 0x1b180154668>,
 547944: <__main__.user_seq at 0x1b1801546a0>,
 916655: <__main__.user_seq at 0x1b1801546d8>,
 719262: <__main__.user_seq at 0x1b180154710>,
 1026175: <__main__.user_seq at 0x1b180154748>,
 1140342: <__main__.user_seq at 0x1b180154780>,
 688100: <__main__.user_seq at 0x1b1801547b8>,
 1342459: <__main__.user_seq at 0x1b1801547f0>,
 926263: <__main__.user_seq at 0x1b180154828>,
 40710: <__main__.user_seq at 0x1b180154860>,
 246954: <__main__.user_seq at 0x1b180154898>,
 153579: <__main__.user_seq at 0x1b1801548d0>,
 161418: <__main__.user_seq at 0x1b180154908>,
 649526:

In [14]:
data[1308501].array

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.,

In [15]:
data[1308501].seq_length

30

In [16]:
data[1308501].page_rank

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

#### 登录信息

In [17]:
# Tag every launch record with 1 so that the per-(user, day) sum counts launches.
launch['launch'] = 1
launch_table = (
    launch
    .groupby(['user_id', 'launch_day'], as_index=False)
    .agg({'launch': 'sum'})
)
launch_table.head()

Unnamed: 0,user_id,launch_day,launch
0,16,13,1
1,16,14,1
2,16,15,1
3,16,18,1
4,16,19,1


In [18]:
launch_table['launch'].max()

1

In [19]:
def record_to_sequence(table):
    """Collapse per-day records into one ``day:value,day:value,...`` string per user.

    Args:
        table: DataFrame with exactly three columns, read positionally as
            (user id, day, value). The frame is not modified.

    Returns:
        DataFrame with columns ``user_id`` and ``string``; each string lists
        the user's ``day:value`` pairs in ascending day order.
    """
    # Work on a copy so the caller's column names and row order stay untouched.
    records = table.copy()
    records.columns = ['user_id', 'day', 'value']
    records = records.sort_values(by=['user_id', 'day'])
    records['string'] = records['day'].map(str) + ':' + records['value'].map(str)
    # Join the already-sorted per-day tokens of each user into one sequence string.
    return records.groupby(['user_id'], as_index=False).agg({'string': ','.join})

In [20]:
launch_table = record_to_sequence(launch_table)  # 序列特征结果
launch_table.head()

Unnamed: 0,user_id,string
0,16,"13:1,14:1,15:1,18:1,19:1,20:1,21:1,22:1,23:1"
1,30,24:1
2,98,16:1
3,105,"12:1,14:1,15:1,16:1,17:1,18:1,19:1,20:1,21:1,2..."
4,176,"27:1,28:1,29:1,30:1"


In [21]:
launch_table['string'].str.len().max()

140

例如 ID=16 的用户，会在其特征表中与第 13、14、15、18… 天对应的行（行号 = 日期 - 注册日）上进行填充

In [22]:
# Fill feature column 1 (launch) on every day listed in each user's sequence string.
# Named columns replace the positional row[0] / row[1] lookups, which pandas 2.x
# deprecates on label-indexed Series.
for uid, day_string in zip(launch_table['user_id'], launch_table['string']):
    data[uid].put_feature(1, day_string)

In [23]:
data[1308501].register_day

1

In [24]:
data[1308501].array

array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.,

#### 创作视频信息

In [25]:
# Count video creations per (user, day), then flag feature column 2 on those days.
create['create'] = 1
create_table = create.groupby(
    ['user_id', 'create_day'], as_index=False).agg({'create': 'sum'})
create_table = record_to_sequence(create_table)
# Read columns by name instead of positional row[0] / row[1] (deprecated in pandas 2.x).
for uid, day_string in zip(create_table['user_id'], create_table['string']):
    data[uid].put_feature(2, day_string)

In [26]:
data[720497].array

array([[1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.,

#### 用户使用时行为特征，例如点赞，转发等


分别对不同行为进行统计，构建6种不同行为特征

In [27]:
# One indicator feature per act_type value 0..5, stored in columns 3..8.
for i in range(6):
    act = activity[activity.act_type == i].copy()
    act = act.groupby(['user_id', 'act_day'], as_index=False).agg(
        {'video_id': 'count'})
    act = record_to_sequence(act)
    # Named columns replace the positional row[0] / row[1] lookups (deprecated in pandas 2.x).
    for uid, day_string in zip(act['user_id'], act['string']):
        data[uid].put_feature(i+3, day_string)

In [28]:
activity['act_type'].value_counts()  # 对应为播放、关注、点赞、转发、举报、减少此类作品

0    19798261
1      555671
2      206079
3       46078
5         982
4         157
Name: act_type, dtype: int64

#### 产生行为的界面信息

In [29]:
# Only page 0 is encoded (into column 9): range(1) leaves every other page value out,
# and columns 10 and 11 are filled by the "watched" cells further down.
for i in range(1):
    act = activity[activity.page == i].copy()
    act = act.groupby(['user_id', 'act_day'], as_index=False).agg(
        {'video_id': 'count'})
    act = record_to_sequence(act)
    # Named columns replace the positional row[0] / row[1] lookups (deprecated in pandas 2.x).
    for uid, day_string in zip(act['user_id'], act['string']):
        data[uid].put_feature(i+9, day_string)

In [30]:
data[720497].array

array([[1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 1.,

#### 作品被其他用户观看的信息

In [31]:
watched = register.loc[:, ['user_id']].copy()
watched.columns = ['author_id']

In [32]:
watched

Unnamed: 0,author_id
0,744025
1,1270299
2,571220
3,1308501
4,745554
...,...
51704,925266
51705,1253628
51706,1108957
51707,731506


In [33]:
# Inner-join the registered users (as authors) with the activity rows whose actor is
# not the author. No `on` is given, so pandas joins on the columns both frames share;
# `watched` holds only `author_id` at this point. Only authors whose videos were
# acted on by someone else remain.
watched = pd.merge(watched, activity[activity.author_id !=
                                     activity.user_id], how='inner')

In [34]:
watched

Unnamed: 0,author_id,user_id,act_day,page,video_id,act_type
0,794136,688596,27,2,2652156,0
1,394049,656865,26,3,3539106,0
2,394049,656865,26,3,2066639,0
3,394049,656865,26,3,3149078,0
4,394049,656865,26,3,4193165,0
...,...,...,...,...,...,...
897,1160382,658934,30,3,226496,0
898,1160382,658934,30,3,3264844,0
899,1160382,658934,30,2,4129447,0
900,1160382,658934,30,2,4129447,0


In [35]:
# Count, per (author, day), the activity rows produced by other users, then flag
# feature column 10 for that author on those days. record_to_sequence relabels the
# first column as `user_id`, so here `user_id` holds the author's id.
watched = watched.groupby(['author_id', 'act_day'],
                          as_index=False).agg({'video_id': 'count'})
watched = record_to_sequence(watched)
# Named columns replace the positional row[0] / row[1] lookups (deprecated in pandas 2.x).
for uid, day_string in zip(watched['user_id'], watched['string']):
    data[uid].put_feature(10, day_string)

In [36]:
watched

Unnamed: 0,user_id,string
0,5527,27:1
1,9327,24:1
2,11556,17:1
3,12891,24:1
4,26863,21:1
...,...,...
206,1332787,26:4
207,1333472,23:1
208,1345385,29:1
209,1347256,25:1


#### 观看自己的作品信息

In [37]:
# Activity rows where the actor is the video's own author, counted per (user, day).
watched = (
    activity[activity.author_id == activity.user_id]
    .copy()
    .groupby(['user_id', 'act_day'], as_index=False)
    .agg({'video_id': 'count'})
)

In [38]:
watched

Unnamed: 0,user_id,act_day,video_id
0,555,22,14
1,555,23,6
2,555,24,18
3,555,25,3
4,555,27,10
...,...,...,...
23223,1366834,27,43
23224,1366834,28,17
23225,1366834,29,7
23226,1367067,13,1


In [39]:
# Flag feature column 11 on every day a user acted on one of their own videos.
watched = record_to_sequence(watched)
# Named columns replace the positional row[0] / row[1] lookups (deprecated in pandas 2.x).
for uid, day_string in zip(watched['user_id'], watched['string']):
    data[uid].put_feature(11, day_string)

In [40]:
watched

Unnamed: 0,user_id,string
0,555,"22:14,23:6,24:18,25:3,27:10,28:1,29:1"
1,973,"14:3,20:7,21:1,25:4,27:8,28:15"
2,1180,"21:3,22:2"
3,1296,"12:2,13:12,15:23,16:9,18:3"
4,1400,29:33
...,...,...
6527,1366419,"14:13,22:4,23:9"
6528,1366798,"23:37,25:2,26:1"
6529,1366834,"6:3,23:3,26:20,27:43,28:17,29:7"
6530,1367067,13:1


In [41]:
data[1366834].array

array([[1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 1., 1.,

#### 制作数据标签

活跃用户定义为：在未来7天内使用过APP（在上述任一类型日志中出现过）

从用户注册当天开始，逐天展开统计：如果某一天之后的 7 天内仍有行为产生，则该天的标签为 1

In [42]:
label = {user_id: user.get_label() for user_id, user in data.items()}

In [43]:
label

{744025: array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 1270299: array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 571220: array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 1308501: array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 745554: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 1031012: array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 913297: array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1

#### 用户特征数据即为之前提取的data中的各项特征，转换成ndarray即可

In [44]:
data = {user_id: user.get_array()
        for user_id, user in data.items()}  # 把用户实例变为数组

In [45]:
data[744025]

array([[1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.,

#### 合并上述提取方法

In [47]:
# Re-read the raw logs from disk. The exploratory cells above added columns in place
# (`seq_length` on register, `launch` on launch, `create` on create), so the frames
# in memory no longer match the files.
register = pd.read_csv('user_register_log.txt.gz', sep='\t', names=[
                       'user_id', 'register_day', 'register_type', 'device_type'])
launch = pd.read_csv('app_launch_log.txt.gz', sep='\t',
                     names=['user_id', 'launch_day'])
create = pd.read_csv('video_create_log.txt.gz', sep='\t',
                     names=['user_id', 'create_day'])
activity = pd.read_csv('user_activity_log.txt.gz', sep='\t', names=[
                       'user_id', 'act_day', 'page', 'video_id', 'author_id', 'act_type'])

# DataGenerator comes from deep_tools and is not shown in this notebook.
# NOTE(review): presumably it repeats the feature/label extraction above and serves
# batches via next_batch / get_set — confirm against deep_tools.
data_generator = DataGenerator(register, launch, create, activity)

构建RNN网络模型

In [48]:
n_hu = 32  # hidden units of the GRU layer; W_out and the output reshape must match it

with tf.variable_scope('train'):

    # Variables and inputs
    lr = tf.placeholder(tf.float32, [], name='learning_rate')  # scalar learning rate

    W_out = tf.get_variable('W_out', [n_hu, 1])  # output-layer weights
    b_out = tf.get_variable('b_out', [1])  # output-layer bias

    x = tf.placeholder(tf.float32, [None, None, n_features])  # inputs: (batch, time, feature)
    y = tf.placeholder(tf.float32, [None, None])  # labels: (batch, time)

    batch_size = tf.shape(x)[0]
    seq_length = tf.shape(x)[1]  # resolved at run time because sequence lengths vary

    # Recurrent layer
    cell = tf.nn.rnn_cell.GRUCell(n_hu)  # GRU cell type
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)  # start from an all-zero state
    outputs, state = tf.nn.dynamic_rnn(cell, x,
                                       initial_state=initial_state)

    # Output layer: flatten (batch, time, n_hu) to rows of n_hu, apply the linear map,
    # then restore the (batch, time) layout so logits line up with y.
    outputs = tf.reshape(outputs, [-1, n_hu])
    logits = tf.matmul(outputs, W_out)+b_out
    logits = tf.reshape(logits, tf.stack([batch_size, seq_length]))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [49]:
# Training slice: drop the last 14 time steps of every sequence from both the
# predictions and the labels before computing the training loss.
# NOTE(review): presumably this keeps the 7-day look-ahead of every training label
# clear of the final 7 days, which the test label at index -8 covers — confirm.
logits_local_train = logits[:, :-14]
label_local_train = y[:, :-14]

In [50]:
# Loss function: an L2 penalty (scale 1e-5) over every trainable variable, added to
# the sigmoid cross-entropy of the training slice.
regularizer = tf.contrib.layers.l2_regularizer(0.00001)
penalty = tf.contrib.layers.apply_regularization(
    regularizer, tf.trainable_variables())

obj_local = tf.losses.sigmoid_cross_entropy(
    label_local_train, logits_local_train)+penalty
# Adam optimizer; the learning rate is supplied through the `lr` placeholder at run time.
optimizer = tf.train.AdamOptimizer(lr)
step_local = optimizer.minimize(obj_local)

# Test slice: the single time step at index -8 of every sequence.
# NOTE(review): user_seq.get_label leaves the final 7 labels as None, which would make
# -8 the last labelled step — assuming DataGenerator builds y the same way; confirm.
logits_local_test = logits[:, -8]
label_local_test = y[:, -8]

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [51]:
def train(n_obs=1000, step=1000, lr_feed=0.01):
    """Run ``step`` optimisation steps on batches drawn from ``data_generator``.

    Relies on the notebook-level ``sess`` and ``data_generator``, the
    placeholders ``x``, ``y``, ``lr`` and the graph nodes ``step_local`` and
    ``obj_local``.

    Args:
        n_obs: batch size passed to ``data_generator.next_batch``.
        step: number of optimisation steps to run.
        lr_feed: learning rate fed to the ``lr`` placeholder on every step.

    Returns:
        The value of ``obj_local`` from the last step, or None when no step ran.
    """
    print('label_local_train: ', label_local_train)
    print('logits_local_train: ', logits_local_train)

    last_loss = None
    for _step in range(step):
        # next_batch also returns the sequence length and the user ids; neither is needed here.
        _length, _id_list, data_x, data_y = data_generator.next_batch(n_obs)
        # Fetch only the update op and the loss; the label/logit slices were fetched
        # and discarded before, which only cost a device-to-host copy.
        _, last_loss = sess.run([step_local, obj_local],
                                feed_dict={x: data_x,
                                           y: data_y,
                                           lr: lr_feed})
    return last_loss

In [53]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

InternalError: cudaGetDevice() failed. Status: CUDA driver version is insufficient for CUDA runtime version

In [54]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

InternalError: cudaGetDevice() failed. Status: CUDA driver version is insufficient for CUDA runtime version

In [None]:
train(n_obs=1000, step=2000, lr_feed=0.01)

In [None]:
def test():

    n_NA = 14
    date_seq = [31]+list(range(2, 16))+[16]*15
    variables_1 = [obj_local, logits_local_train, label_local_train]
    variables_2 = [logits_local_test, label_local_test]

    obs_count, cum_loss, correct = 0, 0, 0
    user, prob, real = [], [], []

    # 训练损失
    for length, id_list, data_x, data_y in zip(*data_generator.get_set('train')):
        _obj, _logits_train, _label_train = sess.run(variables_1,
                                                     feed_dict={x: data_x,
                                                                y: data_y,
                                                                lr: 0.001})
        obs_count += (length-n_NA)*len(id_list)
        cum_loss += _obj*(length-n_NA)*len(id_list)
        correct += np.sum((1*(_logits_train > 0) == _label_train))

    # 测试损失
    for length, id_list, data_x, data_y in zip(*data_generator.get_set('test')):
        _ = sess.run(variables_2,
                     feed_dict={x: data_x,
                                y: data_y,
                                lr: 0.001})

        _logits_test, _label_test = _
        real += list(_label_test)

        user += list(id_list)
        prob += list(1/(1+np.exp(-_logits_test.reshape([-1]))))

    # 训练损失
    print('train_loss', cum_loss/obs_count)

    # 测试损失
    result = pd.DataFrame({'user_id': user, 'prob': prob, 'label': real})
    print('test_score:', f(result))

    return result

### 评估标准：

![title](./img/5.png)

In [None]:
test()