In [1]:
import pandas as pd
import numpy as np
# import tensorflow as tf  # 深度学习框架 pip install tensorflow
import datetime
# from deep_tools import f  # conda install -c bioconda deeptools，pip方法不奏效
# from deep_tools import DataCenerator

In [3]:
"""读取数据集"""
# Load the four raw logs; each is tab-separated and column names are supplied explicitly.
register = pd.read_csv(
    'user_register_log.txt',
    sep='\t',
    names=['user_id', 'register_day', 'register_type', 'device_type'],
)
launch = pd.read_csv(
    'app_launch_log.txt',
    sep='\t',
    names=['user_id', 'launch_day'],
)
create = pd.read_csv(
    'video_create_log.txt',
    sep='\t',
    names=['user_id', 'create_day'],
)
activity = pd.read_csv(
    'user_activity_log.txt',
    sep='\t',
    names=['user_id', 'act_day', 'page', 'video_id', 'author_id', 'act_type'],
)

In [4]:
"""计算序列长度： 持续时间 = 数据总时间 - 注册时间"""
# seq_length = 31 - register_day, so a user registered on day 1 gets 30 rows.
register['seq_length'] = register['register_day'].rsub(31)
register.head()

Unnamed: 0,user_id,register_day,register_type,device_type,seq_length
0,744025,1,1,283,30
1,1270299,1,1,259,30
2,571220,1,1,2,30
3,1308501,1,0,23,30
4,745554,1,2,0,30


In [5]:
"""根据前面的记录天数，创建一个字典，来存储不同记录天数的用户到底有哪些 """
# user_queue[n] collects the ids of every user whose seq_length equals n (n in 1..30).
user_queue = {length: [] for length in range(1, 31)}

# Read the two columns by name: positional access such as row[-1] / row[0] on a
# label-indexed Series (what iterrows yields) is deprecated in pandas.
for user_id, seq_length in zip(register['user_id'], register['seq_length']):
    user_queue[seq_length].append(user_id)

In [8]:
"""定义一个user_seq类"""
class user_seq:
    """Per-user daily feature matrix plus helpers to fill it and derive labels.

    Row r of ``array`` corresponds to day ``register_day + r``; each column is one
    binary feature.
    """

    def __init__(self, register_day, seq_length, n_feagures):
        """
            register_day: day on which the user registered
            seq_length: number of recorded days for the user (rows of the matrix)
            n_feagures: number of features per day (columns of the matrix); the
                        misspelled name is kept because callers pass it by keyword
        """
        self.register_day = register_day
        self.seq_length = seq_length
        # Feature matrix, initialised to zero.
        self.array = np.zeros([self.seq_length, n_feagures])
        # Feature 0 is set on the first row only (presumably the "registered today" flag).
        self.array[0, 0] = 1
        self.page_rank = np.zeros([self.seq_length])
        self.pointer = 1

    def put_feature(self, feature_number, string):
        """Set column ``feature_number`` to 1 for every day listed in ``string``.

        ``string`` is a comma-separated list of ``day:value`` items; only the day is
        used, the value is ignored.
        """
        for item in string.split(','):
            pos, value = item.split(':')
            # Days are absolute; subtract register_day to get the row index.
            self.array[int(pos) - self.register_day, feature_number] = 1

    def put_PR(self, string):
        """Store per-day page-rank values given as comma-separated ``day:value`` items."""
        for item in string.split(','):
            # Same ``day:value`` format as put_feature (the previous ';' separator
            # and the pop/pos name mismatch made this method always raise).
            pos, value = item.split(':')
            self.page_rank[int(pos) - self.register_day] = float(value)

    def get_array(self):
        """Return the (seq_length, n_features) feature matrix."""
        return self.array

    def get_label(self):
        """Return per-day labels: 1 if the user is active in the following 7 days, else 0.

        The last 7 days have no complete look-ahead window and stay ``None``.
        """
        self.label = np.array([None] * self.seq_length)
        # A day counts as active when any of the first 10 feature columns is set.
        active = self.array[:, :10].sum(axis=1)
        for i in range(self.seq_length - 7):
            self.label[i] = 1 * (np.sum(active[i + 1:i + 8]) > 0)
        return self.label

In [34]:
"""创建用户的记录矩阵"""
n_features = 12
# One empty user_seq per registered user, keyed by user_id. Columns are read by
# name because positional row[0] / row[-1] access on a label-indexed Series is
# deprecated in pandas.
data = {
    user_id: user_seq(register_day=register_day, seq_length=seq_length, n_feagures=n_features)
    for user_id, register_day, seq_length in zip(
        register['user_id'], register['register_day'], register['seq_length']
    )
}

In [35]:
# Show the user_id -> user_seq mapping (the repr lists every entry).
data

{744025: <__main__.user_seq at 0x1b78b7631d0>,
 1270299: <__main__.user_seq at 0x1b78b763f98>,
 571220: <__main__.user_seq at 0x1b78b763eb8>,
 1308501: <__main__.user_seq at 0x1b78b763f60>,
 745554: <__main__.user_seq at 0x1b78b763470>,
 1031012: <__main__.user_seq at 0x1b78b7634a8>,
 913297: <__main__.user_seq at 0x1b78b763208>,
 266500: <__main__.user_seq at 0x1b78b797ba8>,
 475120: <__main__.user_seq at 0x1b78b75a748>,
 547944: <__main__.user_seq at 0x1b78b74c160>,
 916655: <__main__.user_seq at 0x1b78b74cd30>,
 719262: <__main__.user_seq at 0x1b78b74c7f0>,
 1026175: <__main__.user_seq at 0x1b78b74cd68>,
 1140342: <__main__.user_seq at 0x1b78b74cc50>,
 688100: <__main__.user_seq at 0x1b78b74c828>,
 1342459: <__main__.user_seq at 0x1b78b74c7b8>,
 926263: <__main__.user_seq at 0x1b78b74cc88>,
 40710: <__main__.user_seq at 0x1b78b74c898>,
 246954: <__main__.user_seq at 0x1b78b74cbe0>,
 153579: <__main__.user_seq at 0x1b78b74cba8>,
 161418: <__main__.user_seq at 0x1b78b74da58>,
 649526:

In [36]:
# Tag every launch record with 1, then total the tags per (user, day).
launch['launch'] = 1
launch_table = launch.groupby(['user_id', 'launch_day'], as_index=False)['launch'].sum()
launch_table.head()

Unnamed: 0,user_id,launch_day,launch
0,16,13,1
1,16,14,1
2,16,15,1
3,16,18,1
4,16,19,1


In [37]:
def record_to_sequence(table):
    """Collapse a (user, day, value) table into one 'day:value,...' string per user.

    table: DataFrame with exactly three columns, interpreted positionally as
           user id, day and value (their original names are ignored).
    Returns a DataFrame with columns ``user_id`` and ``string``; items inside each
    string are ordered by day. The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame is not renamed, re-sorted or extended.
    records = table.copy()
    records.columns = ['user_id', 'day', 'value']
    records = records.sort_values(by=['user_id', 'day'])
    records['string'] = records['day'].map(str) + ':' + records['value'].map(str)
    return records.groupby(['user_id'], as_index=False).agg({'string': lambda x: ','.join(x)})

In [38]:
# Collapse the per-day launch counts into one 'day:count,...' string per user.
launch_table=record_to_sequence(launch_table)
launch_table.head()

Unnamed: 0,user_id,string
0,16,"13:1,14:1,15:1,18:1,19:1,20:1,21:1,22:1,23:1"
1,30,24:1
2,98,16:1
3,105,"12:1,14:1,15:1,16:1,17:1,18:1,19:1,20:1,21:1,2..."
4,176,"27:1,28:1,29:1,30:1"


In [39]:
# Feature column 1: days on which the user launched the app.
# Columns are read by name; positional row[0] / row[1] access on a label-indexed
# Series is deprecated in pandas.
for user_id, day_string in zip(launch_table['user_id'], launch_table['string']):
    data[user_id].put_feature(1, day_string)

创作视频信息

In [40]:
# Feature column 2: days on which the user created a video.
create['create']=1
create_table = create.groupby(['user_id','create_day'],as_index=False).agg({'create':'sum'})
create_table = record_to_sequence(create_table)
# Read columns by name; positional row[0] / row[1] access is deprecated in pandas.
for user_id, day_string in zip(create_table['user_id'], create_table['string']):
    data[user_id].put_feature(2, day_string)

用户行为数据

In [41]:
# Feature columns 3..8: one column per act_type value 0..5.
for i in range(6):
    act=activity[activity.act_type==i].copy()
    act=act.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})
    act = record_to_sequence(act)
    # Read columns by name; positional row[0] / row[1] access is deprecated in pandas.
    for user_id, day_string in zip(act['user_id'], act['string']):
        data[user_id].put_feature(i+3, day_string)

产生信息的界面信息

In [42]:
# Feature column 9: activity on page 0 (only page value 0 is used).
for i in range(1):
    act=activity[activity.page==i].copy()
    act=act.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})
    act = record_to_sequence(act)
    # Read columns by name; positional row[0] / row[1] access is deprecated in pandas.
    for user_id, day_string in zip(act['user_id'], act['string']):
        data[user_id].put_feature(i+9, day_string)

观测其它用户作品信息

In [43]:
# Feature column 10: days on which a registered user's videos received activity
# from OTHER users.
watched=register.loc[:,['user_id']].copy()
watched.columns=['author_id']
# Inner merge on author_id keeps only activity whose author is a registered user.
watched=pd.merge(watched,activity[activity.author_id!=activity.user_id],how='inner')
# Group by the author (the user being watched), not by the viewer, so the
# feature is credited to the author; this matches DataGenerator.
watched=watched.groupby(['author_id','act_day'],as_index=False).agg({'video_id':'count'})
watched=record_to_sequence(watched)
# After record_to_sequence the first column is named user_id and holds the author id.
for user_id, day_string in zip(watched['user_id'], watched['string']):
    data[user_id].put_feature(10, day_string)

观看自己的作品信息

In [44]:
# Feature column 11: days on which the user acted on their own videos.
# Start from the activity log itself: merging against the previous `watched`
# table would wrongly restrict this feature to users who already had feature 10.
watched=activity[activity.author_id==activity.user_id].copy()
watched=watched.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})
watched=record_to_sequence(watched)
# Read columns by name; positional row[0] / row[1] access is deprecated in pandas.
for user_id, day_string in zip(watched['user_id'], watched['string']):
    data[user_id].put_feature(11, day_string)

制作数据标签

In [45]:
# Per-user label arrays (user_id -> result of user_seq.get_label()).
label = {user_id:user.get_label() for user_id, user in data.items()}

In [46]:
# Show the user_id -> label array mapping (the repr lists every entry).
label

{744025: array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 1270299: array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 571220: array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 1308501: array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 745554: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 1031012: array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 913297: array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1

In [47]:
# Replace every user_seq object in `data` with its raw feature matrix.
# NOTE(review): this rebinds `data`, so the cell is not idempotent; re-running it,
# or any earlier cell that calls user_seq methods on data[...], fails until `data` is rebuilt.
data = {user_id: user.get_array() for user_id, user in data.items()}

In [56]:
import pandas as pd
import numpy as np
from random import shuffle

def f(table,name='prob',thresholds=(0.40,0.41,0.42,0.43,0.44,0.45)):
    """Return the F1 score of thresholded predictions, one value per threshold.

    table: DataFrame with a score column `name` and a `label` column (1 = positive).
    name: name of the column holding the predicted scores.
    thresholds: cut-offs to evaluate; a row is predicted positive when its score
                is strictly greater than the threshold. Defaults to the original
                six values 0.40 .. 0.45.
    Returns a list of floats in threshold order. A score is 0.0 when precision or
    recall is undefined (nothing predicted positive, no positive labels) or when
    both are zero, instead of NaN.
    """
    is_positive=table['label']==1
    label_total=table['label'].sum()
    score=[]
    for threshold in thresholds:
        predicted=table[name]>threshold
        hits=(predicted&is_positive).sum()
        predicted_total=predicted.sum()
        # Guard every division so degenerate inputs yield 0.0 rather than NaN.
        p=hits/predicted_total if predicted_total else 0.0
        r=hits/label_total if label_total else 0.0
        score.append(float(2*p*r/(p+r)) if (p+r) else 0.0)
    return score

def record_to_sequence(table):
    """Collapse a (user, day, value) table into one 'day:value,...' string per user.

    table: DataFrame with exactly three columns, interpreted positionally as
           user id, day and value (their original names are ignored).
    Returns a DataFrame with columns ``user_id`` and ``string``; items inside each
    string are ordered by day. The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame is not renamed, re-sorted or extended.
    records=table.copy()
    records.columns=['user_id','day','value']
    records=records.sort_values(by=['user_id','day'])
    records['string']=records['day'].map(str)+':'+records['value'].map(str)
    return records.groupby(['user_id'],as_index=False).agg({'string':lambda x:','.join(x)})

class user_seq:
    """Daily feature matrix for one user, with helpers to fill it and derive labels.

    Row r of ``array`` corresponds to day ``register_day + r``.
    """

    def __init__(self,register_day,seq_length,n_features):
        """Allocate a zeroed (seq_length, n_features) matrix with feature 0 set on row 0."""
        self.register_day=register_day
        self.seq_length=seq_length
        feature_matrix=np.zeros([seq_length,n_features])
        feature_matrix[0,0]=1
        self.array=feature_matrix
        self.page_rank=np.zeros([seq_length])
        self.pointer=1

    def put_feature(self,feature_number,string):
        """Set column ``feature_number`` to 1 on every day named in 'day:value,...'."""
        for entry in string.split(','):
            day,_=entry.split(':')
            row_index=int(day)-self.register_day
            self.array[row_index,feature_number]=1

    def put_PR(self,string):
        """Store the values of a 'day:value,...' string into ``page_rank``."""
        for entry in string.split(','):
            day,score=entry.split(':')
            row_index=int(day)-self.register_day
            self.page_rank[row_index]=score

    def get_array(self):
        """Return the feature matrix."""
        return self.array

    def get_label(self):
        """Return per-day labels: 1 if any of the next 7 days is active, else 0.

        The final 7 entries have no full look-ahead window and remain ``None``.
        """
        labels=np.array([None]*self.seq_length)
        # A day is active when any of the first 10 feature columns is non-zero.
        active=self.array[:,:10].sum(axis=1)
        n_labelled=self.seq_length-7
        for day in range(n_labelled):
            window=active[day+1:day+8]
            labels[day]=1*(np.sum(window)>0)
        self.label=labels
        return self.label
    

class DataGenerator:
    """Build per-user feature/label sequences from the raw logs and serve them by sequence length.

    Attributes set by ``__init__``:
        user_queue: dict seq_length (1..30) -> list of user ids with that length
        data:       dict user_id -> feature matrix of shape (seq_length, 12)
        label:      dict user_id -> object array of length seq_length (see user_seq.get_label)
        pointer:    dict seq_length -> read offset into user_queue[seq_length] used by next_batch
        local_* / online_* lists: sequence lengths used by the sampling strategies
    """

    def __init__(self,register,launch,create,activity):
        """Extract all features and labels.

        register: DataFrame with at least ``user_id`` and ``register_day``
        launch:   DataFrame with ``user_id`` and ``launch_day``
        create:   DataFrame with ``user_id`` and ``create_day``
        activity: DataFrame with ``user_id``, ``act_day``, ``page``, ``video_id``,
                  ``author_id`` and ``act_type``
        The inputs are copied and never modified.
        """
        register=register.copy()
        launch=launch.copy()
        create=create.copy()
        activity=activity.copy()

        # user_queue: bucket user ids by sequence length (31 - register_day).
        # Columns are read by name; positional row[0] / row[-1] access on the
        # label-indexed Series yielded by iterrows is deprecated in pandas.
        register['seq_length']=31-register['register_day']
        self.user_queue={i:[] for i in range(1,31)}
        for user_id,seq_length in zip(register['user_id'],register['seq_length']):
            self.user_queue[seq_length].append(user_id)

        # self.data starts as one empty user_seq per registered user.
        n_features=12
        self.data={
            user_id:user_seq(register_day=register_day,seq_length=seq_length,n_features=n_features)
            for user_id,register_day,seq_length in zip(
                register['user_id'],register['register_day'],register['seq_length'])
        }

        # Feature 1: days with an app launch.
        launch['launch']=1
        launch_table=launch.groupby(['user_id','launch_day'],as_index=False).agg({'launch':'sum'})
        self._put_table(launch_table,1)

        # Feature 2: days with a video creation.
        create['create']=1
        create_table=create.groupby(['user_id','create_day'],as_index=False).agg({'create':'sum'})
        self._put_table(create_table,2)

        # Features 3..8: one column per act_type value 0..5.
        for i in range(6):
            act=activity[activity.act_type==i].copy()
            act=act.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})
            self._put_table(act,i+3)

        # Feature 9: activity on page 0 (only page value 0 is used).
        for i in range(1):
            act=activity[activity.page==i].copy()
            act=act.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})
            self._put_table(act,i+9)

        # Feature 10: days on which a registered user's videos received activity
        # from other users. The inner merge on author_id keeps only authors that
        # are registered users, so every id is present in self.data.
        watched=register.loc[:,['user_id']].copy()
        watched.columns=['author_id']
        watched=pd.merge(watched,activity[activity.author_id!=activity.user_id],how='inner')
        watched=watched.groupby(['author_id','act_day'],as_index=False).agg({'video_id':'count'})
        self._put_table(watched,10)

        # Feature 11: days on which the user acted on their own videos.
        watched=activity[activity.author_id==activity.user_id].copy()
        watched=watched.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})
        self._put_table(watched,11)

        # Labels must be derived before the user_seq objects are replaced below.
        self.label={user_id:user.get_label() for user_id,user in self.data.items()}

        # From here on self.data maps user_id -> raw feature matrix.
        self.data={user_id:user.get_array() for user_id,user in self.data.items()}

        # Sampling strategy: length i appears (i-14) resp. (i-7) times, so longer
        # sequences are drawn proportionally more often.
        self.local_random_list=[]
        for i in range(15,31):
            self.local_random_list+=[i]*(i-14)

        self.online_random_list=[]
        for i in range(8,31):
            self.online_random_list+=[i]*(i-7)

        self.local_train_list=list(range(15,31))
        self.local_test_list=list(range(8,31))
        self.online_train_list=list(range(8,31))
        self.online_test_list=list(range(1,31))

        self.pointer={i:0 for i in range(1,31)}

    def _put_table(self,table,feature_number):
        """Mark ``feature_number`` for every (user, day) row of a three-column count table."""
        table=record_to_sequence(table)
        for user_id,day_string in zip(table['user_id'],table['string']):
            self.data[user_id].put_feature(feature_number,day_string)

    def reset_pointer(self):
        """Rewind the read offset of every sequence-length queue to 0."""
        self.pointer={i:0 for i in range(1,31)}

    def next_batch(self,batch_size=1000):
        """Return one training batch of users that all share a randomly drawn sequence length.

        batch_size: nominal size; the number of users actually requested is
                    ``batch_size//(seq_length-14)+1``.
        Returns ``(seq_length, user_matrix, data_matrix, label_matrix)`` where the
        three arrays are aligned on their first axis.
        """
        seq_length=self.local_random_list[np.random.randint(len(self.local_random_list))]
        batch_size=batch_size//(seq_length-14)+1

        # Queue exhausted: start over with a freshly shuffled order.
        if self.pointer[seq_length]+batch_size>len(self.user_queue[seq_length]):
            self.pointer[seq_length]=0
            shuffle(self.user_queue[seq_length])
        start=self.pointer[seq_length]
        user_list=self.user_queue[seq_length][start:start+batch_size]
        self.pointer[seq_length]+=batch_size

        user_matrix=np.array(user_list)
        data_matrix=np.array([self.data[i] for i in user_list])
        label_matrix=np.array([self.label[i] for i in user_list])

        return seq_length,user_matrix,data_matrix,label_matrix

    def get_set(self,usage='train'):
        """Return the full local train or test set, grouped by sequence length.

        usage: ``'train'`` selects ``local_train_list``; any other value selects
               ``local_test_list``.
        Returns ``(length_list, user_list, data_list, label_list)``; entry k of each
        of the last three lists holds the arrays for sequence length ``length_list[k]``.
        """
        if usage=='train':
            length_list=self.local_train_list
        else:
            length_list=self.local_test_list

        user_list=[np.array(self.user_queue[seq_length]) for seq_length in length_list]
        data_list=[np.array([self.data[user_id] for user_id in self.user_queue[seq_length]]) for seq_length in length_list]
        label_list=[np.array([self.label[user_id] for user_id in self.user_queue[seq_length]]) for seq_length in length_list]
        return length_list,user_list,data_list,label_list

In [57]:
# register=pd.read_csv('user_register_log.txt',sep='\t',names=['user_id','register_day','register_type','device_type'])
# launch=pd.read_csv('app_launch_log.txt',sep='\t',names=['user_id','launch_day'])
# create=pd.read_csv('video_create_log.txt',sep='\t',names=['user_id','create_day'])
# activity=pd.read_csv('user_activity_log.txt',sep='\t',names=['user_id','act_day','page','video_id','author_id','act_type'])

# Build the generator from the four log frames loaded earlier in the notebook.
data_generator=DataGenerator(register,launch,create,activity)

In [48]:
# NOTE(review): in top-to-bottom order this rebinds `data_generator` from the
# DataGenerator instance created above to the plain `data` dict — confirm this is intended.
data_generator = data

In [49]:
# Build the GRU model graph.
# NOTE(review): `tf` is never imported in the visible cells (the tensorflow import in
# the first cell is commented out), so this cell raises the NameError recorded below.
# The calls used here (tf.placeholder, tf.variable_scope, tf.get_variable) presumably
# require TensorFlow 1.x or tf.compat.v1 — confirm.
n_features = 12
n_hu = 8
with tf.variable_scope('train'):     # tf.variable_scope sets the scope of the variables created below
    
    # Variables and inputs
    lr = tf.placeholder(tf.float32, [], name='learning_rate')    # scalar learning-rate placeholder
    
    # Hidden-to-output parameters: W_out has shape (n_hu, 1), b_out has shape (1,); n_hu is the number of hidden units
    W_out = tf.get_variable('W_out', [n_hu, 1])   
    b_out = tf.get_variable('b_out', [1])
    
    # Inputs x and targets y; x has shape (batch_size, seq_length, n_features), y has shape (batch_size, seq_length)
    x = tf.placeholder(tf.float32, [None, None, n_features])
    y = tf.placeholder(tf.float32, [None, None])
    
    # Dynamic batch size and sequence length, read from the shape of x
    batch_size = tf.shape(x)[0]
    seq_length = tf.shape(x)[1]
    
    # RNN layer
    cell = tf.nn.rnn_cell.GRUCell(n_hu)     # n_hu is the number of units in the GRU cell
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)   # start from an all-zero state
    outputs, state = tf.nn.dynamic_rnn(cell, x, initial_state=initial_state)  # dynamic RNN unrolled over the time axis of x
    # outputs: (batch_size, max_seq_length, n_hu), the output at every time step
    # state:   (batch_size, n_hu), the output of the last time step
    # Details: https://blog.csdn.net/u010960155/article/details/81707498
    
    # Output layer
    outputs = tf.reshape(outputs, [-1, n_hu])    # (batch_size*max_seq_length, n_hu)
    logits = tf.matmul(outputs, W_out) + b_out    # (batch_size*max_seq_length, 1)
    logits = tf.reshape(logits, tf.stack([batch_size, seq_length]))

NameError: name 'tf' is not defined