In [17]:
import ast
import os
import pickle
import random
import re
from random import randint, sample

import numpy as np
import pandas as pd

from utils import save_pkl

# title words vocab

In [2]:
# Load the root-post titles: a tab-separated file without header, one row per post.
# Columns are named here as mid, title_orig and title.
root_titles = pd.read_csv(
    "/media/yuting/TOSHIBA EXT/weibo_new/root_titles_filtered_chinese_only.csv",
    sep='\t',
    names=['mid', 'title_orig', 'title'],
    encoding='utf8',
)
# root_titles

In [3]:
def process_news(news_dict, min_count=3):
    """Build the title-word vocabulary from the news titles.

    Parameters
    ----------
    news_dict : dict
        Maps a post id to the *string form* of a Python list of words
        (e.g. "['word1', 'word2']").
    min_count : int, optional
        Minimum number of occurrences a word needs to be kept (default 3).

    Returns
    -------
    dict
        word -> [index, count]. Indices are consecutive and assigned in
        first-seen order among the kept words; 'PADDING' is inserted first
        and therefore gets index 0.

    Notes
    -----
    The titles are parsed with ``ast.literal_eval`` rather than ``eval`` so
    that text read from disk can never execute code.
    Prints ``(kept vocabulary size, raw vocabulary size)``.
    """
    # 'PADDING' carries a huge count, presumably so it always survives the
    # frequency filter below.
    word_dict_raw = {'PADDING': [0, 999999]}
    for title in news_dict.values():
        for word in ast.literal_eval(title):
            if word in word_dict_raw:
                word_dict_raw[word][1] += 1
            else:
                word_dict_raw[word] = [len(word_dict_raw), 1]

    # Re-index the words that are frequent enough.
    word_dict = {}
    for word, (_, count) in word_dict_raw.items():
        if count >= min_count:
            word_dict[word] = [len(word_dict), count]
    print(len(word_dict), len(word_dict_raw))
    return word_dict

In [4]:
# Map every post mid to its `title` column value (later parsed as a list of words).
news_dict = dict(zip(root_titles.mid,root_titles.title))

In [5]:
# Build the vocabulary; process_news prints (kept vocabulary size, raw vocabulary size).
word_dict = process_news(news_dict)

7275 27231


In [6]:
# load repost_idlist to map each mid to its post_id
file_path ='/media/yuting/TOSHIBA EXT/weibo_new/repost_idlist.txt'
with open(file_path, "r", encoding="gbk") as f:
    lines_repost_idlist = f.readlines()
# The post_id of a mid is the 0-based number of the line it appears on.
postid_mid_map = {
    raw_mid.strip(): line_no
    for line_no, raw_mid in enumerate(lines_repost_idlist)
}

In [90]:
# Number of words in every title.
# ast.literal_eval parses the stored list literal without executing any code,
# unlike eval on text that was read from disk.
length_list = [len(ast.literal_eval(title)) for title in news_dict.values()]

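postid_mid_map : mid -> post_id (the line number of the mid in repost_idlist.txt; about 300,000 posts)
news_index : mid -> row index in news_words (1..13153; index 0 is the padding row)
news_dict : mid -> title words (stored as the string form of a word list)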

# mapping word to index

In [7]:
# title_length = 10
def titleword2index(news_dict, word_dict, title_length=10):
    """Convert every title into a fixed-length row of vocabulary indices.

    Parameters
    ----------
    news_dict : dict
        Maps a post id to the string form of a Python list of words.
    word_dict : dict
        word -> [index, count]; words missing from it are skipped.
    title_length : int, optional
        Row width. Longer titles are truncated, shorter ones are
        right-padded with 0.

    Returns
    -------
    news_words : numpy.ndarray of int32, shape (len(news_dict) + 1, title_length)
        Row 0 is an all-zero padding row.
    news_index : dict
        Post id -> row number in ``news_words``; the key 0 maps to row 0.

    Notes
    -----
    Titles are parsed with ``ast.literal_eval`` instead of ``eval`` so that
    file contents cannot execute code. Prints the number of rows.
    """
    news_words = [[0] * title_length]
    news_index = {0: 0}
    for mid, title in news_dict.items():
        news_index[mid] = len(news_index)
        word_id = [
            word_dict[word][0]
            for word in ast.literal_eval(title)
            if word in word_dict
        ][:title_length]
        news_words.append(word_id + [0] * (title_length - len(word_id)))
    news_words = np.array(news_words, dtype='int32')
    print(len(news_words))
    return news_words, news_index

In [8]:
# Index matrix of the titles plus the mid -> row lookup; titleword2index prints the row count.
news_words, news_index = titleword2index(news_dict,word_dict)

13154


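news_index : mid -> row index in news_words (1..13153; 0 is reserved for padding)
news_words : 13154 x 10 matrix of word indices (one row per news title, plus the all-zero padding row 0)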

# get words embedding

In [9]:
from gensim.models import KeyedVectors

# Pre-trained word vectors stored in word2vec text format (binary=False).
model = KeyedVectors.load_word2vec_format(
    '/media/yuting/TOSHIBA EXT/weibo_words/sgns.weibo.bigram.bz2',
    encoding='utf-8',
    binary=False,
)

In [10]:
def get_embedding(word_dict,model):
    """Build the embedding matrix of the vocabulary.

    Parameters
    ----------
    word_dict : dict
        word -> [index, count]; ``index`` is the row of the word in the result.
    model : mapping
        ``model[word]`` returns the vector of a word and raises ``KeyError``
        for an unknown word (e.g. gensim ``KeyedVectors`` or a plain dict).

    Returns
    -------
    numpy.ndarray of float32, shape (len(word_dict), dim)
        ``dim`` is taken from the vectors found in ``model``. Row 0 is all
        zeros. Every word missing from ``model`` receives the *same* vector,
        drawn once from a normal distribution fitted (mean and covariance)
        to the vectors that were found.

    Raises
    ------
    ValueError
        If none of the words is present in ``model``.

    Notes
    -----
    Prints a progress line every 1000 words and the final matrix shape.
    """
    embedding_dict = {}
    for cnt, word in enumerate(word_dict):
        try:
            embedding_dict[word] = model[word]
        except KeyError:
            # Out-of-vocabulary word: filled with the shared random vector below.
            pass
        if cnt % 1000 == 0:
            print(cnt, word)

    if not embedding_dict:
        raise ValueError("none of the vocabulary words was found in the embedding model")

    # None marks the rows that still have to be filled.
    embedding_matrix = [None] * len(word_dict)
    cand = []
    for word, vector in embedding_dict.items():
        row = np.array(vector, dtype='float32')
        embedding_matrix[word_dict[word][0]] = row
        cand.append(row)
    cand = np.array(cand, dtype='float32')
    dim = cand.shape[1]

    # Fit a Gaussian to the known vectors and draw ONE sample from it.
    mu = np.mean(cand, axis=0)
    Sigma = np.cov(cand.T)
    norm = np.random.multivariate_normal(mu, Sigma, 1)
    for i, row in enumerate(embedding_matrix):
        if row is None:
            embedding_matrix[i] = np.reshape(norm, dim)

    # Row 0 is forced to zeros, whatever was stored there before.
    embedding_matrix[0] = np.zeros(dim, dtype='float32')
    embedding_matrix = np.array(embedding_matrix, dtype='float32')
    print(embedding_matrix.shape)
    return embedding_matrix

In [11]:
# Embedding matrix of the vocabulary; get_embedding prints progress lines and the final shape.
word_embedding_matrix = get_embedding(word_dict,model)

0 PADDING
1000 现身
2000 男声
3000 文件
4000 特殊
5000 连锁
6000 帮助
7000 总是
(7275, 300)


word_embedding : 7275 x 300 (vocab x embedding_dim)

# get node embedding

In [12]:
# load target embedding
path = '/media/yuting/TOSHIBA EXT/weibo_new/mtl_n_target_embeddings_p.txt'
with open(path, "r", encoding="utf-8") as f:
    lines = f.readlines()
# Raw string: the regex escapes (`\ `, `\.`) are invalid *string* escapes and
# trigger warnings in a normal literal. The compiled pattern is unchanged.
match_number = re.compile(r'-?\ *[0-9]+\.?[0-9]*(?:[Ee]\ *[-+]?\ *[0-9]+)?')
node2emb = {}
emb = ''
for line in lines:
    emb += line
    # A record may span several lines; it is complete once a line ends with ']]'.
    # Stripping the newline first also keeps a last record that has no trailing newline.
    if not line.rstrip('\n').endswith(']]'):
        continue
    l_temp = re.findall(match_number, emb)
    # First number is the node id, the remaining 50 are the embedding.
    if len(l_temp) != 51:
        raise ValueError("expected 51 numbers in an embedding record, got %d" % len(l_temp))
    node2emb[l_temp[0]] = [float(x) for x in l_temp[1:]]
    emb = ''

In [13]:
# load source embedding
path = '/media/yuting/TOSHIBA EXT/weibo_new/mtl_n_source_embeddings_p.txt'
with open(path, "r", encoding="utf-8") as f:
    lines2 = f.readlines()
node2emb_source = {}
emb_source = ''
for line in lines2:
    emb_source += line
    # A record may span several lines; it is complete once a line ends with ']]'.
    # Stripping the newline first also keeps a last record that has no trailing newline.
    if not line.rstrip('\n').endswith(']]'):
        continue
    # match_number is the number regex compiled in the target-embedding cell.
    l_temp = re.findall(match_number, emb_source)
    # First number is the node id, the remaining 50 are the embedding.
    if len(l_temp) != 51:
        raise ValueError("expected 51 numbers in an embedding record, got %d" % len(l_temp))
    node2emb_source[l_temp[0]] = [float(x) for x in l_temp[1:]]
    emb_source = ''

In [14]:
# merge the source and target embedding
# (in place: for a node id present in both, the source embedding replaces the target one)
node2emb.update(node2emb_source)

node2emb: userid -> embedding (where an id has both, the source embedding overrides the target one)

In [15]:
# Same mapping with the node ids converted from str to int.
node2emb_new = {int(node_id): vector for node_id, vector in node2emb.items()}

# node embedding matrix

In [16]:
# Row (node_id + 1) holds the embedding of a node, so the list needs max_id + 2 slots.
# The int 0 marks the rows that have no embedding yet.
node_embedding_matrix = [0]*(max(node2emb_new.keys())+2) # to include node null
cand = []

for node_id, vector in node2emb_new.items():
    try:
        row = np.array(vector, dtype='float32')
    except (ValueError, TypeError):
        # Malformed vector: use zeros instead of aborting the whole build.
        # Only conversion errors are caught, so unrelated failures stay visible.
        row = np.zeros(50, dtype='float32')
    node_embedding_matrix[node_id+1] = row
    cand.append(row)
cand = np.array(cand, dtype='float32')

# Fit a Gaussian to the known embeddings and draw ONE sample; every row without
# an embedding receives that same vector.
mu = np.mean(cand, axis=0)
Sigma = np.cov(cand.T)
norm = np.random.multivariate_normal(mu, Sigma, 1)
for i in range(len(node_embedding_matrix)):
    if isinstance(node_embedding_matrix[i], int):
        node_embedding_matrix[i] = np.reshape(norm, 50)
# NOTE(review): row 0 (the "null" node used for padding) is not zeroed here, so it
# also gets the random vector, assuming node ids are non-negative. The original
# zeroing line was left disabled:
# node_embedding_matrix[0] = np.zeros(50, dtype='float32')
node_embedding_matrix = np.array(node_embedding_matrix, dtype='float32')
print(node_embedding_matrix.shape)

(1787442, 50)


# reorganize weibo data

postid_reuid_dict : post_id -> list of [timestamp, userid] reposts
postid_mid_map : mid -> post_id (about 300,000 posts)
lookup chain: post_id -> mid -> news_index id

In [84]:
file_path ='/media/yuting/TOSHIBA EXT/weibo_new/repost_data.txt'
with open(file_path, "r", encoding="gbk") as f:
    lines_repost = f.readlines()

# The file is a sequence of records: a header line "<post_id> <n_reposts> ..."
# followed by n_reposts lines, each stored here as its whitespace-split fields.
postid_reuid_dict = {}
cursor = 0
total_lines = len(lines_repost)
while cursor < total_lines:
    header = lines_repost[cursor].split()
    post_id = header[0]
    n_reposts = int(header[1])
    for offset in range(1, n_reposts + 1):
        postid_reuid_dict.setdefault(post_id, []).append(lines_repost[cursor + offset].split())
    # Jump over the header and its repost lines.
    cursor += 1 + n_reposts

In [18]:
# filter out useful news: keep only the posts that have a title in news_dict
# and attach the repost list of their post_id.
mid_history = {
    mid: postid_reuid_dict[str(post_id)]
    for mid, post_id in postid_mid_map.items()
    if int(mid) in news_dict
}
print(len(postid_reuid_dict))
print(len(mid_history))

300000
13153


In [19]:
# Invert mid_history: for every user, the list of [mid, timestamp] pairs of the
# posts that user reposted.
user_history = {}
for mid, reposts in mid_history.items():
    for record in reposts:
        # record[0] is the timestamp, record[1] the user id.
        user_history.setdefault(record[1], []).append([mid, record[0]])

In [20]:
# set time order
# The timestamps are strings (they come from str.split), so they are compared as
# integers: a plain string sort would misorder values with different digit counts.
user_history = {
    user_id: sorted(events, key=lambda event: int(event[1]))
    for user_id, events in user_history.items()
}
mid_history = {
    mid_key: sorted(reposts, key=lambda repost: int(repost[0]))
    for mid_key, reposts in mid_history.items()
}

In [98]:
# Spot check: number of reposts recorded for one post.
len(mid_history['3509462670824667'])

703

# build up train and test data

In [21]:
# select user with threshold1 nb_history 
def selectUser(user_history_dict, threshold):
    """Return the users whose history has more than ``threshold`` entries.

    ``user_history_dict`` maps a user id to a sized collection; the result
    keeps the dictionary's iteration order. Prints the number of users kept.
    """
    uid = [
        user
        for user, history in user_history_dict.items()
        if len(history) > threshold
    ]
    print(len(uid))
    return uid

In [71]:
# Keep the users with more than 5 reposts in their history; selectUser prints how many remain.
uid = selectUser(user_history,5)

79109


In [23]:
# split train/test user
# One user is drawn for training and one different user for testing.
# NOTE(review): no random seed is set in this notebook, so the draw changes between runs.
uid_train = sample(uid, 1)
test_candidates = list(set(uid) - set(uid_train))
uid_test = sample(test_candidates, 1)
print(len(uid_train), len(uid_test))

1 1


In [24]:
def getNodes(user,mid,mid_history,max_nodes_length=30,pos=True):
    """Return the fixed-length list of node indices of a post's diffusion chain.

    ``mid_history[str(mid)]`` is a list of records whose second field is a
    user id. Each user id is converted to ``int(id) + 1`` (0 is the padding
    value). With ``pos=True`` only the users that appear before ``user`` in
    the chain are used (the whole chain if ``user`` is absent); with
    ``pos=False`` the whole chain is used. The result is truncated or
    right-padded with 0 to ``max_nodes_length`` entries.
    """
    diffusion_chain = mid_history[str(mid)]
    if pos:
        # Cut the chain just before the first repost made by `user`.
        stop = len(diffusion_chain)
        for position, record in enumerate(diffusion_chain):
            if record[1] == user:
                stop = position
                break
        selected = diffusion_chain[:stop]
    else:
        selected = diffusion_chain
    node_list = [int(record[1]) + 1 for record in selected][:max_nodes_length]
    node_list += [0] * (max_nodes_length - len(node_list))
    return node_list

In [25]:
def adoptByHour(diffusion_chain):
    """Count the records of a diffusion chain per one-hour window.

    ``diffusion_chain`` is a list of records whose first field is a timestamp
    in seconds. A window starts at the first remaining record and covers
    every record at most 3600 seconds later; the next window starts at the
    first record beyond that limit. Returns the list of window sizes
    (empty for an empty chain).
    """
    cnt_list = []
    remaining = diffusion_chain
    while remaining:
        window_start = int(remaining[0][0])
        if int(remaining[-1][0]) - window_start <= 3600:
            # Everything that is left fits into one window.
            cnt_list.append(len(remaining))
            break
        # Position of the first record outside the current window. One always
        # exists here because the last record is outside.
        cut = next(
            position
            for position, record in enumerate(remaining)
            if int(record[0]) - window_start > 3600
        )
        cnt_list.append(cut)
        remaining = remaining[cut:]
    return cnt_list

In [26]:
def getAdoption(user,mid,mid_history,max_adopt_length=120,pos=True):
    """Return the fixed-length adoption feature of a post.

    The hourly record counts of the diffusion chain (see ``adoptByHour``) are
    transformed with ``log(log(count) + 1)``, truncated to
    ``max_adopt_length`` values and right-padded with 0.

    With ``pos=True`` only the part of the chain before ``user``'s repost is
    used; ``ValueError`` is raised if ``user`` is not in the chain. With
    ``pos=False`` the whole chain is counted and a random-length prefix of
    the hourly counts (0 to all of them) is kept.
    """
    diffusion_chain = mid_history[str(mid)]
    if pos:
        _, adopters = zip(*diffusion_chain)
        cnt_list = adoptByHour(diffusion_chain[:adopters.index(user)])
    else:
        cnt_list = adoptByHour(diffusion_chain)
        cnt_list = cnt_list[:random.randint(0, len(cnt_list))]
    adopt_list = list(np.log(np.log(cnt_list) + 1)[:max_adopt_length])
    adopt_list += [0] * (max_adopt_length - len(adopt_list))
    return adopt_list

In [27]:
def newsample(nnn, ratio):
    """Draw ``ratio`` items from the list ``nnn`` without replacement.

    When ``nnn`` holds fewer than ``ratio`` items it is first repeated often
    enough to allow the draw, so items can then occur more than once.
    """
    population = nnn
    if len(nnn) < ratio:
        population = nnn * (ratio // len(nnn) + 1)
    return random.sample(population, ratio)

In [28]:
def setWindowSize(history_list,period=6, threshold2=3):
    """Split a time-ordered history into consecutive time windows.

    ``history_list`` is a list of records whose second field is a timestamp
    in seconds; ``period`` is the window length in months of 30 days.

    A window starts at the first remaining record and ends with the first
    record lying ``period`` months or more after that start (this record is
    included in the window). If everything that is left spans less than one
    period it forms the final window. After a window is closed, the
    remaining records are dropped when fewer than ``threshold2`` are left.

    Returns ``(bounds, windows)``: a list of ``(start, end)`` integer
    timestamps and the list of the matching sub-histories.
    """
    window_seconds = period * 30 * 24 * 60 * 60  # period is counted in months
    bounds = []
    windows = []
    remaining = history_list
    while remaining:
        first_ts = int(remaining[0][1])
        if int(remaining[-1][1]) - first_ts < window_seconds:
            # The rest fits into a single window.
            bounds.append((first_ts, int(remaining[-1][1])))
            windows.append(remaining)
            break
        # First record at least one period after the window start. One always
        # exists here because the last record qualifies.
        cut = next(
            position
            for position, record in enumerate(remaining)
            if int(record[1]) - first_ts >= window_seconds
        )
        bounds.append((first_ts, int(remaining[cut][1])))
        windows.append(remaining[:cut + 1])
        rest = remaining[cut + 1:]
        if len(rest) < threshold2:
            break
        remaining = rest
    return bounds, windows

In [29]:
def buid_train_data_T(user,mid_history,poslist,neglist,news_index,nb_history=20, npratio=4, max_nodes_length=30,max_adopt_length=120):
    """Build one training instance (shuffled candidates + user history) for ``user``.

    Parameters
    ----------
    user : str
        User id as it appears in the diffusion chains.
    mid_history : dict
        Post mid -> diffusion chain, as used by ``getNodes``/``getAdoption``.
    poslist : list
        Mids of the posts the user reposted (positives). Must not be empty.
    neglist : list
        Mids the negatives are drawn from.
    news_index : dict
        Maps ``int(mid)`` to a news row index; must also map 0 (padding) to a row.
    nb_history : int
        Number of history posts per instance (padded with 0).
    npratio : int
        Number of negatives per positive.
    max_nodes_length, max_adopt_length : int
        Lengths of the node and adoption features. They are forwarded to
        ``getNodes``/``getAdoption`` so the features and the padding rows
        built here always have the same length.

    Returns
    -------
    tuple
        ``(shuffle_sample, shuffle_sample_node, shuffle_sample_adopt,
        shuffle_label, allpos, allpos_node, allpos_adopt)``: news indices,
        node features, adoption features and 0/1 labels of the
        ``npratio + 1`` shuffled candidates, followed by news indices, node
        features and adoption features of the ``nb_history`` history posts.

    Raises
    ------
    ValueError
        If ``poslist`` is empty.

    Notes
    -----
    NOTE(review): an instance is built for every positive in ``poslist``, but
    the ``return`` sits after the loop, so only the instance of the LAST
    positive is returned. This is kept because the caller expects a single
    instance; confirm whether one instance per positive was intended.
    """
    if len(poslist) == 0:
        raise ValueError("poslist must contain at least one positive sample")

    for pos_sample in poslist:
        # Candidates: npratio negatives followed by the positive.
        pos_neg_sample = newsample(neglist, npratio)
        pos_neg_sample_node = []
        pos_neg_sample_adopt = []
        for neg_sample in pos_neg_sample:
            pos_neg_sample_node.append(
                getNodes(user=user, mid=neg_sample, mid_history=mid_history,
                         max_nodes_length=max_nodes_length, pos=False))
            pos_neg_sample_adopt.append(
                getAdoption(user=user, mid=neg_sample, mid_history=mid_history,
                            max_adopt_length=max_adopt_length, pos=False))
        pos_neg_sample.append(pos_sample)
        pos_neg_sample_node.append(
            getNodes(user=user, mid=pos_sample, mid_history=mid_history,
                     max_nodes_length=max_nodes_length, pos=True))
        pos_neg_sample_adopt.append(
            getAdoption(user=user, mid=pos_sample, mid_history=mid_history,
                        max_adopt_length=max_adopt_length, pos=True))

        # Shuffle the candidates so the positive is not always last.
        temp_label = [0] * npratio + [1]
        temp_id = list(range(npratio + 1))
        random.shuffle(temp_id)

        shuffle_sample = [news_index[int(pos_neg_sample[idx])] for idx in temp_id]  # map to news index
        shuffle_sample_node = [pos_neg_sample_node[idx] for idx in temp_id]
        shuffle_sample_adopt = [pos_neg_sample_adopt[idx] for idx in temp_id]
        shuffle_label = [temp_label[idx] for idx in temp_id]

        # History: up to nb_history other positives of the user, in random order.
        posset = list(set(poslist)-set([pos_sample]))
        allpos = [int(p) for p in random.sample(posset, min(nb_history, len(posset)))[:nb_history]]

        allpos_node = []
        allpos_adopt = []
        for pos in allpos:
            allpos_node.append(
                getNodes(user=user, mid=pos, mid_history=mid_history,
                         max_nodes_length=max_nodes_length, pos=True))
            allpos_adopt.append(
                getAdoption(user=user, mid=pos, mid_history=mid_history,
                            max_adopt_length=max_adopt_length, pos=True))

        # Pad the history to nb_history entries (0 is the padding id).
        n_pad = nb_history - len(allpos)
        allpos += [0] * n_pad
        allpos_node += [[0] * max_nodes_length for _ in range(n_pad)]
        allpos_adopt += [[0] * max_adopt_length for _ in range(n_pad)]

        allpos = [news_index[int(i)] for i in allpos] # mapping to news index

    return shuffle_sample,shuffle_sample_node,shuffle_sample_adopt,shuffle_label,allpos,allpos_node,allpos_adopt

In [30]:
# Training accumulators: the train loop appends one entry to each per (user, time window).
all_train_id = []
all_train_pn = []
all_train_nodes = []
all_train_adopt = [] # adoption features of the candidates
all_label = []

# Test accumulators: the test loop appends one entry to each per candidate post.
all_test_id = []
all_test_pn = []
all_test_node = []
all_test_adopt = [] # adoption features of the candidate
all_test_label = []
all_test_index = [] # [start, end) positions of every session inside the test lists

# History features that go with the entries above.
all_user_pos = []
all_user_pos_nodes = [] ## node in history
all_user_pos_adopt = [] # adoption features of the history posts
all_test_user_pos = []
all_test_user_pos_nodes = []
all_test_user_pos_adopt = [] # adoption features of the history posts
# Sampling and shape settings.
npratio = 4
nb_history = 20
max_nodes_length = 30
max_adopt_length = 120 # length of an adoption feature
nb_neg_sample = 10

for user in uid_train:
    history = user_history[user]
    # split user history under windowsize
    time_list, sub_his_list = setWindowSize(history, period=3, threshold2=3)
    for (window_start, window_end), sub_his in zip(time_list, sub_his_list):
        # Posts whose diffusion overlaps the window are the candidate pool.
        sub_mid = {
            k: v
            for k, v in mid_history.items()
            if int(v[0][0]) <= window_end and int(v[-1][0]) >= window_start
        }
        sub_poslist = [h[0] for h in sub_his]
        sub_neglist = list(set(sub_mid.keys())-set(sub_poslist))

        # Pass the settings of this cell explicitly, so changing them above
        # takes effect instead of the function silently using its defaults.
        (shuffle_sample, shuffle_sample_node, shuffle_sample_adopt, shuffle_label,
         allpos, allpos_node, allpos_adopt) = buid_train_data_T(
            user, sub_mid, sub_poslist, sub_neglist, news_index,
            nb_history=nb_history, npratio=npratio,
            max_nodes_length=max_nodes_length, max_adopt_length=max_adopt_length)

        all_train_pn.append(shuffle_sample)
        all_train_nodes.append(shuffle_sample_node)
        all_train_adopt.append(shuffle_sample_adopt)
        all_label.append(shuffle_label)
        all_train_id.append(user)
        all_user_pos.append(allpos)
        all_user_pos_nodes.append(allpos_node)
        all_user_pos_adopt.append(allpos_adopt)

def _history_features(user, history_mids, sub_mid, news_index,
                      nb_history, max_nodes_length, max_adopt_length):
    """Return (news indices, node features, adoption features) of the history posts.

    All three lists are padded to ``nb_history`` entries; padding uses the
    id 0 and all-zero feature rows. The function uses only its own local
    variables, so building the history can never overwrite the features of
    the candidate that is being processed.
    """
    mids = [int(m) for m in history_mids][:nb_history]
    nodes = []
    adopts = []
    for m in mids:
        nodes.append(getNodes(user=user, mid=m, mid_history=sub_mid,
                              max_nodes_length=max_nodes_length, pos=True))
        adopts.append(getAdoption(user=user, mid=m, mid_history=sub_mid,
                                  max_adopt_length=max_adopt_length, pos=True))
    n_pad = nb_history - len(mids)
    mids += [0] * n_pad
    nodes += [[0] * max_nodes_length for _ in range(n_pad)]
    adopts += [[0] * max_adopt_length for _ in range(n_pad)]
    return [news_index[m] for m in mids], nodes, adopts  # mapping to news index


for user in uid_test:
    test_history = user_history[user]
    test_time_list, test_sub_his_list = setWindowSize(test_history, period=3, threshold2=3)
    for (window_start, window_end), test_sub_his in zip(test_time_list, test_sub_his_list):
        # Posts whose diffusion overlaps the window are the candidate pool.
        test_sub_mid = {
            k: v
            for k, v in mid_history.items()
            if int(v[0][0]) <= window_end and int(v[-1][0]) >= window_start
        }
        test_sub_poslist = [h[0] for h in test_sub_his]
        test_sub_neglist = list(set(test_sub_mid.keys())-set(test_sub_poslist))

        # A session is the slice [start, end) of the test lists built for this window.
        sess_index = [len(all_test_pn)]

        for pos_sample in test_sub_poslist:
            test_pos_node = getNodes(user=user, mid=pos_sample, mid_history=test_sub_mid,
                                     max_nodes_length=max_nodes_length, pos=True)
            test_pos_adopt = getAdoption(user=user, mid=pos_sample, mid_history=test_sub_mid,
                                         max_adopt_length=max_adopt_length, pos=True)

            # History: the other positives of the window, in random order.
            test_posset = list(set(test_sub_poslist)-set([pos_sample]))
            sampled_history = random.sample(test_posset, min(nb_history, len(test_posset)))
            # Fix: the history loop used to reuse test_pos_node/test_pos_adopt, so the
            # features stored for the candidate were those of the last history post.
            test_allpos, test_allpos_node, test_allpos_adopt = _history_features(
                user, sampled_history, test_sub_mid, news_index,
                nb_history, max_nodes_length, max_adopt_length)

            all_test_pn.append(news_index[int(pos_sample)]) # mapping
            all_test_node.append(test_pos_node)
            all_test_adopt.append(test_pos_adopt)
            all_test_label.append(1)
            all_test_id.append(user)
            all_test_user_pos.append(test_allpos)
            all_test_user_pos_nodes.append(test_allpos_node)
            all_test_user_pos_adopt.append(test_allpos_adopt)

        assert len(test_sub_neglist)>=nb_neg_sample
        test_sub_neglist = random.sample(test_sub_neglist,nb_neg_sample)

        for neg_sample in test_sub_neglist:
            test_neg_node = getNodes(user=user, mid=neg_sample, mid_history=test_sub_mid,
                                     max_nodes_length=max_nodes_length, pos=False)
            test_neg_adopt = getAdoption(user=user, mid=neg_sample, mid_history=test_sub_mid,
                                         max_adopt_length=max_adopt_length, pos=False)

            # For a negative candidate the history is drawn from all positives of the window.
            sampled_history = random.sample(test_sub_poslist, min(nb_history, len(test_sub_poslist)))
            test_allpos_neg, test_allpos_neg_node, test_allpos_neg_adopt = _history_features(
                user, sampled_history, test_sub_mid, news_index,
                nb_history, max_nodes_length, max_adopt_length)

            all_test_pn.append(news_index[int(neg_sample)]) # mapping
            all_test_node.append(test_neg_node)
            all_test_adopt.append(test_neg_adopt)
            all_test_label.append(0)
            all_test_id.append(user)
            all_test_user_pos.append(test_allpos_neg)
            all_test_user_pos_nodes.append(test_allpos_neg_node)
            all_test_user_pos_adopt.append(test_allpos_neg_adopt)

        sess_index.append(len(all_test_pn))
        all_test_index.append(sess_index)

In [31]:
# Convert the accumulated Python lists to numpy arrays: int32 for ids, indices and
# labels, float32 for the adoption features.
# NOTE(review): every name is rebound to its array, so this cell expects the lists
# built by the previous cell; the user ids appended there are strings and are
# presumably converted to integers by dtype='int32' (confirm).
all_train_pn = np.array(all_train_pn, dtype='int32')
all_train_nodes = np.array(all_train_nodes,dtype='int32')
all_train_adopt = np.array(all_train_adopt,dtype='float32') # adoption features
all_label = np.array(all_label, dtype='int32')
all_train_id = np.array(all_train_id, dtype='int32')
all_user_pos = np.array(all_user_pos, dtype='int32')
all_user_pos_nodes = np.array(all_user_pos_nodes,dtype='int32')
all_user_pos_adopt = np.array(all_user_pos_adopt,dtype='float32') # adoption features

all_test_pn = np.array(all_test_pn, dtype='int32')
all_test_node = np.array(all_test_node,dtype='int32')
all_test_adopt = np.array(all_test_adopt,dtype='float32') # adoption features
all_test_label = np.array(all_test_label, dtype='int32')
all_test_id = np.array(all_test_id, dtype='int32')
all_test_user_pos = np.array(all_test_user_pos, dtype='int32')
all_test_user_pos_nodes = np.array(all_test_user_pos_nodes,dtype='int32')
all_test_user_pos_adopt = np.array(all_test_user_pos_adopt,dtype='float32') # adoption features

# store data

In [38]:
# Hand every array of the data set to save_pkl (imported from utils) under the name
# 'data_T_3m_nb10_120_test_1'.
# NOTE(review): save_pkl is not visible here; presumably it pickles its arguments in
# this order, so the loader must unpack them in the same order. Confirm in utils.
save_pkl('data_T_3m_nb10_120_test_1', news_words,word_embedding_matrix,node_embedding_matrix,\
         all_user_pos,all_user_pos_nodes,all_user_pos_adopt,all_train_pn,all_train_nodes,all_train_adopt,all_label,all_train_id,\
         all_test_user_pos,all_test_user_pos_nodes,all_test_user_pos_adopt,all_test_pn,all_test_node,all_test_adopt,all_test_label,all_test_id,
         all_test_index)