In [1]:
import numpy as np
import pandas as pd
import dgl
import torch

In [2]:
# Load the space-separated live records and name the four columns.
live = pd.read_csv('./input/live.txt', sep=" ", header=None)
live.columns = ["country", "uid", "rid", "duration"]
# Keep only rows whose duration is at least 10.
long_enough = live['duration'] >= 10
live = live.loc[long_enough]
# Row/column shape, then distinct uid and rid counts.
print(live.shape, live['uid'].nunique(), live['rid'].nunique())

(8618431, 4) 1305806 150965


In [20]:
# Load the space-separated sing records and name the four columns.
sing = pd.read_csv('./input/sing.txt', sep=" ", header=None)
sing.columns = ["country", "uid", "sid", "click_pv"]
# Row/column shape, then distinct uid and sid counts.
sing_uid_cnt = sing['uid'].nunique()
sing_sid_cnt = sing['sid'].nunique()
print(sing.shape, sing_uid_cnt, sing_sid_cnt)

(10631819, 4) 3298855 127538


In [2]:
# Load the space-separated friend edge list (one "from to" pair per row).
friend = pd.read_csv('./input/friend.txt', sep=" ", header=None)
friend.columns = ["uid_from", "uid_to"]
# Row/column shape, then distinct ids on each side of the edge.
friend_from_cnt = friend['uid_from'].nunique()
friend_to_cnt = friend['uid_to'].nunique()
print(friend.shape, friend_from_cnt, friend_to_cnt)

(48796703, 2) 3984300 3984430


In [1]:
from graph import read_txt
live, sing, friend = read_txt()

In [27]:
# Users that appear in BOTH the live and the sing tables.
live_users = set(live['uid'].unique())
sing_users = set(sing['uid'].unique())
user_ids = list(live_users & sing_users)
# Marker lookup: uid -> 1 for every kept user (other uids map to NaN).
user_id_map = dict.fromkeys(user_ids, 1)

# Keep only live rows whose uid is a kept user.
live['label'] = live['uid'].map(user_id_map)
live = live[live['label'].notnull()]

# Keep only sing rows whose uid is a kept user.
sing['label'] = sing['uid'].map(user_id_map)
sing = sing[sing['label'].notnull()]

# Keep only friend edges whose two endpoints are both kept users.
friend['label_from'] = friend['uid_from'].map(user_id_map)
friend['label_to'] = friend['uid_to'].map(user_id_map)
friend = friend[friend['label_from'].notnull() & friend['label_to'].notnull()]

In [28]:
# Distinct-id counts after filtering: kept users, rids in live, sids in sing.
rid_cnt = live['rid'].nunique()
sid_cnt = sing['sid'].nunique()
print("uid cnt", len(user_ids), "rid cnt", rid_cnt, "sid cnt", sid_cnt)

uid cnt 469806 rid cnt 122011 sid cnt 87275


In [29]:
# Row counts of the three tables, in the order live, sing, friend.
print("txt cnt:", len(live), len(sing), len(friend))

txt cnt: 3342663 2491114 2035056


In [30]:
def label_encoder(series, return_dict=False):
    """Encode the values of ``series`` as consecutive integers starting at 0.

    Codes follow the order of ``series.unique()``.

    Args:
        series: pandas Series whose values are to be encoded.
        return_dict: when true, also return the code -> original value dict.

    Returns:
        The encoded Series, or ``(encoded_series, code_to_value)`` when
        ``return_dict`` is true.
    """
    values = series.unique()
    value_to_code = {value: code for code, value in enumerate(values)}
    encoded = series.map(value_to_code)
    if not return_dict:
        return encoded
    code_to_value = dict(enumerate(values))
    return encoded, code_to_value

In [31]:
# One shared node-id space over every uid / rid / sid seen in live and sing.
unique_val = pd.concat([live['uid'], live['rid'], sing['uid'], sing['sid']]).unique()
id_map = {raw_id: node_id for node_id, raw_id in enumerate(unique_val)}

# Add a "<column>_node" column holding the shared node id for each raw id column.
for frame, id_columns in (
    (live, ('uid', 'rid')),
    (sing, ('uid', 'sid')),
    (friend, ('uid_from', 'uid_to')),
):
    for id_column in id_columns:
        frame[id_column + '_node'] = frame[id_column].map(id_map)

In [18]:
# (source, destination) node columns. live and sing interactions are added in
# both directions; friend edges are added only as from -> to.
edge_pairs = [
    (live['uid_node'], live['rid_node']),
    (live['rid_node'], live['uid_node']),
    (sing['uid_node'], sing['sid_node']),
    (sing['sid_node'], sing['uid_node']),
    (friend['uid_from_node'], friend['uid_to_node']),
]
src_ids = pd.concat([src for src, _ in edge_pairs]).values
dst_ids = pd.concat([dst for _, dst in edge_pairs]).values
g = dgl.graph((src_ids, dst_ids))
g

In [32]:
# Distinct node ids per role; uid nodes come from both live and sing.
live_uid_nodes = set(live['uid_node'].values)
sing_uid_nodes = set(sing['uid_node'].values)
uids = np.array(list(live_uid_nodes | sing_uid_nodes))
rids = np.array(list(set(live['rid_node'].values)))
sids = np.array(list(set(sing['sid_node'].values)))
# "all_ids" is the raw-id array used to build id_map, so position == node id.
all_id_map = dict(all_ids=unique_val, uids=uids, rids=rids, sids=sids)

In [7]:
# rgcn
# Every relation below connects nodes of the single type 'user', so all id
# columns must share ONE integer id space. Encoding each column on its own
# would give unrelated entities the same node id and give the same user a
# different node id in live, sing and friend.
rgcn_id_columns = [
    (live, 'uid'), (live, 'rid'),
    (sing, 'uid'), (sing, 'sid'),
    (friend, 'uid_from'), (friend, 'uid_to'),
]
rgcn_ids = pd.concat([frame[col] for frame, col in rgcn_id_columns]).unique()
rgcn_id_map = dict(zip(rgcn_ids, range(len(rgcn_ids))))
# Writes the encoded columns uid_, rid_, sid_, uid_from_, uid_to_.
for frame, col in rgcn_id_columns:
    frame[col + '_'] = frame[col].map(rgcn_id_map)

# One relation per edge direction; friend edges are kept as from -> to only.
graph_data = {
    ('user', 'watched', 'user'): (live['uid_'].values, live['rid_'].values),
    ('user', 'watched-by', 'user'): (live['rid_'].values, live['uid_'].values),
    ('user', 'clicked', 'user'): (sing['uid_'].values, sing['sid_'].values),
    ('user', 'clicked-by', 'user'): (sing['sid_'].values, sing['uid_'].values),
    ('user', 'friend', 'user'): (friend['uid_from_'].values, friend['uid_to_'].values),
}
g = dgl.heterograph(graph_data, idtype=torch.int64)

In [16]:
from graph import *
from gensim.models import Word2Vec

In [2]:
g, all_id_map = build_graph()
g

params: duration_min 10
txt cnt: 8618431 10631819 43813457
uid cnt 469806 rid cnt 122011 sid cnt 87275


Graph(num_nodes=618830, num_edges=13702610,
      ndata_schemes={}
      edata_schemes={})

In [None]:
def gen_walks(g):
    """Generate shuffled node2vec random walks starting from every node of ``g``.

    Reads ``num_walks``, ``p``, ``q`` and ``walk_length`` from the enclosing
    module scope; they are not parameters of this function and must be
    defined before it is called.

    Args:
        g: graph passed to ``dgl.sampling.node2vec_random_walk``.

    Returns:
        A list of walks (each a list), shuffled in place with
        ``np.random.shuffle``; ``num_walks`` rounds are concatenated.
    """
    start_nodes = np.arange(g.number_of_nodes())
    all_walks = []
    for _ in range(num_walks):
        round_walks = dgl.sampling.node2vec_random_walk(g, start_nodes, p, q, walk_length)
        all_walks.extend(round_walks.tolist())

    np.random.shuffle(all_walks)
    print("all_walks cnt:", len(all_walks))
    return all_walks

In [97]:
# A single round of node2vec walks from every node, with 20 passed as the
# walk-length argument; `p` and `q` must already be defined in the session.
start_nodes = np.arange(g.number_of_nodes())
all_walks = []
for walk_round in range(1):
    all_walks.extend(dgl.sampling.node2vec_random_walk(g, start_nodes, p, q, 20).tolist())

# Shuffle the order of the walks in place.
np.random.shuffle(all_walks)
print("all_walks cnt:", len(all_walks))

all_walks cnt: 618830


In [98]:
# Train skip-gram (sg=1) embeddings on the walks; each walk is one "sentence"
# and each node id one token.
w2v = Word2Vec(
    sentences=all_walks,
    vector_size=128,
    window=10,
    min_count=1,
    sg=1,
    epochs=2,
    workers=16,
)

In [101]:
w2v.wv.vectors

array([[ 1.5902323e-01, -1.3602446e-01,  4.5898589e-01, ...,
        -4.1525736e-01, -1.7255954e-01,  4.1876376e-01],
       [ 4.7883695e-01,  8.1943911e-01,  1.1761624e+00, ...,
        -2.0655749e+00,  1.6955082e-01,  1.5405567e-02],
       [ 4.0647559e-02,  5.9378367e-02,  1.6537051e-01, ...,
        -4.9101394e-01,  5.5569366e-02,  2.9044294e-01],
       ...,
       [ 2.4958747e-04, -4.9160384e-02,  3.2879256e-02, ...,
        -1.1122543e-02, -4.5592394e-02, -8.9941360e-03],
       [ 6.3927197e-03, -4.7091417e-02,  2.9585339e-02, ...,
        -1.6293906e-02, -4.6675980e-02, -1.5414115e-02],
       [ 7.2562281e-04, -2.4027722e-02,  1.7282929e-02, ...,
        -1.9815629e-03, -1.5019396e-02, -9.3592372e-04]], dtype=float32)

In [108]:
# Pull the rid / sid node-id arrays and the raw-id array out of all_id_map.
rids, sids, all_ids = (all_id_map[name] for name in ('rids', 'sids', 'all_ids'))

In [122]:
# Row of the embedding matrix for each rid node id (tokens are node ids).
rid_rows = [w2v.wv.key_to_index[node_id] for node_id in rids]
# Raw rid for each node id, aligned row-for-row with its embedding.
rid_keys = all_ids[rids]
rid_values = w2v.wv.vectors[rid_rows]

In [125]:
# Row of the embedding matrix for each sid node id (tokens are node ids).
sid_rows = [w2v.wv.key_to_index[node_id] for node_id in sids]
# Raw sid for each node id, aligned row-for-row with its embedding.
sid_keys = all_ids[sids]
sid_values = w2v.wv.vectors[sid_rows]

In [130]:
rid_keys[0], rid_values[0]

(12103424001423753,
 array([-0.14123654, -0.36403435,  0.0481291 , -0.15840092,  0.39886844,
        -0.22068354,  0.25132614, -0.14845279, -0.04883002,  0.14017446,
        -0.00926742,  0.02001654, -0.04000501, -0.18733907,  0.21301775,
         0.27344683, -0.06606819, -0.08680673, -0.3228368 ,  0.19046661,
         0.5991383 ,  0.0910523 , -0.15429671, -0.6948021 , -0.2839669 ,
         0.12220848, -0.20262673,  0.16103297,  0.0689064 ,  0.03683029,
         0.07181509,  0.08739292,  0.10255376,  0.15252829,  0.35021   ,
        -0.0891073 ,  0.2006823 , -0.2826956 ,  0.14040749,  0.01572085,
         0.11303671,  0.3463538 , -0.14814249, -0.12660307,  0.23999949,
        -0.00195891, -0.24725725,  0.09962835,  0.04385688,  0.1650456 ,
         0.2987002 , -0.02947793,  0.17458706,  0.19266203, -0.15722871,
        -0.37672156, -0.03775607,  0.02752581,  0.05644812,  0.10601412,
        -0.24348547, -0.07254626,  0.2853582 ,  0.07726168,  0.2835891 ,
        -0.01840368, -0.0460744

In [132]:
sid_keys[0], sid_values[0]

(6755116273977146,
 array([-6.86888173e-02, -5.84324360e-01,  1.16712011e-01, -2.80860543e-01,
         4.44923699e-01, -4.26281452e-01,  6.24391198e-01, -6.09627187e-01,
        -4.08612899e-02,  4.16321069e-01, -2.66546935e-01, -6.16384577e-03,
        -2.27166384e-01, -3.97930712e-01,  4.25688863e-01,  6.13901854e-01,
        -9.25296471e-02, -1.64467275e-01, -4.22484159e-01,  2.67210305e-01,
         7.07764745e-01,  3.82291853e-01,  5.47312014e-02, -8.28856826e-01,
        -4.05506909e-01,  2.49537244e-01, -3.94689977e-01,  1.91236809e-02,
        -2.24951088e-01, -2.21584022e-01, -5.90593033e-02, -1.93273015e-02,
         1.84065327e-01,  5.27701855e-01,  4.05828178e-01, -4.23578680e-01,
         5.46429574e-01, -1.79198742e-01,  2.63764471e-01,  7.35894963e-02,
        -1.97808817e-01,  2.96653241e-01, -3.19762617e-01, -1.15828648e-01,
         1.64565235e-01, -2.30699833e-02, -3.63790393e-01, -3.14024985e-02,
         5.62122576e-02,  3.77338886e-01,  5.26409984e-01, -9.8567493

In [1]:
from conf import *
import pickle

In [2]:
# Load the pickled (rid_keys, rid_values) pair from rid_output_path.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# rid_output_path is presumably provided by the `conf` star-import; confirm.
with open(rid_output_path, 'rb') as pkl_file:
    rid_keys, rid_values = pickle.load(pkl_file)

In [3]:
rid_keys.shape, rid_values.shape

((122011,), (122011, 128))

In [4]:
# Preview the first ten rid keys (no trailing comma, so the cell shows the
# array itself rather than a one-element tuple).
rid_keys[:10]

array([12103424001423753,  3940649674868954, 10133099162199348,
        5629499491704903,  2814749767187481, 12947848936252679,
       11540474048331467,  3940649680568207, 10977524100097623,
        1125899912554363])

In [24]:
live[live['rid']==7036874424220697]

Unnamed: 0,country,uid,rid,duration
7491395,IN,7881299355383021,7036874424220697,84
11518589,PH,7036874424220697,7036874424220697,485
13572797,IN,3659174699859321,7036874424220697,11


In [23]:
sing[sing['sid']==611752105020452087]

Unnamed: 0,country,uid,sid,click_pv
19,PH,4222124654601872,611752105020452087,1
2179,PH,11821949030527264,611752105020452087,2
5457,GB,9851624260939160,611752105020452087,2
7451,PH,7881299355458467,611752105020452087,2
10990,PH,3096224751420810,611752105020452087,7
...,...,...,...,...
10595080,PH,7599824378807803,611752105020452087,1
10597943,PH,9851624267257667,611752105020452087,1
10601034,HK,2251799820224744,611752105020452087,2
10602003,IN,12947848937364663,611752105020452087,1


In [1]:
import redis

In [2]:
work5redis = redis.StrictRedis(host="sg-prod-research-worker-5", port=6379)

In [4]:
pipeline = work5redis.pipeline(transaction=False)

In [5]:
# Queue a SET of key "zsj" with a 3600-second expiry on the pipeline.
pipeline.set("zsj", 123, ex=3600)

Pipeline<ConnectionPool<Connection<host=sg-prod-research-worker-5,port=6379,db=0>>>

In [6]:
pipeline.execute()

[True]

In [7]:
work5redis.get("zsj")

b'123'

In [8]:
work5redis.ttl("zsj")

3586

In [3]:
work5redis.mget(["zsj", "wyq"])

[b'123', None]

In [3]:
friend.nunique()

uid_from    3984300
uid_to      3984430
dtype: int64

In [5]:
friend.shape

(48796703, 2)

In [6]:
48796703/2

24398351.5

In [10]:
# Distinct ids in the second half of the friend rows; the midpoint is derived
# from the frame length instead of a hard-coded row number.
friend.iloc[len(friend) // 2:].nunique()

uid_from    2823912
uid_to      2886601
dtype: int64

In [None]:
4000000