In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
# from tqdm.notebook import tqdm
from tqdm import tqdm
import os
import time
import warnings
# Silence every warning for the rest of the notebook (this also hides pandas warnings from later cells).
warnings.filterwarnings('ignore')
# Register tqdm with pandas so that `progress_apply` is available on Series/DataFrame.
tqdm.pandas()

In [2]:
# Directory that holds the raw tab-separated input files.
PATH = '../data/data_set_0926'
# Output directory for the pickled DataFrames.
SAVE_PATH = '../pkl'
if not os.path.exists(SAVE_PATH):
    print('create dir: %s' % SAVE_PATH)
    # makedirs also creates missing parent directories, and exist_ok avoids a
    # crash if the directory appears between the check above and this call.
    os.makedirs(SAVE_PATH, exist_ok=True)

In [3]:
# Parsing helpers: turn raw text fields into lists/arrays and re-encode prefixed ids as integers.
def parse_str(d):
    """Convert a whitespace-separated string of numbers into a float NumPy array."""
    return np.array([float(token) for token in d.split()])

def parse_list_1(d):
    """Parse a comma-separated list of ids that carry a one-character prefix.

    Each item (e.g. ``'T540'``) has its first character dropped and the rest
    converted to ``int``. The missing-value sentinel ``-1`` yields ``[0]``.

    The value is coerced to ``str`` before the sentinel test, so a numeric
    ``-1`` (e.g. if the CSV reader inferred an integer for that cell) is
    treated like the string ``'-1'`` instead of being parsed as the id ``1``.
    """
    text = str(d)
    if text == '-1':
        return [0]
    return [int(item[1:]) for item in text.split(',')]

def parse_list_2(d):
    """Parse a comma-separated list of ids that carry a two-character prefix.

    Each item (e.g. ``'SW211'``) has its first two characters dropped and the
    rest converted to ``int``. The missing-value sentinel ``-1`` yields ``[0]``.

    The value is coerced to ``str`` before the sentinel test, so a numeric
    ``-1`` (e.g. if the CSV reader inferred an integer for that cell) is
    recognised as the sentinel instead of failing on ``int('')``.
    """
    text = str(d)
    if text == '-1':
        return [0]
    return [int(item[2:]) for item in text.split(',')]

def parse_map(d):
    """Parse ``'<prefix><id>:<weight>,...'`` text into an ``{int id: float weight}`` dict.

    The first character of every key is dropped before the ``int`` conversion.
    The missing-value sentinel ``-1`` yields an empty dict.

    Like the ``parse_list_*`` helpers, the value is coerced to ``str`` first,
    so a numeric ``-1`` is handled as the sentinel instead of raising
    ``AttributeError`` on ``.split``.
    """
    text = str(d)
    if text == '-1':
        return {}
    # Split each "key:value" pair once and reuse the parts.
    split_pairs = (pair.split(':') for pair in text.split(','))
    return {int(parts[0][1:]): float(parts[1]) for parts in split_pairs}

### 1.single word

In [4]:
# Read the single-word vector file: headerless, tab-separated, one (id, vector text) row per line.
single_word = pd.read_csv(
    os.path.join(PATH, 'single_word_vectors_64d.txt'),
    sep='\t',
    names=['id', 'vector'],
)
single_word.head(2)

Unnamed: 0,id,vector
0,SW1,-0.985937 0.11307016 0.012898494 -0.6822068 -0...
1,SW2,-0.3367663 0.039051324 0.8155926 0.8351733 -0....


In [5]:
# Vector text -> float array; id text -> integer with the two-character prefix dropped.
single_word['vector'] = single_word['vector'].progress_apply(parse_str)
single_word['id'] = single_word['id'].progress_apply(lambda raw_id: int(raw_id[2:]))
single_word.head(2)

100%|██████████| 23239/23239 [00:00<00:00, 45430.81it/s]
100%|██████████| 23239/23239 [00:00<00:00, 587340.05it/s]


Unnamed: 0,id,vector
0,1,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
1,2,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."


In [6]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory, then release the frame.
single_word.to_pickle(os.path.join(SAVE_PATH, 'single_word.pkl'))
del single_word

### 2.word

In [7]:
# Read the word vector file: headerless, tab-separated, one (id, vector text) row per line.
word = pd.read_csv(
    os.path.join(PATH, 'word_vectors_64d.txt'),
    sep='\t',
    names=['id', 'vector'],
)
word.head(2)

Unnamed: 0,id,vector
0,W1,0.12561196 -0.57268924 -0.14478925 -0.05249426...
1,W2,3.224765 2.2482696 -0.511986 -0.5329892 -0.943...


In [8]:
# Vector text -> float array; id text -> integer with the one-character prefix dropped.
word['vector'] = word['vector'].progress_apply(parse_str)
word['id'] = word['id'].progress_apply(lambda raw_id: int(raw_id[1:]))
word.head(2)

100%|██████████| 1762829/1762829 [00:32<00:00, 54229.99it/s]
100%|██████████| 1762829/1762829 [00:02<00:00, 727089.71it/s]


Unnamed: 0,id,vector
0,1,"[0.12561196, -0.57268924, -0.14478925, -0.0524..."
1,2,"[3.224765, 2.2482696, -0.511986, -0.5329892, -..."


In [9]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory, then release the frame.
word.to_pickle(os.path.join(SAVE_PATH, 'word.pkl'))
del word

### 3.topic

In [10]:
# Read the topic vector file: headerless, tab-separated, one (id, vector text) row per line.
topic = pd.read_csv(
    os.path.join(PATH, 'topic_vectors_64d.txt'),
    sep='\t',
    names=['id', 'vector'],
)
topic.head(2)

Unnamed: 0,id,vector
0,T1,0.16508673 -0.0037432343 -0.058245048 -0.00134...
1,T2,1.608256 -1.0515573 -1.1897708 1.1820835 -0.80...


In [11]:
# Vector text -> float array; id text -> integer with the one-character prefix dropped.
topic['vector'] = topic['vector'].progress_apply(parse_str)
topic['id'] = topic['id'].progress_apply(lambda raw_id: int(raw_id[1:]))
topic.head(2)

100%|██████████| 100000/100000 [00:01<00:00, 51750.51it/s]
100%|██████████| 100000/100000 [00:00<00:00, 725039.72it/s]


Unnamed: 0,id,vector
0,1,"[0.16508673, -0.0037432343, -0.058245048, -0.0..."
1,2,"[1.608256, -1.0515573, -1.1897708, 1.1820835, ..."


In [12]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory, then release the frame.
topic.to_pickle(os.path.join(SAVE_PATH, 'topic.pkl'))
del topic

### 4.invite_info

In [4]:
# Read the labelled invitation records (headerless, tab-separated).
invite_info = pd.read_csv(
    os.path.join(PATH, 'invite_info_0926.txt'),
    sep='\t',
    names=['qid', 'uid', 'itime', 'label'],
)
invite_info.head(2)

Unnamed: 0,qid,uid,itime,label
0,Q2166419046,M401693808,D3865-H22,0
1,Q1550017551,M3392373099,D3844-H11,0


In [5]:
# Drop the one-character prefix from both id columns and store them as integers.
invite_info['qid'] = invite_info['qid'].progress_apply(lambda raw_id: int(raw_id[1:]))
invite_info['uid'] = invite_info['uid'].progress_apply(lambda raw_id: int(raw_id[1:]))

100%|██████████| 9489162/9489162 [00:14<00:00, 671097.95it/s]
100%|██████████| 9489162/9489162 [00:13<00:00, 694976.66it/s]


In [6]:
# 'D<day>-H<hour>' -> integer day and hour columns, then overwrite itime with an absolute hour count.
invite_info['iday'] = invite_info['itime'].progress_apply(
    lambda stamp: int(stamp.split('-')[0][1:])
)
invite_info['ihour'] = invite_info['itime'].progress_apply(
    lambda stamp: int(stamp.split('-')[1][1:])
)
invite_info['itime'] = invite_info['iday'] * 24 + invite_info['ihour']
invite_info.head(2)

100%|██████████| 9489162/9489162 [00:15<00:00, 608138.48it/s]
100%|██████████| 9489162/9489162 [00:14<00:00, 633402.26it/s]


Unnamed: 0,qid,uid,itime,label,iday,ihour
0,2166419046,401693808,92782,0,3865,22
1,1550017551,3392373099,92267,0,3844,11


In [7]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory.
invite_info.to_pickle(os.path.join(SAVE_PATH, 'invite_info.pkl'))

### 5.invite_info_evaluate ***

In [8]:
# Read the evaluation invitation records (same layout as invite_info but without a label column).
invite_info_evaluate = pd.read_csv(
    os.path.join(PATH, 'invite_info_evaluate_1_0926.txt'),
    sep='\t',
    names=['qid', 'uid', 'itime'],
)
invite_info_evaluate.head(2)

Unnamed: 0,qid,uid,itime
0,Q1493039281,M64135255,D3870-H9
1,Q2023398782,M2536956560,D3872-H22


In [9]:
# Drop the one-character prefix from both id columns and store them as integers.
invite_info_evaluate['qid'] = invite_info_evaluate['qid'].progress_apply(
    lambda raw_id: int(raw_id[1:])
)
invite_info_evaluate['uid'] = invite_info_evaluate['uid'].progress_apply(
    lambda raw_id: int(raw_id[1:])
)

100%|██████████| 1141683/1141683 [00:01<00:00, 643712.14it/s]
100%|██████████| 1141683/1141683 [00:01<00:00, 585599.59it/s]


In [10]:
# 'D<day>-H<hour>' -> integer day and hour columns, then overwrite itime with an absolute hour count.
invite_info_evaluate['iday'] = invite_info_evaluate['itime'].apply(
    lambda stamp: int(stamp.split('-')[0][1:])
)
invite_info_evaluate['ihour'] = invite_info_evaluate['itime'].apply(
    lambda stamp: int(stamp.split('-')[1][1:])
)
invite_info_evaluate['itime'] = (
    invite_info_evaluate['iday'] * 24 + invite_info_evaluate['ihour']
)
invite_info_evaluate.head(2)

Unnamed: 0,qid,uid,itime,iday,ihour
0,1493039281,64135255,92889,3870,9
1,2023398782,2536956560,92950,3872,22


In [11]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory.
invite_info_evaluate.to_pickle(os.path.join(SAVE_PATH, 'invite_info_evaluate.pkl'))

In [13]:
# Stack the labelled and evaluation invitations into one frame with a fresh 0..n-1 index.
data = pd.concat([invite_info, invite_info_evaluate], ignore_index=True)

In [14]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory.
data.to_pickle(os.path.join(SAVE_PATH, 'invite_data.pkl'))

In [15]:
# Give every invitation a positional id, then build a (id, uid, itime) table ordered per user by time.
data['id'] = np.arange(len(data))
# Chain sort_values instead of sorting a column subset in place: the in-place form
# is the pattern pandas flags with SettingWithCopyWarning, while the chained form
# returns the same sorted frame without mutating an intermediate.
inv = data[['id', 'uid', 'itime']].sort_values(by=['uid', 'itime'])

In [16]:
# The original single-argument os.path.join was a no-op; join the output directory and file name explicitly.
inv.to_pickle(os.path.join(SAVE_PATH, 'inv_time.pkl'))

In [20]:
# Drop the invitation frames that have already been pickled.
del invite_info, invite_info_evaluate, data, inv

### 6.user

In [21]:
# Read the member table (headerless, tab-separated) and name its 21 columns.
user_info = pd.read_csv(
    os.path.join(PATH, 'member_info_0926.txt'),
    sep='\t',
    names=[
        'uid', 'gender', 'keyword', 'grade', 'hotness', 'reg_type', 'reg_plat', 'freq',
        'A1', 'B1', 'C1', 'D1', 'E1',
        'A2', 'B2', 'C2', 'D2', 'E2',
        'score', 'topic_a', 'topic_i',
    ],
)
user_info.head(2)

Unnamed: 0,uid,gender,keyword,grade,hotness,reg_type,reg_plat,freq,A1,B1,...,D1,E1,A2,B2,C2,D2,E2,score,topic_a,topic_i
0,M1934753188,male,-1,0.0,0.0,unknown,unknown,monthly,0,1,...,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,T540,"T21107:1.7915097,T405:1.6123838,T4436:1.518003..."
1,M595924114,male,-1,0.0,0.0,unknown,unknown,daily,0,0,...,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"T44126,T15940,T839,T8978,T2934,T1113,T3914,T12...","T18016:2.0650618,T2384:1.2503042,T1142:1.13569..."


In [22]:
# topic_a is a plain id list; topic_i is an id:weight map. Parse each with its matching helper.
for column, parser in (('topic_a', parse_list_1), ('topic_i', parse_map)):
    user_info[column] = user_info[column].progress_apply(parser)

100%|██████████| 1931654/1931654 [00:16<00:00, 114493.60it/s]
100%|██████████| 1931654/1931654 [00:15<00:00, 121937.11it/s]


In [23]:
# Split the topic_i dict into two parallel list columns: its keys and its values.
topic_key_views = user_info['topic_i'].progress_apply(dict.keys)
user_info['topic_ik'] = topic_key_views.apply(list)
topic_value_views = user_info['topic_i'].progress_apply(dict.values)
user_info['topic_iv'] = topic_value_views.apply(list)
del topic_key_views, topic_value_views

100%|██████████| 1931654/1931654 [00:03<00:00, 577539.74it/s]
100%|██████████| 1931654/1931654 [00:02<00:00, 776614.01it/s] 


In [24]:
# Drop the one-character prefix from the member id and store it as an integer.
user_info['uid'] = user_info['uid'].progress_apply(lambda raw_id: int(raw_id[1:]))

100%|██████████| 1931654/1931654 [00:02<00:00, 743680.18it/s]


In [25]:
# Preview the first two rows after parsing.
user_info.head(2)

Unnamed: 0,uid,gender,keyword,grade,hotness,reg_type,reg_plat,freq,A1,B1,...,A2,B2,C2,D2,E2,score,topic_a,topic_i,topic_ik,topic_iv
0,1934753188,male,-1,0.0,0.0,unknown,unknown,monthly,0,1,...,MD470265,BR470265,PV929066,CT929066,PF470265,764,[540],"{21107: 1.7915097, 405: 1.6123838, 4436: 1.518...","[21107, 405, 4436, 8788, 3435, 19450, 3702, 86...","[1.7915097, 1.6123838, 1.5180033, 1.4733534, 1..."
1,595924114,male,-1,0.0,0.0,unknown,unknown,daily,0,0,...,MD195122,BR596936,PV002320,CT840234,PF470265,671,"[44126, 15940, 839, 8978, 2934, 1113, 3914, 12...","{18016: 2.0650618, 2384: 1.2503042, 1142: 1.13...","[18016, 2384, 1142, 231, 13555, 1981, 5955, 96...","[2.0650618, 1.2503042, 1.1356933, 1.1340009, 1..."


In [26]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory.
user_info.to_pickle(os.path.join(SAVE_PATH, 'user_info.pkl'))

In [27]:
# Drop the reference to the member frame now that it has been pickled.
del user_info

### 7.question

In [17]:
# Read the question table (headerless, tab-separated).
question_info = pd.read_csv(
    os.path.join(PATH, 'question_info_0926.txt'),
    sep='\t',
    names=['qid', 'qtime', 'title_sw', 'title_w', 'desc_sw', 'desc_w', 'topic'],
)
question_info.head(2)

Unnamed: 0,qid,qtime,title_sw,title_w,desc_sw,desc_w,topic
0,Q2234111670,D1018-H5,"SW211,SW204,SW1715,SW69,SW2033,SW138,SW57,SW13...","W22414,W963,W10458",-1,-1,"T321,T730,T5784,T4389"
1,Q760329790,D1745-H20,"SW69,SW2033,SW138,SW2616,SW2668,SW36,SW2594,SW...","W12677,W16829,W15201,W6419,W101839","SW146,SW982,SW401,SW297,SW17,SW2616,SW2668,SW3...","W1296,W2118,W12677,W16829,W15201,W6419,W101839...","T278,T12673,T4677"


In [18]:
# Parse every id-list column with the helper that matches its prefix length
# (two characters for the *_sw columns, one character for the others).
for column, parser in (
    ('title_sw', parse_list_2),
    ('title_w', parse_list_1),
    ('desc_sw', parse_list_2),
    ('desc_w', parse_list_1),
    ('topic', parse_list_1),
):
    question_info[column] = question_info[column].progress_apply(parser)
question_info.head(2)

100%|██████████| 1829900/1829900 [00:21<00:00, 83465.99it/s]
100%|██████████| 1829900/1829900 [00:10<00:00, 168624.18it/s]
100%|██████████| 1829900/1829900 [00:48<00:00, 37722.95it/s]
100%|██████████| 1829900/1829900 [00:16<00:00, 108319.98it/s]
100%|██████████| 1829900/1829900 [00:07<00:00, 228741.73it/s]


Unnamed: 0,qid,qtime,title_sw,title_w,desc_sw,desc_w,topic
0,Q2234111670,D1018-H5,"[211, 204, 1715, 69, 2033, 138, 57, 138, 8, 28...","[22414, 963, 10458]",[0],[0],"[321, 730, 5784, 4389]"
1,Q760329790,D1745-H20,"[69, 2033, 138, 2616, 2668, 36, 2594, 1165, 20...","[12677, 16829, 15201, 6419, 101839]","[146, 982, 401, 297, 17, 2616, 2668, 36, 2594,...","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]"


In [19]:
# 'D<day>-H<hour>' -> integer day and hour columns, overwrite qtime with an absolute hour count,
# then drop the one-character prefix from qid.
question_info['qday'] = question_info['qtime'].progress_apply(
    lambda stamp: int(stamp.split('-')[0][1:])
)
question_info['qhour'] = question_info['qtime'].progress_apply(
    lambda stamp: int(stamp.split('-')[1][1:])
)
question_info['qtime'] = question_info['qday'] * 24 + question_info['qhour']
question_info['qid'] = question_info['qid'].progress_apply(lambda raw_id: int(raw_id[1:]))
question_info.head(2)

100%|██████████| 1829900/1829900 [00:02<00:00, 625791.04it/s]
100%|██████████| 1829900/1829900 [00:03<00:00, 545307.67it/s]
100%|██████████| 1829900/1829900 [00:02<00:00, 659439.89it/s]


Unnamed: 0,qid,qtime,title_sw,title_w,desc_sw,desc_w,topic,qday,qhour
0,2234111670,24437,"[211, 204, 1715, 69, 2033, 138, 57, 138, 8, 28...","[22414, 963, 10458]",[0],[0],"[321, 730, 5784, 4389]",1018,5
1,760329790,41900,"[69, 2033, 138, 2616, 2668, 36, 2594, 1165, 20...","[12677, 16829, 15201, 6419, 101839]","[146, 982, 401, 297, 17, 2616, 2668, 36, 2594,...","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20


In [20]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory.
question_info.to_pickle(os.path.join(SAVE_PATH, 'question_info.pkl'))

In [21]:
# Drop the reference to the question frame now that it has been pickled.
del question_info

### 8.answer

In [22]:
%%time
answer_info = pd.read_csv(os.path.join(PATH, 'answer_info_0926.txt'), 
                          names=['aid', 'qid', 'uid', 'atime', 'content_sw', 'content_w', 
                                 'excellent', 'recommend', 'round_table', 'figure', 'video', 
                                 'num_word', 'num_like', 'num_unlike', 'num_comment',
                                 'num_favor', 'num_thank', 'num_report', 'num_nohelp', 'num_oppose'], sep='\t')
answer_info.head(2)

CPU times: user 1min 37s, sys: 7.46 s, total: 1min 45s
Wall time: 2min 15s


Unnamed: 0,aid,qid,uid,atime,content_sw,content_w,excellent,recommend,round_table,figure,video,num_word,num_like,num_unlike,num_comment,num_favor,num_thank,num_report,num_nohelp,num_oppose
0,A2502060945,Q1867533817,M625498202,D3808-H7,"SW13,SW19,SW44,SW150,SW23,SW594,SW1254,SW91,SW...","W239,W10528,W142,W20372,W6473,W10,W24,W4527,W2...",0,0,0,0,0,41,1,0,1,0,1,0,0,0
1,A2847829478,Q3366788616,M142330444,D3810-H17,"SW898,SW3656,SW2,SW413,SW601,SW2,SW2541,SW681,...","W4628,W66060,W1607,W2647,W53385,W109029,W319,W...",0,0,0,0,0,204,1,0,0,3,1,0,0,0


In [23]:
# Parse the two content id-list columns with the helper that matches their prefix length.
for column, parser in (('content_sw', parse_list_2), ('content_w', parse_list_1)):
    answer_info[column] = answer_info[column].progress_apply(parser)
answer_info.head(2)

100%|██████████| 4513735/4513735 [05:57<00:00, 12637.59it/s]
100%|██████████| 4513735/4513735 [02:05<00:00, 36098.64it/s]


Unnamed: 0,aid,qid,uid,atime,content_sw,content_w,excellent,recommend,round_table,figure,video,num_word,num_like,num_unlike,num_comment,num_favor,num_thank,num_report,num_nohelp,num_oppose
0,A2502060945,Q1867533817,M625498202,D3808-H7,"[13, 19, 44, 150, 23, 594, 1254, 91, 3, 87, 48...","[239, 10528, 142, 20372, 6473, 10, 24, 4527, 2...",0,0,0,0,0,41,1,0,1,0,1,0,0,0
1,A2847829478,Q3366788616,M142330444,D3810-H17,"[898, 3656, 2, 413, 601, 2, 2541, 681, 2, 4368...","[4628, 66060, 1607, 2647, 53385, 109029, 319, ...",0,0,0,0,0,204,1,0,0,3,1,0,0,0


In [24]:
# 'D<day>-H<hour>' -> integer day and hour columns, overwrite atime with an absolute hour count.
answer_info['aday'] = answer_info['atime'].progress_apply(
    lambda stamp: int(stamp.split('-')[0][1:])
)
answer_info['ahour'] = answer_info['atime'].progress_apply(
    lambda stamp: int(stamp.split('-')[1][1:])
)
answer_info['atime'] = answer_info['aday'] * 24 + answer_info['ahour']
# Drop the one-character prefix from each id column and store it as an integer.
for id_column in ('aid', 'qid', 'uid'):
    answer_info[id_column] = answer_info[id_column].progress_apply(
        lambda raw_id: int(raw_id[1:])
    )

100%|██████████| 4513735/4513735 [00:07<00:00, 627365.63it/s]
100%|██████████| 4513735/4513735 [00:07<00:00, 590245.64it/s]
100%|██████████| 4513735/4513735 [00:06<00:00, 670168.69it/s]
100%|██████████| 4513735/4513735 [00:07<00:00, 644550.67it/s]
100%|██████████| 4513735/4513735 [00:06<00:00, 683437.60it/s]


In [25]:
# Preview the first two rows after parsing.
answer_info.head(2)

Unnamed: 0,aid,qid,uid,atime,content_sw,content_w,excellent,recommend,round_table,figure,...,num_like,num_unlike,num_comment,num_favor,num_thank,num_report,num_nohelp,num_oppose,aday,ahour
0,2502060945,1867533817,625498202,91399,"[13, 19, 44, 150, 23, 594, 1254, 91, 3, 87, 48...","[239, 10528, 142, 20372, 6473, 10, 24, 4527, 2...",0,0,0,0,...,1,0,1,0,1,0,0,0,3808,7
1,2847829478,3366788616,142330444,91457,"[898, 3656, 2, 413, 601, 2, 2541, 681, 2, 4368...","[4628, 66060, 1607, 2647, 53385, 109029, 319, ...",0,0,0,0,...,1,0,0,3,1,0,0,0,3810,17


In [26]:
# Write into SAVE_PATH rather than a hard-coded copy of that directory.
answer_info.to_pickle(os.path.join(SAVE_PATH, 'answer_info.pkl'))

In [27]:
# Build a (uid, atime) table ordered per user by answer time.
# Chain sort_values instead of sorting a column subset in place: the in-place form
# is the pattern pandas flags with SettingWithCopyWarning, while the chained form
# returns the same sorted frame without mutating an intermediate.
ans = answer_info[['uid', 'atime']].sort_values(by=['uid', 'atime'])

In [28]:
# The original single-argument os.path.join was a no-op; join the output directory and file name explicitly.
ans.to_pickle(os.path.join(SAVE_PATH, 'ans_time.pkl'))

In [29]:
# Save a copy of the answer table without the two content list columns.
# The original single-argument os.path.join was a no-op; join directory and file name explicitly.
answer_info_tiny_path = os.path.join(SAVE_PATH, 'answer_info_tiny.pkl')
answer_info.drop(columns=['content_sw', 'content_w']).to_pickle(answer_info_tiny_path)

In [None]:
# Drop the reference to the answer frame now that it has been pickled.
del answer_info

In [3]:
# invite_info = pd.read_pickle(os.path.join('../pkl', 'invite_info.pkl'))

In [4]:
# invite_info.shape

(9489162, 6)

In [2]:
# topic = pd.read_pickle(os.path.join('../pkl', 'topic.pkl'))
# topic.shape

(100000, 2)