In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Register tqdm with pandas so the progress_apply calls used in later cells are available.
tqdm.pandas()

In [2]:
# Wall-clock start of the run; the last cell prints the elapsed time relative to it.
tic = time.time()
# Directory the pickled input tables are read from.
DATA_PATH = '../pkl/'
# Directory the generated feature file is written to.
SAVE_PATH = './feats/'
if not os.path.exists(SAVE_PATH):
    print('create dir: %s' % SAVE_PATH)
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory shows up between the check above and this call.
os.makedirs(SAVE_PATH, exist_ok=True)

print('gen_user_question_feat...')

gen_user_question_feat...


In [3]:
# Load the three pickled input tables from DATA_PATH, in the same order as before.
# NOTE: unpickling can execute arbitrary code, so only point this at trusted files.
data, user_info, question_info = (
    pd.read_pickle(os.path.join(DATA_PATH, fname))
    for fname in ('invite_data.pkl', 'user_info.pkl', 'question_info.pkl')
)

In [4]:
# Key columns (user id, question id) of every invitation record.
invite_id = data[['uid', 'qid']]

# Left-join the user's two topic columns on uid, then the question's topic
# column on qid, so each invitation row carries all three topic fields.
invite_id_qm = (
    invite_id
    .merge(user_info[['uid', 'topic_a', 'topic_i']], how='left', on='uid')
    .merge(question_info[['qid', 'topic']], how='left', on='qid')
)

In [5]:
def gc_list(l):
    """Release the objects held by a list and run a garbage-collection pass.

    The previous implementation used ``del`` on the loop variable and on the
    parameter, which only unbinds those local names: the caller's list kept
    every element alive, so nothing was actually freed.  Clearing the list in
    place drops the references the container holds.

    Parameters
    ----------
    l : list
        Container whose elements are no longer needed.  It is emptied in
        place.  Non-list arguments are left untouched and only the collection
        pass is run.

    Returns
    -------
    None
    """
    if isinstance(l, list):
        # Emptying in place is what lets the elements be reclaimed while the
        # caller still holds a reference to the (now empty) list object.
        l.clear()
    gc.collect()

In [6]:
# Intersection of the topics the user follows and the question's topics
def process(df):
    """Return, per row, the list of topics present in both 'topic_a' and 'topic'.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk containing at least the 'topic_a' and 'topic' columns; each cell
        must be an iterable of hashable topic identifiers.

    Returns
    -------
    pandas.Series
        One list per input row, indexed like ``df``.  The order of the items
        inside each list is not defined because it comes from a set.
    """
    return df.progress_apply(lambda row: list(set(row['topic_a']) & set(row['topic'])), axis=1)

n_workers = 8
# Only the two columns the worker reads are sent to the pool; every chunk is
# pickled for transfer, so shipping the whole frame is wasted work.
topic_cols = invite_id_qm[['topic_a', 'topic']]
# Contiguous row ranges, sliced with .iloc instead of np.array_split: splitting
# a DataFrame through NumPy relies on DataFrame.swapaxes, which pandas has
# deprecated (NOTE(review): exact behaviour depends on NumPy/pandas versions).
bounds = [len(topic_cols) * i // n_workers for i in range(n_workers + 1)]
with mp.Pool(n_workers) as pool:
    ret = pool.map(process, [topic_cols.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])])
del topic_cols
# The chunks keep their original index, so the concatenated result aligns row by row.
invite_id_qm['topic_a_com'] = pd.concat(ret)
gc_list(ret)

100%|██████████| 1328856/1328856 [00:58<00:00, 22671.47it/s]
100%|██████████| 1328856/1328856 [00:58<00:00, 22613.06it/s]
100%|██████████| 1328856/1328856 [00:57<00:00, 23044.31it/s]
100%|██████████| 1328856/1328856 [00:59<00:00, 22202.69it/s]
100%|██████████| 1328856/1328856 [00:59<00:00, 22212.64it/s]
100%|██████████| 1328855/1328855 [01:01<00:00, 21538.83it/s]
100%|██████████| 1328855/1328855 [00:59<00:00, 22520.98it/s]
100%|██████████| 1328855/1328855 [01:01<00:00, 21630.94it/s]


In [7]:
# Intersection of the topics the user is interested in and the question's topics
def process(df):
    """Return, per row, the list of topics that are both keys of 'topic_i' and in 'topic'.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk containing at least the 'topic_i' column (a mapping keyed by
        topic, since ``.keys()`` is called on it) and the 'topic' column (an
        iterable of hashable topic identifiers).

    Returns
    -------
    pandas.Series
        One list per input row, indexed like ``df``.  The order of the items
        inside each list is not defined because it comes from a set.
    """
    return df.progress_apply(lambda row: list(set(row['topic_i'].keys()) & set(row['topic'])), axis=1)

n_workers = 8
# Only the two columns the worker reads are sent to the pool; every chunk is
# pickled for transfer, so shipping the whole frame is wasted work.
topic_cols = invite_id_qm[['topic_i', 'topic']]
# Contiguous row ranges, sliced with .iloc instead of np.array_split: splitting
# a DataFrame through NumPy relies on DataFrame.swapaxes, which pandas has
# deprecated (NOTE(review): exact behaviour depends on NumPy/pandas versions).
bounds = [len(topic_cols) * i // n_workers for i in range(n_workers + 1)]
with mp.Pool(n_workers) as pool:
    ret = pool.map(process, [topic_cols.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])])
del topic_cols
# The chunks keep their original index, so the concatenated result aligns row by row.
invite_id_qm['topic_i_com'] = pd.concat(ret)
gc_list(ret)

 59%|█████▉    | 783052/1328855 [00:39<00:23, 23657.39it/s]]
100%|██████████| 1328856/1328856 [01:03<00:00, 21079.32it/s]
100%|██████████| 1328856/1328856 [01:02<00:00, 21293.79it/s]
100%|██████████| 1328856/1328856 [00:59<00:00, 22391.40it/s]
100%|██████████| 1328856/1328856 [01:03<00:00, 20992.18it/s]
100%|██████████| 1328855/1328855 [01:02<00:00, 21386.67it/s]
100%|██████████| 1328855/1328855 [00:59<00:00, 22209.81it/s]
100%|██████████| 1328855/1328855 [00:59<00:00, 22352.09it/s]


In [8]:
# Interest values of the topics shared by the user's interests and the question
def process(df):
    """Return, per row, the 'topic_i' values looked up for each topic in 'topic_i_com'.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk containing at least the 'topic_i' column (a mapping indexed by
        topic) and the 'topic_i_com' column (an iterable of keys present in
        that mapping).

    Returns
    -------
    pandas.Series
        One list of looked-up values per input row, in the order of
        'topic_i_com', indexed like ``df``.
    """
    return df.progress_apply(lambda row: [row['topic_i'][t] for t in row['topic_i_com']], axis=1)

n_workers = 8
# Only the two columns the worker reads are sent to the pool; by this point the
# frame also carries the intersection list columns, which would otherwise be
# pickled and transferred for nothing.
topic_cols = invite_id_qm[['topic_i', 'topic_i_com']]
# Contiguous row ranges, sliced with .iloc instead of np.array_split: splitting
# a DataFrame through NumPy relies on DataFrame.swapaxes, which pandas has
# deprecated (NOTE(review): exact behaviour depends on NumPy/pandas versions).
bounds = [len(topic_cols) * i // n_workers for i in range(n_workers + 1)]
with mp.Pool(n_workers) as pool:
    ret = pool.map(process, [topic_cols.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])])
del topic_cols
# The chunks keep their original index, so the concatenated result aligns row by row.
invite_id_qm['topic_iv_com'] = pd.concat(ret)
gc_list(ret)

 50%|█████     | 665248/1328856 [00:22<00:32, 20273.70it/s]]
100%|██████████| 1328856/1328856 [00:35<00:00, 37580.10it/s]
100%|██████████| 1328856/1328856 [00:34<00:00, 38324.65it/s]
100%|██████████| 1328856/1328856 [00:36<00:00, 36620.79it/s]
100%|██████████| 1328856/1328856 [00:41<00:00, 32142.82it/s]
100%|██████████| 1328855/1328855 [00:34<00:00, 38029.98it/s]
100%|██████████| 1328855/1328855 [00:35<00:00, 37413.09it/s]
100%|██████████| 1328855/1328855 [00:33<00:00, 39594.19it/s]


In [9]:
# Number of shared topics per row, for the followed-topic and the
# interested-topic intersections (stored as 'num_<source column>').
for com_col in ('topic_a_com', 'topic_i_com'):
    invite_id_qm['num_' + com_col] = invite_id_qm[com_col].progress_apply(len)

100%|██████████| 10630845/10630845 [00:11<00:00, 891879.13it/s] 
100%|██████████| 10630845/10630845 [00:11<00:00, 926519.50it/s] 


In [10]:
# Summary statistics of the interest values of the shared topics.
# Rows with no shared topic get the placeholder [0] so every reducer below
# receives a non-empty sequence (all four statistics are then 0 for that row).
invite_id_qm['topic_iv_com'] = invite_id_qm['topic_iv_com'].progress_apply(
    lambda values: values if len(values) != 0 else [0]
)
for stat_name, stat_fn in (('min', np.min), ('max', np.max), ('mean', np.mean), ('std', np.std)):
    invite_id_qm[stat_name + '_topic_iv_com'] = invite_id_qm['topic_iv_com'].progress_apply(stat_fn)

100%|██████████| 10630845/10630845 [00:12<00:00, 871023.60it/s]
100%|██████████| 10630845/10630845 [01:19<00:00, 134411.89it/s]
100%|██████████| 10630845/10630845 [01:18<00:00, 136060.86it/s]
100%|██████████| 10630845/10630845 [02:17<00:00, 77482.62it/s]
100%|██████████| 10630845/10630845 [07:27<00:00, 23776.58it/s]


In [11]:
# Columns exported as the user-question feature set, in output order.
feats = [
    'num_topic_a_com',
    'num_topic_i_com',
    'min_topic_iv_com',
    'max_topic_iv_com',
    'mean_topic_iv_com',
    'std_topic_iv_com',
]
user_question_feat = invite_id_qm[feats]

In [12]:
# Show the first two feature rows as a quick sanity check.
user_question_feat.head(n=2)

Unnamed: 0,num_topic_a_com,num_topic_i_com,min_topic_iv_com,max_topic_iv_com,mean_topic_iv_com,std_topic_iv_com
0,1,0,0.0,0.0,0.0,0.0
1,0,0,0.0,0.0,0.0,0.0


In [13]:
# Write the feature frame as a pickle under SAVE_PATH.
feat_path = os.path.join(SAVE_PATH, 'user_question_feat.pkl')
user_question_feat.to_pickle(feat_path)

In [14]:
# Report the whole-notebook wall-clock duration in whole seconds.
toc = time.time()
elapsed_seconds = int(toc - tic)
print('Used time: %d' % elapsed_seconds)

Used time: 1200
