In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
# Blanket-suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')
# Switch on tqdm's pandas integration.
tqdm.pandas()

  from collections import Mapping, defaultdict


In [2]:
# Wall-clock start; the final cell reports the elapsed time against this value.
tic = time.time()

# Inputs are read from DATA_PATH, generated feature tables are written to SAVE_PATH.
DATA_PATH, SAVE_PATH = '../pkl/', './feats/'

# Create the output directory on first use.
if os.path.exists(SAVE_PATH) is False:
    print('create dir: %s' % SAVE_PATH)
    os.mkdir(SAVE_PATH)

print('gen_topic_sim_stats...')

gen_topic_sim_stats...


In [3]:
# Read the three input tables from DATA_PATH, in the same order as the target names.
data, user_info, question_info = [
    pd.read_pickle(os.path.join(DATA_PATH, pkl_name))
    for pkl_name in ('invite_data.pkl', 'user_info.pkl', 'question_info.pkl')
]

In [4]:
# Key columns of every invite row.
invite_id = data[['uid', 'qid']]

# Left-join the user's topic lists on uid, then the question's topic list on qid.
invite_id_qm = (
    invite_id
    .merge(user_info[['uid', 'topic_a', 'topic_ik']], how='left', on='uid')
    .merge(question_info[['qid', 'topic']], how='left', on='qid')
)

In [5]:
# Preview the first five rows of the merged frame.
invite_id_qm.head(n=5)

Unnamed: 0,uid,qid,topic_a,topic_ik,topic
0,401693808,2166419046,"[1727, 5310, 3402, 916, 1506, 26329, 7293, 180...","[2794, 9701, 9533, 2396, 11120, 3197, 1734, 30...","[456, 112, 9566, 5310]"
1,3392373099,1550017551,"[42595, 3, 8520, 597, 6485, 6212, 25664, 148, ...","[1470, 235, 4692, 2294, 346, 8082, 1219, 2959,...","[2, 3095]"
2,2317670257,604029601,"[610, 448, 61, 2801, 9019, 65, 233, 190, 55, 5...","[13383, 1955, 8943, 5797, 4538, 4192, 7701, 11...","[6090, 2156, 97, 456]"
3,1618461867,2350061229,"[5, 33331, 2274, 31, 245, 516, 309, 1326, 119,...","[671, 3772, 974, 1074, 1918, 124, 1773, 1096, ...",[856]
4,3544409350,2443223942,[0],"[4876, 2467, 245, 68, 556, 42, 8, 825, 227, 637]","[26, 76, 17]"


In [6]:
# Load the topic-pair similarity matrix that get_topic_sim() indexes.
# The path is built from DATA_PATH instead of repeating the '../pkl/' literal, so the
# matrix keeps being read from the same directory as the pickles if DATA_PATH changes.
topic_comb_sim_mat = np.load(os.path.join(DATA_PATH, 'topic_comb_sim_mat.npy'))

In [7]:
def get_topic_sim(i, j):
    """Return the similarity between topic ids ``i`` and ``j``.

    A pair containing id 0 scores 0 and a topic paired with itself scores 1.
    Every other pair is read from ``topic_comb_sim_mat``, using the smaller id
    for the row and the larger id for the column, both shifted down by one.
    """
    if 0 in (i, j):
        return 0
    if i == j:
        return 1
    # Order the pair so the lookup always uses (smaller id, larger id).
    row, col = (j, i) if i > j else (i, j)
    return topic_comb_sim_mat[row - 1][col - 1]

In [8]:
## User-question topic similarity statistics

In [9]:
def process(df):
    """Compute user-vs-question topic similarity statistics for every row of ``df``.

    Each topic id in the row's ``topic`` cell is scored with ``get_topic_sim`` against
    each id in ``topic_a`` and, separately, each id in ``topic_ik``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``topic_a``, ``topic_ik`` and ``topic``; each cell is
        iterated as a sequence of topic ids.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 8)``. Columns 0-3 hold min, max, mean and std of
        the ``topic_a`` similarities, columns 4-7 the same for ``topic_ik``. A row
        that produces no topic pairs gets zeros.
    """
    def _topic_ids(cell):
        # The upstream left merges leave NaN (a float) where a uid/qid had no match.
        # Such a cell cannot be iterated; treat it as "no topics" so the row falls
        # back to zeros like any other row without topic pairs.
        try:
            return list(cell)
        except TypeError:
            return []

    def _summary(sims):
        # Build the array once instead of letting each of the four reductions
        # convert the Python list again; an empty list is summarised as a single 0.
        values = np.asarray(sims if sims else [0])
        return [np.min(values), np.max(values), np.mean(values), np.std(values)]

    topic_qu_sims_st = np.zeros((len(df), 8))
    rows = df[['topic_a', 'topic_ik', 'topic']].values
    for i, (tas, tis, ts) in enumerate(tqdm(rows)):
        tas, tis, ts = _topic_ids(tas), _topic_ids(tis), _topic_ids(ts)
        # Question topic in the outer loop, user topic in the inner loop.
        ass = [get_topic_sim(t, ta) for t in ts for ta in tas]
        iss = [get_topic_sim(t, ti) for t in ts for ti in tis]
        topic_qu_sims_st[i] = _summary(ass) + _summary(iss)
    return topic_qu_sims_st

In [11]:
with mp.Pool(8) as pool:
    # Split row positions with np.array_split and slice the frame with .iloc, which
    # yields the same 8 chunks as splitting the frame directly. Passing the DataFrame
    # itself to np.array_split goes through DataFrame.swapaxes, which pandas deprecated.
    ret = pool.map(
        process,
        [invite_id_qm.iloc[rows]
         for rows in np.array_split(np.arange(len(invite_id_qm)), 8)],
    )
# Stack the per-chunk results back into one array in the original row order.
ret = np.vstack(ret)

In [11]:
# Column order follows the 8-wide rows built by process(): the four topic_a
# statistics first, then the four topic_ik statistics.
cols = [
    'topic_%s_sims_%s' % (src, stat)
    for src in ['a', 'i']
    for stat in ['min', 'max', 'mean', 'std']
]
topic_qu_sims_stat = pd.DataFrame(data=ret, columns=cols)
print(topic_qu_sims_stat.shape)

(10630845, 8)


In [12]:
# Persist the feature table under SAVE_PATH, then show its first two rows.
topic_qu_sims_stat.to_pickle(path=os.path.join(SAVE_PATH, 'topic_qu_sims_stat.pkl'))
topic_qu_sims_stat.head(n=2)

Unnamed: 0,topic_a_sims_min,topic_a_sims_max,topic_a_sims_mean,topic_a_sims_std,topic_i_sims_min,topic_i_sims_max,topic_i_sims_mean,topic_i_sims_std
0,-0.363709,1.0,0.05934,0.216908,-0.170191,0.352119,0.008833,0.097024
1,-0.339155,0.466608,0.022478,0.186707,-0.268557,0.318178,0.071149,0.153256


In [13]:
## Topic similarity statistics between the user's previously answered questions and the current question

In [14]:
# Load the history table, then keep only its 'prev_ans_ques' column (as a one-column frame).
prev_ans_ques = pd.read_pickle(os.path.join(DATA_PATH, 'prev_ans_ques.pkl'))
prev_ans_ques = prev_ans_ques[['prev_ans_ques']]

In [15]:
# Index the question table by qid so per-question lookups can use .loc.
# Guarded so that re-running this cell is a no-op: once 'qid' has become the index it
# is no longer a column, and an unconditional set_index('qid') would raise KeyError.
if 'qid' in question_info.columns:
    question_info = question_info.set_index('qid')
question_info.head(2)

Unnamed: 0_level_0,qtime,title_sw,title_w,desc_sw,desc_w,topic,qday,qhour
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2234111670,24437,"[211, 204, 1715, 69, 2033, 138, 57, 138, 8, 28...","[22414, 963, 10458]",[0],[0],"[321, 730, 5784, 4389]",1018,5
760329790,41900,"[69, 2033, 138, 2616, 2668, 36, 2594, 1165, 20...","[12677, 16829, 15201, 6419, 101839]","[146, 982, 401, 297, 17, 2616, 2668, 36, 2594,...","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20


In [16]:
# Attach the 'prev_ans_ques' column to the uid/qid pairs (column-wise, aligned on index).
# Only uid/qid are taken from invite_id, so re-running this cell rebuilds the same
# three-column frame instead of appending a second 'prev_ans_ques' column.
invite_id = pd.concat([invite_id[['uid', 'qid']], prev_ans_ques], axis=1)
invite_id.head(3)

Unnamed: 0,uid,qid,prev_ans_ques
0,401693808,2166419046,
1,3392373099,1550017551,
2,2317670257,604029601,"[3034061280, 1309562090]"


In [17]:
def process(df):
    """Compute topic similarity between each invited question and the user's history.

    NOTE: this reuses the name ``process`` and therefore replaces the earlier
    user-vs-question ``process`` once this cell has run.

    For every row, the topics of ``qid`` are scored with ``get_topic_sim`` against the
    pooled topics of all questions listed in ``prev_ans_ques``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``qid`` and ``prev_ans_ques``. A ``prev_ans_ques`` cell is
        used only when it is a list of question ids; anything else (e.g. NaN) counts
        as no history. The module-level ``question_info`` must be indexed by qid and
        have a ``topic`` column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 4)`` holding min, max, mean and std of the
        similarities; a row without any topic pair gets zeros.
    """
    prev_topic_sims_st = np.zeros((len(df), 4))
    for i, (q1, q2s) in enumerate(tqdm(df[['qid', 'prev_ans_ques']].values)):
        t1s = question_info.loc[q1, 'topic']
        # History topics below are skipped unless they are lists; give the current
        # question the matching safeguard so a missing (NaN) topic cell produces
        # zeros instead of a TypeError when it is iterated.
        try:
            t1s = list(t1s)
        except TypeError:
            t1s = []

        # Pool the topics of every previously answered question.
        t2s = []
        if isinstance(q2s, list):
            for q2 in q2s:
                ts = question_info.loc[q2, 'topic']
                if isinstance(ts, list):
                    t2s += ts

        ss = [get_topic_sim(t1, t2) for t1 in t1s for t2 in t2s]
        # Build the array once for all four reductions; no pairs -> a single 0.
        values = np.asarray(ss if ss else [0])
        prev_topic_sims_st[i] = [np.min(values), np.max(values),
                                 np.mean(values), np.std(values)]
    return prev_topic_sims_st

In [18]:
with mp.Pool(8) as pool:
    # Split row positions with np.array_split and slice the frame with .iloc, which
    # yields the same 8 chunks as splitting the frame directly. Passing the DataFrame
    # itself to np.array_split goes through DataFrame.swapaxes, which pandas deprecated.
    ret = pool.map(
        process,
        [invite_id.iloc[rows]
         for rows in np.array_split(np.arange(len(invite_id)), 8)],
    )
# Stack the per-chunk results back into one array in the original row order.
ret = np.vstack(ret)

100%|██████████| 1328856/1328856 [06:34<00:00, 3369.78it/s]
100%|██████████| 1328856/1328856 [06:41<00:00, 3310.56it/s]
100%|██████████| 1328856/1328856 [06:33<00:00, 3377.38it/s]
100%|██████████| 1328856/1328856 [06:35<00:00, 3356.99it/s]
100%|██████████| 1328856/1328856 [06:33<00:00, 3380.47it/s]
100%|██████████| 1328855/1328855 [06:37<00:00, 3343.05it/s]
100%|██████████| 1328855/1328855 [06:33<00:00, 3378.32it/s]
100%|██████████| 1328855/1328855 [07:25<00:00, 2984.24it/s]


In [19]:
# One column per statistic, in the order process() writes them.
cols = [
    'prev_topic_sims_%s' % stat
    for stat in ['min', 'max', 'mean', 'std']
]
prev_topic_sims_stat = pd.DataFrame(data=ret, columns=cols)
print(prev_topic_sims_stat.shape)

(10630845, 4)


In [20]:
# Persist the feature table under SAVE_PATH, then show its first five rows.
prev_topic_sims_stat.to_pickle(path=os.path.join(SAVE_PATH, 'prev_topic_sims_stat.pkl'))
prev_topic_sims_stat.head(n=5)

Unnamed: 0,prev_topic_sims_min,prev_topic_sims_max,prev_topic_sims_mean,prev_topic_sims_std
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,-0.067982,0.641366,0.20226,0.182464
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [21]:
# Report the wall-clock time elapsed since tic, truncated to whole seconds.
toc = time.time()
print('Used time: %d' % int(toc - tic))

Used time: 1547
