In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import time

# Wall-clock timestamp (seconds since the epoch) taken at notebook start.
# NOTE(review): time_start is not read again in the visible cells.
time_start=time.time()

In [2]:
# Load the raw tables. The files are whitespace-delimited and column names are
# supplied here via `names`. The separator is written as a raw string because
# '\s' inside a normal string literal is an invalid escape sequence.
train=pd.read_csv('invite_info.txt',sep=r'\s+',names=['qid','uid','time','target'])
test=pd.read_csv('invite_info_evaluate_2_0926.txt',sep=r'\s+',names=['qid','uid','time'])
member_info=pd.read_csv('member_info.txt',sep=r'\s+',names=['uid','sex','key_word','num_level','hot_level','regis_type','regis_platform',
                                                           'look_freq','a','b','c','d','e','A','B','C','D','E','salt','l_topic','topic_n'])
ques_info=pd.read_csv('question_info.txt',sep=r'\s+',names=['qid','qtime','qtitle','qtitlec','qinfo','qinfoc','qtopic'])

# Keep only the keys, the label, and the two topic-list columns needed for the distances.
train=train[['qid','uid','target']]
test=test[['qid','uid']]

# One row per uid / qid (first occurrence wins). Chained instead of inplace=True so
# the de-duplication does not mutate a freshly sliced frame.
member_info=member_info[['uid','l_topic']].drop_duplicates(['uid'])
ques_info=ques_info[['qid','qtopic']].drop_duplicates(['qid'])

# Left joins against unique keys keep the row count and order of train/test.
# validate='many_to_one' makes pandas raise if the lookup keys are not unique;
# the feature rows are later attached back to the invite rows by position.
train=pd.merge(train,member_info,on='uid',how='left',validate='many_to_one')
test=pd.merge(test,member_info,on='uid',how='left',validate='many_to_one')

train=pd.merge(train,ques_info,on='qid',how='left',validate='many_to_one')
test=pd.merge(test,ques_info,on='qid',how='left',validate='many_to_one')

In [3]:
import pickle
def load_obj(name):
    """Unpickle and return the object stored in ``<name>.pkl``.

    Parameters
    ----------
    name : str
        Path of the pickle file without its ``.pkl`` suffix.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while deserialising; only call this
    on files from a trusted source.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
# Topic lookup used by q_fun, which reads entries as topic_w2v[topic_id]['vector'].
# NOTE(review): the file name suggests 64-dimensional vectors -- confirm against the pickle.
topic_w2v=load_obj('topic_vectors_64d')

In [4]:
# Peek at the merged training rows: keys, label and the two comma-separated topic lists.
train.head()

Unnamed: 0,qid,uid,target,l_topic,qtopic
0,Q2166419046,M401693808,0,"T1727,T5310,T3402,T916,T1506,T26329,T7293,T180...","T456,T112,T9566,T5310"
1,Q1550017551,M3392373099,0,"T42595,T3,T8520,T597,T6485,T6212,T25664,T148,T...","T2,T3095"
2,Q604029601,M2317670257,0,"T610,T448,T61,T2801,T9019,T65,T233,T190,T55,T5...","T6090,T2156,T97,T456"
3,Q2350061229,M1618461867,0,"T5,T33331,T2274,T31,T245,T516,T309,T1326,T119,...",T856
4,Q2443223942,M3544409350,0,-1,"T26,T76,T17"


In [6]:
from tqdm import tqdm
# Enable the progress_apply methods that later cells call on pandas objects.
tqdm.pandas()

In [7]:
def q_fun(df):
    """Compute six embedding distances between member topics and question topics.

    For every row the comma-separated topic ids in ``l_topic`` and ``qtopic`` are
    looked up in the module-level ``topic_w2v`` mapping, summed and L2-normalised
    into one vector per side, and the two vectors are compared.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``l_topic`` and ``qtopic`` columns. Values are passed through
        ``str`` before splitting, so missing values simply match no topic.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh 0..n-1 index and
        integer columns 0..5 holding
        [cosine, cityblock, canberra, euclidean, minkowski(p=3), braycurtis].
        cosine and braycurtis are NaN wherever scipy yields NaN (for example when
        one side is an all-zero vector).
    """
    print('start')

    def sent2vec(s):
        """Return the L2-normalised sum of the known topic vectors in ``s``.

        Returns None when none of the ids has a usable vector.
        """
        vectors = []
        for w in str(s).split(','):
            try:
                vectors.append(topic_w2v[w]['vector'])
            except (LookupError, TypeError):
                # Unknown topic id (or an entry without a usable 'vector'): skip it.
                # Narrower than a bare except so KeyboardInterrupt still propagates.
                continue
        if not vectors:
            return None
        v = np.array(vectors).sum(axis=0)
        with np.errstate(divide='ignore', invalid='ignore'):
            unit = v / np.sqrt((v ** 2).sum())
        # A zero-norm sum divides to NaN; map it to zeros.
        return np.nan_to_num(unit)

    def w2v_distances(q, a):
        """Return the six distances between the vectors of topic lists ``q`` and ``a``."""
        x = sent2vec(q)
        y = sent2vec(a)
        # A side without any known topic is represented by a zero vector shaped like
        # the other side. An empty sum would otherwise be a 0-d scalar, which scipy's
        # distance functions do not accept as a vector.
        if x is None and y is None:
            x = y = np.zeros(1)
        elif x is None:
            x = np.zeros_like(y)
        elif y is None:
            y = np.zeros_like(x)
        # Zero vectors make cosine / braycurtis divide 0 by 0; the NaN result is
        # kept, only the floating-point warning is silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            return [cosine(x, y), cityblock(x, y), canberra(x, y),
                    euclidean(x, y), minkowski(x, y, 3), braycurtis(x, y)]

    # Build the rows from return values instead of appending to a shared list from
    # inside DataFrame.apply: older pandas documented that apply may call the
    # function twice on the first row, which would misalign side-effect results.
    rows = [
        w2v_distances(q, a)
        for q, a in tqdm(zip(df['l_topic'], df['qtopic']), total=len(df))
    ]
    # Explicit columns keep the 0..5 layout even when df has no rows.
    return pd.DataFrame(rows, columns=range(6))

In [8]:
# Distance features for the training rows (the recorded run took about 1h09m for 9,489,162 rows).
train_fea=q_fun(train)

  0%|          | 0/9489162 [00:00<?, ?it/s]

start


  
  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()
100%|██████████| 9489162/9489162 [1:08:43<00:00, 2301.40it/s]


In [9]:
# Re-display train after q_fun; the recorded output matches the earlier head() display.
train.head()

Unnamed: 0,qid,uid,target,l_topic,qtopic
0,Q2166419046,M401693808,0,"T1727,T5310,T3402,T916,T1506,T26329,T7293,T180...","T456,T112,T9566,T5310"
1,Q1550017551,M3392373099,0,"T42595,T3,T8520,T597,T6485,T6212,T25664,T148,T...","T2,T3095"
2,Q604029601,M2317670257,0,"T610,T448,T61,T2801,T9019,T65,T233,T190,T55,T5...","T6090,T2156,T97,T456"
3,Q2350061229,M1618461867,0,"T5,T33331,T2274,T31,T245,T516,T309,T1326,T119,...",T856
4,Q2443223942,M3544409350,0,-1,"T26,T76,T17"


In [10]:
# Columns 0..5 are, in order: cosine, cityblock, canberra, euclidean, minkowski(p=3), braycurtis.
# In the recorded output row 4 (l_topic == -1) has a blank cosine value.
train_fea.head()

Unnamed: 0,0,1,2,3,4,5
0,0.611698,7.435055,40.985776,1.106072,0.621092,0.678906
1,1.048406,9.32195,46.302877,1.448037,0.840517,1.089007
2,1.168872,9.918149,50.857584,1.528968,0.884653,1.246107
3,0.819608,7.498162,41.49397,1.280319,0.798377,0.776758
4,,6.286499,64.0,1.0,0.5777,1.0


In [11]:
# Same distance features for the evaluation rows (about 7 min for 1,141,718 rows in the recorded run).
test_fea=q_fun(test)

  0%|          | 0/1141718 [00:00<?, ?it/s]

start


  
100%|██████████| 1141718/1141718 [07:16<00:00, 2614.38it/s]


In [15]:
# Peek at the merged evaluation rows (same layout as train, without the target column).
test.head()

Unnamed: 0,qid,uid,l_topic,qtopic
0,Q3273481096,M1267743167,"T80,T182,T5992",T8074
1,Q4224184733,M2715893043,"T21,T22,T355,T6504,T659,T282,T97,T44,T5,T2,T46","T355,T2,T43264,T22,T602"
2,Q1832714071,M2244950365,-1,T26
3,Q3594972263,M2321407666,"T22,T4361,T2801,T5,T740,T18,T219,T1128,T53,T46...","T5,T366,T7370"
4,Q403456350,M1091084170,T1546,"T241,T636"


In [13]:
# Evaluation-side distance features, same column order as train_fea.
test_fea.head()

Unnamed: 0,0,1,2,3,4,5
0,0.95199,8.571484,45.779184,1.379848,0.828582,0.928267
1,0.546938,6.688151,35.733362,1.045885,0.605998,0.581101
2,,6.290456,64.0,1.0,0.588785,1.0
3,0.793505,8.203265,42.670239,1.259766,0.73371,0.785964
4,0.715753,7.679626,40.843853,1.196455,0.690655,0.723324


In [14]:
# Total number of feature rows; the recorded 10630880 equals 9489162 train + 1141718 test rows.
len(train_fea)+len(test_fea)

10630880

In [16]:
# Name the six distance columns in the order q_fun emits them:
# cosine, cityblock, canberra, euclidean, minkowski(p=3), braycurtis ('t_bary').
train_fea.columns=['t_cos','t_city','t_canb','t_euc','t_mink','t_bary']

In [17]:
# Same column names as train_fea so the two frames stack column-for-column.
test_fea.columns=['t_cos','t_city','t_canb','t_euc','t_mink','t_bary']

In [18]:
# Stack train features on top of test features (train rows first).
feature_topic_w2vdis=pd.concat([train_fea,test_fea],axis=0)

In [21]:
# Replace the stacked (repeating) index labels with a fresh 0..n-1 index.
feature_topic_w2vdis=feature_topic_w2vdis.reset_index(drop=True)

In [20]:
# Re-read the raw invite tables; this rebinds train/test to frames that still carry
# the 'time' column. The separator is a raw string because '\s' in a normal string
# literal is an invalid escape sequence.
train=pd.read_csv('invite_info.txt',sep=r'\s+',names=['qid','uid','time','target'])
test=pd.read_csv('invite_info_evaluate_2_0926.txt',sep=r'\s+',names=['qid','uid','time'])

In [22]:
# Stack train invites (without the target column) on top of test invites, train first.
# The index is not reset here, so labels repeat until a later cell calls reset_index.
all_data=pd.concat([train[['qid','uid','time']],test])


In [23]:
from tqdm import tqdm
# Repeats the progress_apply registration already performed in an earlier cell.
tqdm.pandas()

In [24]:
# Extract the integer day from timestamps shaped like 'D3865-H22': drop the leading
# character and keep everything before the first '-H'.
all_data['day'] = all_data['time'].progress_apply(
    lambda stamp: int(stamp[1:].partition('-H')[0])
)

100%|██████████| 10630880/10630880 [00:19<00:00, 532880.95it/s]


In [25]:
# Attach the distance features to the invite rows purely by position: both frames are
# stacked train-first and re-indexed 0..n-1 before the column-wise concat.
# A row-count mismatch would not raise in concat; it would pad with NaN and pair
# features with the wrong invites, so fail loudly instead.
if len(all_data)!=len(feature_topic_w2vdis):
    raise ValueError('row count mismatch: all_data has %d rows, feature_topic_w2vdis has %d'
                     % (len(all_data), len(feature_topic_w2vdis)))
all_data=pd.concat([all_data.reset_index(drop=True),feature_topic_w2vdis.reset_index(drop=True)],axis=1)


In [26]:
# Replace every NaN in the frame with 0.
# NOTE(review): t_cos is NaN for rows where one topic list has no usable vector
# (e.g. l_topic == -1). After this fill those rows carry cosine distance 0, which is
# numerically the same as a perfect match and sorts first in an ascending rank.
# Confirm this is intended.
all_data=all_data.fillna(0)

In [27]:
# Inspect the combined frame after the fill; row 4's t_cos now reads 0.000000.
all_data

Unnamed: 0,qid,uid,time,day,t_cos,t_city,t_canb,t_euc,t_mink,t_bary
0,Q2166419046,M401693808,D3865-H22,3865,0.611698,7.435055,40.985776,1.106072,0.621092,0.678906
1,Q1550017551,M3392373099,D3844-H11,3844,1.048406,9.321950,46.302877,1.448037,0.840517,1.089007
2,Q604029601,M2317670257,D3862-H15,3862,1.168872,9.918149,50.857584,1.528968,0.884653,1.246107
3,Q2350061229,M1618461867,D3849-H11,3849,0.819608,7.498162,41.493970,1.280319,0.798377,0.776758
4,Q2443223942,M3544409350,D3867-H4,3867,0.000000,6.286499,64.000000,1.000000,0.577700,1.000000
5,Q640765464,M2818659842,D3841-H16,3841,0.369601,5.431981,34.474224,0.859768,0.504315,0.464283
6,Q795459266,M2818659842,D3861-H20,3861,0.849873,8.428731,45.431358,1.303743,0.752403,0.879627
7,Q190554387,M1581217469,D3850-H8,3850,0.278254,5.036149,34.238376,0.745995,0.424099,0.432920
8,Q1958712851,M3021021791,D3850-H19,3850,0.869792,8.662540,46.672839,1.318933,0.757372,0.992893
9,Q311993584,M1766315480,D3839-H15,3839,0.723637,7.517641,39.942830,1.203027,0.717823,0.700774


In [28]:
# Rank of the cosine distance among the same member's invitations on the same day
# (1 = smallest distance; ties share the lowest rank).
all_data['topic_cos_rank_day'] = (
    all_data
    .groupby(['uid', 'day'])['t_cos']
    .rank(method='min', ascending=True)
)

In [29]:
# Per-(uid, day) ascending rank (ties share the lowest rank) for the remaining five
# distances. The tuple order fixes the order in which the new columns are appended.
for metric in ('bary', 'city', 'canb', 'euc', 'mink'):
    all_data['topic_' + metric + '_rank_day'] = (
        all_data
        .groupby(['uid', 'day'])['t_' + metric]
        .rank(method='min', ascending=True)
    )

In [30]:
# List the columns to confirm the six distance and six rank features are present.
all_data.columns

Index(['qid', 'uid', 'time', 'day', 't_cos', 't_city', 't_canb', 't_euc',
       't_mink', 't_bary', 'topic_cos_rank_day', 'topic_bary_rank_day',
       'topic_city_rank_day', 'topic_canb_rank_day', 'topic_euc_rank_day',
       'topic_mink_rank_day'],
      dtype='object')

In [31]:
# Persist only the twelve feature columns (six distances, six per-day ranks); the
# qid/uid/time/day columns are not written, so consumers must rely on row order.
all_data.loc[:, [
    't_cos', 't_city', 't_canb', 't_euc', 't_mink', 't_bary',
    'topic_cos_rank_day', 'topic_bary_rank_day', 'topic_city_rank_day',
    'topic_canb_rank_day', 'topic_euc_rank_day', 'topic_mink_rank_day',
]].to_hdf('feature_topic_w2vdis.h5', key='data')

In [32]:
# Inspect the final frame with the six per-(uid, day) rank columns appended.
all_data

Unnamed: 0,qid,uid,time,day,t_cos,t_city,t_canb,t_euc,t_mink,t_bary,topic_cos_rank_day,topic_bary_rank_day,topic_city_rank_day,topic_canb_rank_day,topic_euc_rank_day,topic_mink_rank_day
0,Q2166419046,M401693808,D3865-H22,3865,0.611698,7.435055,40.985776,1.106072,0.621092,0.678906,1.0,1.0,1.0,1.0,1.0,1.0
1,Q1550017551,M3392373099,D3844-H11,3844,1.048406,9.321950,46.302877,1.448037,0.840517,1.089007,2.0,2.0,2.0,2.0,2.0,2.0
2,Q604029601,M2317670257,D3862-H15,3862,1.168872,9.918149,50.857584,1.528968,0.884653,1.246107,1.0,1.0,1.0,1.0,1.0,1.0
3,Q2350061229,M1618461867,D3849-H11,3849,0.819608,7.498162,41.493970,1.280319,0.798377,0.776758,1.0,1.0,1.0,1.0,1.0,1.0
4,Q2443223942,M3544409350,D3867-H4,3867,0.000000,6.286499,64.000000,1.000000,0.577700,1.000000,1.0,1.0,1.0,1.0,1.0,1.0
5,Q640765464,M2818659842,D3841-H16,3841,0.369601,5.431981,34.474224,0.859768,0.504315,0.464283,1.0,1.0,1.0,1.0,1.0,1.0
6,Q795459266,M2818659842,D3861-H20,3861,0.849873,8.428731,45.431358,1.303743,0.752403,0.879627,1.0,1.0,1.0,2.0,1.0,1.0
7,Q190554387,M1581217469,D3850-H8,3850,0.278254,5.036149,34.238376,0.745995,0.424099,0.432920,1.0,2.0,2.0,2.0,1.0,1.0
8,Q1958712851,M3021021791,D3850-H19,3850,0.869792,8.662540,46.672839,1.318933,0.757372,0.992893,1.0,1.0,1.0,1.0,1.0,1.0
9,Q311993584,M1766315480,D3839-H15,3839,0.723637,7.517641,39.942830,1.203027,0.717823,0.700774,1.0,1.0,1.0,1.0,1.0,1.0
