In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
tic = time.time()
DATA_PATH = '../pkl/'
SAVE_PATH = './feats/'
if not os.path.exists(SAVE_PATH):
    print('create dir: %s' % SAVE_PATH)
    os.mkdir(SAVE_PATH)

print('gen_inv_time_stat...')

gen_inv_time_stat...


In [3]:
data = pd.read_pickle(os.path.join(DATA_PATH, 'invite_data.pkl'))
prev_ans_times_st = pd.read_pickle(os.path.join(SAVE_PATH, 'prev_ans_times_st.pkl'))
data = pd.concat([data, prev_ans_times_st], axis=1)

In [4]:
tmp = data.groupby('qid')['itime'].agg({'qtime_std': 'std', 'qtime_mean': 'mean'}).reset_index()
data = data.merge(tmp, 'left', 'qid')
tmp = data.groupby('uid')['itime'].agg({'utime_std': 'std', 'utime_mean': 'mean'}).reset_index()
data = data.merge(tmp, 'left', 'uid')

In [5]:
data['prev_ans_times_min_gap'] = data['itime'] - data['prev_ans_times_min']
data['prev_ans_times_mean_gap'] = data['itime'] - data['prev_ans_times_mean']
data['iweek'] = data['iday'] % 7

In [6]:
# 距离上一次问题曝光时间
tmp = data[['qid', 'itime']]
tmp['qid_time'] = tmp['qid'].astype(str) + '_' + tmp['itime'].astype(str)
tmp.drop_duplicates(subset='qid_time', keep='first', inplace=True)
tmp.sort_values(by=['qid', 'itime'], inplace=True)

tmp['last_itime'] = tmp.groupby('qid')['itime'].shift()
tmp['llast_itime'] = tmp.groupby('qid')['last_itime'].shift()

tmp['qlast_itime_gap'] = tmp['itime'] - tmp['last_itime']
tmp['qllast_itime_gap'] = tmp['itime'] - tmp['llast_itime']
tmp['qlllast_itime_gap'] = tmp['last_itime'] - tmp['llast_itime']

data['qid_time'] = data['qid'].astype(str) + '_' + data['itime'].astype(str)
data = data.merge(tmp[['qid_time', 'qlast_itime_gap', 'qllast_itime_gap', 'qlllast_itime_gap']], 'left', 'qid_time')
data.head(2)

Unnamed: 0,iday,ihour,itime,label,qid,uid,prev_ans_times_min,prev_ans_times_mean,prev_ans_times_std,qtime_std,qtime_mean,utime_std,utime_mean,prev_ans_times_min_gap,prev_ans_times_mean_gap,iweek,qid_time,qlast_itime_gap,qllast_itime_gap,qlllast_itime_gap
0,3865,22,92782,0.0,2166419046,401693808,,,,34.050627,92753.534884,175.642535,92708.4,,,1,2166419046_92782,3.0,7.0,4.0
1,3844,11,92267,0.0,1550017551,3392373099,,,,77.516511,92315.333333,90.753099,92307.875,,,1,1550017551_92267,3.0,26.0,23.0


In [7]:
# 距离下一次问题曝光时间
tmp = data[['qid', 'itime']]
tmp['qid_time'] = tmp['qid'].astype(str) + '_' + tmp['itime'].astype(str)
tmp.drop_duplicates(subset='qid_time', keep='first', inplace=True)
tmp.sort_values(by=['qid', 'itime'], ascending=False, inplace=True)

tmp['next_itime'] = tmp.groupby('qid')['itime'].shift()
tmp['nnext_itime'] = tmp.groupby('qid')['next_itime'].shift()

tmp['qnext_itime_gap'] = tmp['itime'] - tmp['next_itime']
tmp['qnnext_itime_gap'] = tmp['itime'] - tmp['nnext_itime']
tmp['qnnnext_itime_gap'] = tmp['next_itime'] - tmp['nnext_itime']

data['qid_time'] = data['qid'].astype(str) + '_' + data['itime'].astype(str)
data = data.merge(tmp[['qid_time', 'qnext_itime_gap', 'qnnext_itime_gap', 'qnnnext_itime_gap']], 'left', 'qid_time')
data.head(2)

Unnamed: 0,iday,ihour,itime,label,qid,uid,prev_ans_times_min,prev_ans_times_mean,prev_ans_times_std,qtime_std,...,prev_ans_times_min_gap,prev_ans_times_mean_gap,iweek,qid_time,qlast_itime_gap,qllast_itime_gap,qlllast_itime_gap,qnext_itime_gap,qnnext_itime_gap,qnnnext_itime_gap
0,3865,22,92782,0.0,2166419046,401693808,,,,34.050627,...,,,1,2166419046_92782,3.0,7.0,4.0,-11.0,,
1,3844,11,92267,0.0,1550017551,3392373099,,,,77.516511,...,,,1,1550017551_92267,3.0,26.0,23.0,-14.0,-36.0,-22.0


In [8]:
# 距离上一次用户曝光时间
tmp = data[['uid', 'itime']]
tmp['uid_time'] = tmp['uid'].astype(str) + '_' + tmp['itime'].astype(str)
tmp.drop_duplicates(subset='uid_time', keep='first', inplace=True)
tmp.sort_values(by=['uid', 'itime'], inplace=True)

tmp['last_itime'] = tmp.groupby('uid')['itime'].shift()
tmp['llast_itime'] = tmp.groupby('uid')['last_itime'].shift()

tmp['ulast_itime_gap'] = tmp['itime'] - tmp['last_itime']
tmp['ullast_itime_gap'] = tmp['itime'] - tmp['llast_itime']
tmp['ulllast_itime_gap'] = tmp['last_itime'] - tmp['llast_itime']

data['uid_time'] = data['uid'].astype(str) + '_' + data['itime'].astype(str)
data = data.merge(tmp[['uid_time', 'ulast_itime_gap', 'ullast_itime_gap', 'ulllast_itime_gap']], 'left', 'uid_time')
data.head(2)

Unnamed: 0,iday,ihour,itime,label,qid,uid,prev_ans_times_min,prev_ans_times_mean,prev_ans_times_std,qtime_std,...,qlast_itime_gap,qllast_itime_gap,qlllast_itime_gap,qnext_itime_gap,qnnext_itime_gap,qnnnext_itime_gap,uid_time,ulast_itime_gap,ullast_itime_gap,ulllast_itime_gap
0,3865,22,92782,0.0,2166419046,401693808,,,,34.050627,...,3.0,7.0,4.0,-11.0,,,401693808_92782,144.0,346.0,202.0
1,3844,11,92267,0.0,1550017551,3392373099,,,,77.516511,...,3.0,26.0,23.0,-14.0,-36.0,-22.0,3392373099_92267,45.0,,


In [11]:
# 距离下一次用户曝光时间
tmp = data[['uid', 'itime']]
tmp['uid_time'] = tmp['uid'].astype(str) + '_' + tmp['itime'].astype(str)
tmp.drop_duplicates(subset='uid_time', keep='first', inplace=True)
tmp.sort_values(by=['uid', 'itime'], ascending=False, inplace=True)

tmp['next_itime'] = tmp.groupby('uid')['itime'].shift()
tmp['nnext_itime'] = tmp.groupby('uid')['next_itime'].shift()

tmp['unext_itime_gap'] = tmp['itime'] - tmp['next_itime']
tmp['unnext_itime_gap'] = tmp['itime'] - tmp['nnext_itime']
tmp['unnnext_itime_gap'] = tmp['next_itime'] - tmp['nnext_itime']

data['uid_time'] = data['uid'].astype(str) + '_' + data['itime'].astype(str)
data = data.merge(tmp[['uid_time', 'unext_itime_gap', 'unnext_itime_gap', 'unnnext_itime_gap']], 'left', 'uid_time')
data.head(2)

Unnamed: 0,iday,ihour,itime,label,qid,uid,prev_ans_times_min,prev_ans_times_mean,prev_ans_times_std,qtime_std,...,qnext_itime_gap,qnnext_itime_gap,qnnnext_itime_gap,uid_time,ulast_itime_gap,ullast_itime_gap,ulllast_itime_gap,unext_itime_gap,unnext_itime_gap,unnnext_itime_gap
0,3865,22,92782,0.0,2166419046,401693808,,,,34.050627,...,-11.0,,,401693808_92782,144.0,346.0,202.0,-25.0,-97.0,-72.0
1,3844,11,92267,0.0,1550017551,3392373099,,,,77.516511,...,-14.0,-36.0,-22.0,3392373099_92267,45.0,,,-45.0,-68.0,-23.0


In [13]:
data[['prev_ans_times_min_gap', 'prev_ans_times_mean_gap',
      'qtime_std', 'qtime_mean', 'utime_std', 'utime_mean', 'iweek',
      'qlast_itime_gap', 'qllast_itime_gap', 'qlllast_itime_gap',
      'qnext_itime_gap', 'qnnext_itime_gap', 'qnnnext_itime_gap',
      'ulast_itime_gap', 'ullast_itime_gap', 'ulllast_itime_gap',
      'unext_itime_gap', 'unnext_itime_gap', 'unnnext_itime_gap']].to_pickle(os.path.join(SAVE_PATH, 'inv_times_st.pkl'))

In [14]:
toc = time.time()
print('Used time: %d' % int(toc-tic))

Used time: 443
