In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [5]:
import pickle
from tqdm import tqdm, tqdm_notebook, _tqdm_notebook, tqdm_pandas

In [6]:
# Log records show timestamp, level, originating module and message;
# everything at INFO level and above is emitted.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [7]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# NumPy runtime warnings (e.g. invalid value / divide by zero).
warnings.filterwarnings('ignore')

In [8]:
def extract_day(s):
    """Return the integer day encoded in each timestamp string of ``s``.

    For every element, the text before the first '-' is taken, its leading
    character is dropped and the remainder is converted with ``int``
    (e.g. 'D3865-H22' -> 3865).

    Parameters
    ----------
    s : pandas.Series of str

    Returns
    -------
    pandas.Series
        One integer per input element.
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        return int(day_token[1:])

    return s.apply(_day_of)

def extract_hour(s):
    """Return the integer hour encoded in each timestamp string of ``s``.

    For every element, the second '-'-separated field is taken, its leading
    character is dropped and the remainder is converted with ``int``
    (e.g. 'D3865-H22' -> 22).

    Parameters
    ----------
    s : pandas.Series of str

    Returns
    -------
    pandas.Series
        One integer per input element.
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]
        return int(hour_token[1:])

    return s.apply(_hour_of)

In [9]:
def parse_str(d):
    """Convert a whitespace-separated string of numbers into a float array.

    Parameters
    ----------
    d : str
        Numbers separated by any whitespace.

    Returns
    -------
    numpy.ndarray
        One float per token, in input order.
    """
    return np.array([float(token) for token in d.split()])
def parse_list_1(d):
    """Parse a comma-separated list of prefixed ids into a list of ints.

    The literal string '-1' yields the placeholder list ``[0]``. Otherwise the
    value is cast to ``str``, split on ',', and each item is converted to
    ``int`` after dropping its first character (e.g. 'T12,T3' -> [12, 3]).
    """
    if d == '-1':
        return [0]
    return [int(item[1:]) for item in str(d).split(',')]
def parse_map(d):
    """Parse a 'Xid:value,Xid:value,...' string into an ``{int: float}`` dict.

    The literal string '-1' yields an empty dict. For every comma-separated
    entry the part before ':' (minus its first character) becomes the integer
    key and the part after ':' becomes the float value.
    """
    if d == '-1':
        return {}
    parsed = {}
    for entry in d.split(','):
        fields = entry.split(':')
        parsed[int(fields[0][1:])] = float(fields[1])
    return parsed

In [10]:
# Directory that holds the raw data files; the f-string paths below are built from it.
base_path = '../data'

In [11]:
# Load the invitation records (question, invited user, timestamp, label).

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

# The prediction set is read from the sample-submission file and given four
# column names; the fourth one ('pre') is dropped again at the end of this cell.
# NOTE(review): the saved output under this cell shows a column-count ValueError
# that does not match the four names assigned here - it looks stale; re-run to confirm.
# test = pd.read_csv(f'{base_path}/invite_info_evaluate_1_0926.txt', sep='\t', header=None)
test = pd.read_csv(f'{base_path}/sample_sub_1.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt','pre']
logging.info("test %s", test.shape)

# Copy taken before any column is removed, so `sub` still contains 'dt' and 'pre'.
sub = test.copy()

# Row count of the prediction set, checked by asserts later in the notebook.
sub_size = len(sub)

# Split the timestamp string into integer day and hour columns.
train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])
# The raw timestamp and the 'pre' column are not needed as features.
del train['dt'], test['dt'],test['pre']


[2019-12-17 00:49:08,653] INFO in <ipython-input-11-01d39a31e3c0>: invite (9489162, 4)


ValueError: Length mismatch: Expected axis has 4 elements, new values have 3 elements

In [None]:
# Load the question table.
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'q_topic']
# Only the two *_t1 columns are dropped; title_t2 / desc_t2 are kept
# (the disabled line below used to drop all four text columns).
# del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
del ques['title_t1'],ques['desc_t1']
logging.info("ques %s", ques.shape)

# Split the question timestamp into integer day / hour columns, then drop the raw string.
ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
del ques['q_dt']

In [None]:
# Turn the comma-separated id strings of the title, description and topic
# columns into lists of ints (parse_list_1 returns [0] for the '-1' marker).
# Disabled leftovers in the original cell applied parse_list_1 / parse_map to
# two topic columns of `train`; they are not executed.
ques['title_t2'] = ques['title_t2'].apply(parse_list_1)
ques['desc_t2'] = ques['desc_t2'].apply(parse_list_1)
ques['q_topic'] = ques['q_topic'].apply(parse_list_1)
ques.head()

In [None]:
# Load the topic-embedding table (one id and one embedding string per row).
# The path is built from `base_path`, like every other loader in this notebook,
# instead of repeating the '../data' literal.
topicmap = pd.read_csv(f'{base_path}/topic_vectors_64d.txt',
                       names=['id', 'embed'], sep='\t')
# Embedding string -> float array; prefixed id string -> int (first character dropped).
topicmap['embed'] = topicmap['embed'].apply(parse_str)
topicmap['id'] = topicmap['id'].apply(lambda x: int(x[1:]))
topicmap.head()

In [None]:
topicmap.shape

In [None]:
# Lookup table built from `topicmap`: topic id -> embedding array.
topic_vector_dict = dict(zip(np.array(topicmap['id']), np.array(topicmap['embed'])))

In [None]:
def topic2v(x):
    """Average the 64-d topic embeddings of the ids in ``x``.

    Parameters
    ----------
    x : sequence of int
        Topic ids, looked up in the module-level ``topic_vector_dict``.

    Returns
    -------
    numpy.ndarray
        Length-64 vector: the sum of the embeddings that were found, divided
        by ``len(x)``. Ids missing from ``topic_vector_dict`` contribute a zero
        vector but still count in the denominator. An empty ``x`` yields a
        zero vector.

    Notes
    -----
    The previous version guarded against empty input with ``len(tmp) == 0``,
    which can never be true for a 64-element vector, so an empty list produced
    0/0 = NaN. It also tolerated a missing id only in the first position, via a
    bare ``except``, and raised KeyError for a missing id anywhere else.
    """
    if len(x) == 0:
        return np.zeros(64)

    total = np.zeros(64)
    for topic_id in x:
        vec = topic_vector_dict.get(topic_id)
        # Unknown ids (e.g. the 0 placeholder for "no topics") are skipped.
        if vec is not None:
            total = total + vec
    return total / len(x)

In [None]:
# Enable `progress_apply` on pandas objects, then replace each question's list
# of topic ids with the vector returned by topic2v.
tqdm.pandas(desc="topic2v...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

In [None]:
# Load the word-embedding table (one id and one embedding string per row).
# The path is built from `base_path`, like every other loader in this notebook,
# instead of repeating the '../data' literal.
wordmap = pd.read_csv(f'{base_path}/word_vectors_64d.txt',
                      names=['id', 'embed'], sep='\t')
# Embedding string -> float array; prefixed id string -> int (first character dropped).
wordmap['embed'] = wordmap['embed'].apply(parse_str)
wordmap['id'] = wordmap['id'].apply(lambda x: int(x[1:]))
wordmap.head()

In [None]:
# This bare expression is not the cell's last statement, so it is not displayed.
wordmap.shape
# Lookup table built from `wordmap`: word id -> embedding array.
word_vector_dict = dict(zip(np.array(wordmap['id']), np.array(wordmap['embed'])))

In [None]:
def word2v(x):
    """Average the 64-d word embeddings of the ids in ``x``.

    Parameters
    ----------
    x : sequence of int
        Word ids, looked up in the module-level ``word_vector_dict``.

    Returns
    -------
    numpy.ndarray
        Length-64 vector: the sum of the embeddings that were found, divided
        by ``len(x)``. Ids missing from ``word_vector_dict`` contribute a zero
        vector but still count in the denominator. An empty ``x`` yields a
        zero vector.

    Notes
    -----
    The previous version guarded against empty input with ``len(tmp) == 0``,
    which can never be true for a 64-element vector, so an empty list produced
    0/0 = NaN. It also tolerated a missing id only in the first position, via a
    bare ``except``, and raised KeyError for a missing id anywhere else.
    """
    if len(x) == 0:
        return np.zeros(64)

    total = np.zeros(64)
    for word_id in x:
        vec = word_vector_dict.get(word_id)
        # Unknown ids (e.g. the 0 placeholder for "no words") are skipped.
        if vec is not None:
            total = total + vec
    return total / len(x)

In [None]:
# Replace each question's list of title word ids with the vector returned by word2v.
tqdm.pandas(desc="word2v...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

In [None]:
# Replace each question's list of description word ids with the vector returned by word2v.
tqdm.pandas(desc="word2v...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

In [None]:
ques.head()

In [None]:
def wordadd(x):
    """Collapse an embedding vector into a single scalar.

    Parameters
    ----------
    x : numpy.ndarray (or any object exposing ``.sum()``)

    Returns
    -------
    scalar
        ``x.sum() / 64``. The divisor is the fixed constant 64 regardless of
        the length of ``x``.

    Notes
    -----
    The former body wrapped a constant assignment in try/except (the except
    branch was unreachable) and immediately overwrote the value; that dead
    code has been removed without changing the result.
    """
    return x.sum() / 64

In [None]:
# Reduce each title vector to one scalar via wordadd (sum of components / 64).
tqdm.pandas(desc="wordadd...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

In [None]:
# Reduce each description vector to one scalar via wordadd (sum of components / 64).
tqdm.pandas(desc="wordadd...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

In [None]:
# Reduce each question-topic vector to one scalar via wordadd (sum of components / 64).
tqdm.pandas(desc="wordadd...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

In [None]:
ques.head()

In [None]:
# Load the answer table.
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
# The two answer-text columns are not used.
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

# Split the answer timestamp into integer day / hour columns, then drop the raw string.
ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

# Attach the question columns to every answer. pd.merge defaults to an inner
# join, so answers whose qid is absent from `ques` are dropped. `ques` is then
# deleted, which means this cell cannot be re-run without reloading it.
ans = pd.merge(ans, ques, on='qid')
del ques

In [None]:
# Earliest and latest answer day present in `ans`.
print(ans['a_day'].min())
print(ans['a_day'].max())

In [None]:
# Number of days between the question being asked and the answer being written.
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

In [None]:
# Time-window split (all values are day numbers as produced by extract_day).
# train_start, train_end, val_end and label_start are defined here but are not
# referenced anywhere else in the visible notebook.
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

# Label window for training rows: the last 7 days of the invitation data.
label_end = 3867
label_start = label_end - 6

# History windows used to build features for the training rows:
# invitation history = days 3838..3860, answer history = days 3810..3860.
train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

# History windows used to build features for the prediction rows:
# invitation history = days 3845..3867, answer history = days 3817..3867.
val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

# Invitation rows falling inside each history window (both bounds inclusive).
train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

# Training rows: every invitation strictly after the training history window.
train_label = train[(train['day'] > train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s", train_label_feature['day'].min(),
             train_label_feature['day'].max(), train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s", val_label_feature['day'].min(),
             val_label_feature['day'].max(), test['day'].min(), test['day'].max())

In [None]:
# Restrict the answers to the history windows defined above (bounds inclusive).
# Original author's note on the answer day range: 3807~3874.
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s", train_ans_feature.shape, train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s", val_ans_feature.shape, val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

# Answer-level columns intended for per-user / per-question aggregation.
# No active code in the visible notebook reads this list.
fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']


def extract_feature1(target, label_feature, ans_feature):
    """Left-join history statistics onto ``target`` and return the new frame.

    Parameters
    ----------
    target : pandas.DataFrame
        Rows to enrich; must contain 'qid' and 'uid'.
    label_feature : pandas.DataFrame
        Invitation history with 'qid', 'uid' and 'label'.
    ans_feature : pandas.DataFrame
        Answer history with 'qid', 'uid' and 'aid'.

    Returns
    -------
    pandas.DataFrame
        ``target`` plus, in this order:
        q_inv_mean / q_inv_sum / q_inv_std / q_inv_count (label stats per qid),
        u_inv_mean / u_inv_sum / u_inv_std / u_inv_count (label stats per uid),
        q_ans_count (answers per qid) and u_ans_count (answers per uid).
        Keys without history get NaN because every join is a left join.
    """
    stat_names = ['mean', 'sum', 'std', 'count']
    key_prefixes = (('qid', 'q'), ('uid', 'u'))

    # Invitation-label statistics: first per question, then per user.
    for key, prefix in key_prefixes:
        inv_stats = label_feature.groupby(key)['label'].agg(stat_names).reset_index()
        inv_stats.columns = [key] + [f'{prefix}_inv_{stat}' for stat in stat_names]
        target = pd.merge(target, inv_stats, on=key, how='left')

    # Answer counts: first per question, then per user.
    for key, prefix in key_prefixes:
        ans_counts = ans_feature.groupby(key)['aid'].count().reset_index()
        ans_counts.columns = [key, f'{prefix}_ans_count']
        target = pd.merge(target, ans_counts, on=key, how='left')

    # Per-column sum/max/mean aggregates over the answer columns were disabled
    # in the original and remain so.
    return target


# Enrich the training rows and the prediction rows, each from its own
# invitation / answer history window.
train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

In [None]:
# History-feature extraction is done; the left joins must not have changed the
# number of prediction rows.
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
assert len(test) == sub_size

# Load the user (member) table.
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']

In [None]:
# Ordinal encoding of the `freq` column.
# NOTE(review): Series.map yields NaN for any value that is not a key of this
# dict - confirm that the raw data really spells the last category 'unknow'.
dit = {'daily': 4, 'weekly': 3, 'monthly': 2, 'new': 1,'unknow':0}
user['freq'] = user['freq'].map(dit)

In [None]:
# Discard five user columns that are not used as features.
user = user.drop(['creat_keyword','level','hot','reg_type','reg_plat'],axis=1)

In [None]:
user.head()

In [None]:
# follow_topic: id string -> list of int ids ([0] for the '-1' marker).
# inter_topic: 'id:weight' string -> {int id: float weight} ({} for '-1').
user['follow_topic'] = user['follow_topic'].apply(parse_list_1)
user['inter_topic'] = user['inter_topic'].apply(parse_map)

In [None]:
def topic_interest2v(x):
    """Weighted average of topic embeddings for an ``{id: weight}`` mapping.

    Parameters
    ----------
    x : dict
        Topic id -> weight, as produced by ``parse_map``.

    Returns
    -------
    numpy.ndarray
        Length-64 vector: sum of ``topic_vector_dict[id] * weight`` over all
        entries, divided by the number of entries. An empty mapping yields a
        zero vector.

    Raises
    ------
    KeyError
        If an id is not present in the module-level ``topic_vector_dict``.
    """
    if not len(x):
        return np.zeros(64)

    weighted_sum = np.zeros(64)
    for topic_id, weight in x.items():
        weighted_sum = weighted_sum + topic_vector_dict[topic_id] * weight
    return weighted_sum / len(x)

In [None]:
# Replace each user's list of followed topic ids with the vector returned by topic2v.
tqdm.pandas(desc="topic2v...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

In [None]:
# Replace each user's {topic id: weight} dict with the vector returned by topic_interest2v.
tqdm.pandas(desc="topic_interest2v...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:topic_interest2v(x))
user.head()

In [None]:
# Reduce each followed-topic vector to one scalar via wordadd (sum of components / 64).
tqdm.pandas(desc="wordadd...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

In [None]:
# Reduce each interest-topic vector to one scalar via wordadd (sum of components / 64).
tqdm.pandas(desc="wordadd...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

In [None]:
user.head()

In [None]:

logging.info("user %s", user.shape)

# Number of distinct values per user column.
unq = user.nunique()
logging.info("user unq %s", unq)

# Columns holding a single distinct value carry no information: drop them.
for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

# Label-encode every remaining object-typed user column, except the ones
# listed here (uid is needed as the join key further down).
t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid','freq']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

# Integer code for qid, fitted on the union of training and prediction rows.
# NOTE(review): fit() receives str-cast values while transform() receives the
# raw column - equivalent only if qid is already stored as str; confirm.
q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

# Integer code for uid, fitted on the user table only.
# NOTE(review): LabelEncoder.transform raises on labels it has not seen, so
# this presumably relies on every invited uid existing in `user` - verify.
u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# Attach the user columns; the left join keeps every invitation row.
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

# Stack training rows first, prediction rows second; later cells split the
# frame again by position using len(train_label). `test` has no 'label'
# column, so its rows get NaN there.
data = pd.concat((train_label, test), axis=0, sort=True)
# del train_label, test

In [None]:
train_label.head()

In [None]:
test.head()

In [None]:
data.head()

In [None]:
# Count (frequency) encoding of the categorical features.
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    # 1) How often each value occurs in the combined frame.
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    # 2) Values seen fewer than 2 times are merged into one bucket (-1), and
    #    all codes are shifted by +1 so that bucket becomes 0.
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    # 3) Recompute the counts on the re-bucketed codes and min-max scale them.
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    # NOTE(review): if every row ends up with the same count, max == min and
    # this division is 0/0 (NaN) - confirm that cannot happen for these columns.
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

# Leftover heading from the original: "number of times the question was answered"
# (no code follows it).

In [None]:
# Shrink memory use: downcast 64-bit integer and float columns to 32 bits.
# The dtype snapshot `t` is taken once, before any column is converted.
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

# Position of the day within a 7-day cycle.
data['wk'] = data['day'] % 7

# Model inputs: every column except the target, the raw ids and the day number.
feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [None]:
# Train the LightGBM model and write the predictions.
# (The original heading here read "target encoding", but no target encoding is
# performed in this cell.)
logging.info("feature size %s", len(feature_cols))

# Split the stacked frame back by position: the first len(train_label) rows
# are training rows, the rest are prediction rows.
X_train_all = data.iloc[:len(train_label)][feature_cols]
y_train_all = data.iloc[:len(train_label)]['label']
test = data.iloc[len(train_label):]
# del data
assert len(test) == sub_size

logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the first of the five folds is used: the loop exits right after
# binding train_idx / val_idx for it.
for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

# NOTE(review): `silent` and the fit-time `early_stopping_rounds` argument are
# LightGBM-version dependent - confirm against the installed version.
model_lgb = LGBMClassifier(n_estimators=6000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=100)

# Probability of the positive class for every prediction row, assigned by position.
sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]


# `sub` was copied before 'dt' and 'pre' were removed, so the written file
# contains qid, uid, dt, pre and label (no header, no index).
# NOTE(review): confirm the expected submission layout tolerates the 'pre' column.
sub.to_csv('../example/result_1217_1.txt', index=None, header=None, sep='\t')