In [1]:
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Default figure size (width, height) for matplotlib figures.
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# Silence all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Chinese font support for plots.
import matplotlib
# NOTE(review): this backend request comes after pyplot was imported and after
# `%matplotlib inline` earlier in this cell; whether it takes effect (or fails)
# depends on the installed matplotlib version -- confirm it is still wanted.
matplotlib.use('qt4agg')
# Set the default font.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Fix the minus sign '-' being rendered as a box.
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
# Column names for the header-less, tab-separated input files.
# The test file has the same layout minus the trailing 'label' column.
_columns = ['prefix', 'query_prediction', 'title', 'tag', 'label']

train = pd.read_csv('./data/oppo_round1_train_20180926.txt', sep='\t', names=_columns)
val = pd.read_csv('./data/oppo_round1_vali_20180926.txt', sep='\t', names=_columns)
test = pd.read_csv('./data/oppo_round1_test_A_20180926.txt', sep='\t', names=_columns[:-1])

# Row/column counts of each split.
train.shape
val.shape
test.shape

(2000000, 5)

(50000, 5)

(50000, 4)

In [3]:
# Label distribution of the labelled splits.
train['label'].value_counts()
val['label'].value_counts()

0    1250531
1     749469
Name: label, dtype: int64

0    31187
1    18813
Name: label, dtype: int64

In [4]:
# Number of missing values per column in each split.
train.isna().sum()
val.isna().sum()
test.isna().sum()

prefix                  1
query_prediction    37248
title                   0
tag                     0
label                   0
dtype: int64

prefix                0
query_prediction    958
title                 0
tag                   0
label                 0
dtype: int64

prefix                0
query_prediction    964
title                 0
tag                   0
dtype: int64

In [5]:
# Distinct prefixes of each split, collected once so the overlaps below reuse them.
train_prefixes = set(train.prefix.unique())
val_prefixes = set(val.prefix.unique())
test_prefixes = set(test.prefix.unique())

# Distinct train prefixes, then "<distinct in split> / <of those also seen in train>".
len(train_prefixes)
print(len(val_prefixes), '/', len(val_prefixes & train_prefixes))
print(len(test_prefixes), '/', len(test_prefixes & train_prefixes))

# Prefixes shared by the validation and test splits.
len(val_prefixes & test_prefixes)

169913

23908 / 17968
24198 / 18111


8700

In [6]:
import ast  # cell-level import, in the same style as the `import hashlib` cell below


def _parse_query_prediction(raw):
    """Parse one raw ``query_prediction`` cell.

    Parameters
    ----------
    raw : object
        The value read from the file: a string holding a dict literal, or a
        non-string placeholder (e.g. NaN) when the field was missing.

    Returns
    -------
    The parsed Python object for string input, otherwise ``None``.

    Notes
    -----
    The text comes straight from the data files, so it is parsed with
    ``ast.literal_eval`` (literals only) rather than ``eval``, which would
    execute arbitrary expressions embedded in the data.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return None


train.query_prediction = train.query_prediction.apply(_parse_query_prediction)
val.query_prediction = val.query_prediction.apply(_parse_query_prediction)
test.query_prediction = test.query_prediction.apply(_parse_query_prediction)

In [7]:
def _sort_by_rate_desc(preds):
    """Return ``preds`` as a dict ordered from highest to lowest rate.

    Parameters
    ----------
    preds : dict or None
        Parsed ``query_prediction`` mapping of candidate text -> rate, where
        the rate is a numeric string; anything that is not a dict (``None``,
        NaN) is treated as "no predictions".

    Returns
    -------
    dict
        A new dict whose insertion order is descending by numeric rate, or an
        empty dict when there are no predictions.

    Notes
    -----
    The rates are compared as floats. Comparing the raw strings orders them
    lexicographically, which only matches numeric order while every value has
    the same plain-decimal format. Later cells read the first value of this
    dict as the maximum rate, so the ordering must be numeric.
    """
    if not isinstance(preds, dict):
        return {}
    ranked = sorted(preds.items(), key=lambda item: float(item[1]), reverse=True)
    return dict(ranked)


train.query_prediction = train.query_prediction.apply(_sort_by_rate_desc)
val.query_prediction = val.query_prediction.apply(_sort_by_rate_desc)
test.query_prediction = test.query_prediction.apply(_sort_by_rate_desc)

In [8]:
# Number of candidate predictions stored in each row's query_prediction dict.
for frame in (train, val, test):
    frame['dict_len'] = frame.query_prediction.map(len)

In [11]:
# Mean label per (title, tag) pair, computed separately for train and val.
# Written as mean() + rename() instead of agg({'bayes_label': 'mean'}): the
# dict-renaming form on a single selected column was removed in pandas 1.0 and
# raises SpecificationError there. The result has the same columns either way:
# title, tag, bayes_label.
bayes_train = (
    train.groupby(['title', 'tag'], as_index=False)['label']
    .mean()
    .rename(columns={'label': 'bayes_label'})
)
bayes_val = (
    val.groupby(['title', 'tag'], as_index=False)['label']
    .mean()
    .rename(columns={'label': 'bayes_label'})
)

In [12]:
# Attach each split's own per-(title, tag) mean label back onto its rows;
# the test split has no labels and receives a constant 0.
# NOTE(review): train and val are each encoded from their own labels, and test
# gets a constant -- presumably handled downstream; confirm this is intended.
_bayes_keys = ['title', 'tag']
train = pd.merge(train, bayes_train, how='left', on=_bayes_keys)
val = pd.merge(val, bayes_val, how='left', on=_bayes_keys)
test['bayes_label'] = 0

In [13]:
def get_title_rate(x):
    """Look up the rate that ``x.query_prediction`` assigns to ``x.title``.

    ``x`` is a row-like object exposing ``title`` and ``query_prediction``
    (a dict of candidate text -> rate). Returns the rate converted to float
    when the title is one of the candidates, otherwise 0.
    """
    predictions = x.query_prediction
    return float(predictions[x.title]) if x.title in predictions else 0

# Row-wise rate of the row's own title within its query_prediction dict.
for frame in (train, val, test):
    frame['title_rate'] = frame.apply(get_title_rate, axis=1)

In [14]:
# Combined "<title>_<tag>" key for every row of each split.
for frame in (train, val, test):
    frame['title_tag'] = frame['title'] + '_' + frame['tag']

In [15]:
def _is_max_rate(row):
    """Return 1 when the row's title_rate equals the first rate in its
    query_prediction dict, else 0 (also 0 when the dict is empty).

    The first entry is the highest rate only because an earlier cell re-orders
    each dict in descending order.
    """
    predictions = row.query_prediction
    if not predictions:
        return 0
    first_rate = float(next(iter(predictions.values())))
    return 1 if first_rate == row.title_rate else 0


for frame in (train, val, test):
    frame['is_max_rate'] = frame.apply(_is_max_rate, axis=1)

In [None]:
def _max_title_rate(row):
    """Return the first rate of the row's query_prediction dict as a float,
    or 0.5 when the dict is empty.

    The first entry is the highest rate only because an earlier cell re-orders
    each dict in descending order.
    """
    predictions = row.query_prediction
    if not predictions:
        return 0.5
    return float(next(iter(predictions.values())))


for frame in (train, val, test):
    frame['max_title_rate'] = frame.apply(_max_title_rate, axis=1)

In [16]:
def _gap_to_first_rate(row):
    """Return (first rate in query_prediction) - title_rate for the row,
    or 0.5 when the query_prediction dict is empty.

    The first entry is the highest rate only because an earlier cell re-orders
    each dict in descending order.
    """
    predictions = row.query_prediction
    if not predictions:
        return 0.5
    first_rate = float(next(iter(predictions.values())))
    return first_rate - row.title_rate


for frame in (train, val, test):
    frame['title_rate_max_sub'] = frame.apply(_gap_to_first_rate, axis=1)

In [17]:
# Tag every row with its split of origin: 0 = train, 1 = val, -1 = test.
for frame, split_flag in ((train, 0), (val, 1), (test, -1)):
    frame['is_val'] = split_flag

# Stack the three splits (train, then val, then test) under a fresh 0..n-1 index.
df = pd.concat([train, val, test], axis=0, ignore_index=True)
df.shape

(2100000, 12)

In [19]:
import hashlib

def get_user_id(x):
    """Return the MD5 hex digest of ``str(x.query_prediction)`` (UTF-8 encoded).

    Rows whose query_prediction has the same string form get the same id.
    """
    payload = str(x.query_prediction).encode('UTF-8')
    return hashlib.md5(payload).hexdigest()

def get_item_id(x):
    """Return the MD5 hex digest of ``str(x.title_tag)`` (UTF-8 encoded).

    Rows with the same title_tag string get the same id.
    """
    payload = str(x.title_tag).encode('UTF-8')
    return hashlib.md5(payload).hexdigest()

# Derive the hashed id columns row by row: user_id first, then item_id.
for id_column, hasher in (('user_id', get_user_id), ('item_id', get_item_id)):
    df[id_column] = df.apply(hasher, axis=1)

In [16]:
# Spot check: show every validation row for one example prefix.
val.loc[val.prefix == '刺激战场']

Unnamed: 0,prefix,query_prediction,title,tag,label,dict_len,bayes_label,title_rate,title_tag,is_max_rate,title_rate_max_sub,user_id,item_id
40248,刺激战场,"{'刺激战场': '0.113', '刺激战场灵敏度': '0.051', '刺激战场辅助'...",刺激战场官网,网站,0,11,0.25,0.011,刺激战场官网_网站,0,0.102,923d4665cb160f141f4189e9d500e5eb,b189694d54fa156b9ccc89b17ede5745
44470,刺激战场,"{'刺激战场': '0.113', '刺激战场灵敏度': '0.051', '刺激战场辅助'...",绝地求生：刺激战场,百科,1,11,0.307692,0.0,绝地求生：刺激战场_百科,0,0.113,923d4665cb160f141f4189e9d500e5eb,9c5d918d17320fdfc7424ac861f1de91
44688,刺激战场,"{'刺激战场': '0.113', '刺激战场灵敏度': '0.051', '刺激战场辅助'...",刺激战场官网,网站,1,11,0.25,0.011,刺激战场官网_网站,0,0.102,923d4665cb160f141f4189e9d500e5eb,b189694d54fa156b9ccc89b17ede5745
46181,刺激战场,"{'刺激战场': '0.113', '刺激战场灵敏度': '0.051', '刺激战场辅助'...",绝地求生：刺激战场,百科,0,11,0.307692,0.0,绝地求生：刺激战场_百科,0,0.113,923d4665cb160f141f4189e9d500e5eb,9c5d918d17320fdfc7424ac861f1de91


In [21]:
# Write the combined train/val/test frame to CSV without the index column.
# The target directory is created first so a fresh run does not fail with a
# missing-directory error at this final step after all features were computed.
_feature_path = './feature/df_preprocess_andtest.csv'
os.makedirs(os.path.dirname(_feature_path), exist_ok=True)
df.to_csv(_feature_path, index=False, encoding='utf-8')