In [1]:
import random
import scipy.special as special
import math
from math import log
import pandas as pd
import gc
import datetime
from sklearn import preprocessing
import numpy as np
import os

In [2]:
from tqdm import tqdm

In [3]:
class HyperParam(object):
    '''Alpha/beta parameters of a Beta distribution, estimated from count data.

    The parameters can be fitted from parallel sequences of ``tries`` and
    ``success`` counts either by fixed-point iteration
    (``update_from_data_by_FPI``) or by the method of moments
    (``update_from_data_by_moment``). Both estimators overwrite
    ``self.alpha`` and ``self.beta`` in place and return nothing.
    '''

    def __init__(self, alpha, beta):
        '''Store the initial alpha and beta values.'''
        self.alpha = alpha
        self.beta = beta

    def sample_from_beta(self, alpha, beta, num, imp_upperbound):
        '''Generate ``num`` synthetic (impressions, clicks) pairs.

        A click ratio is drawn from Beta(alpha, beta) for each pair, the
        impression count is drawn uniformly from [0, imp_upperbound), and
        clicks = impressions * ratio.

        Returns:
            (I, C): two lists of length ``num`` holding impressions and clicks.
        '''
        # The module is imported as ``np``; the bare name ``numpy`` is not defined.
        sample = np.random.beta(alpha, beta, num)
        I = []
        C = []
        for click_ratio in sample:
            imp = random.random() * imp_upperbound
            #imp = imp_upperbound
            click = imp * click_ratio
            I.append(imp)
            C.append(click)
        return I, C

    def update_from_data_by_FPI(self, tries, success, iter_num, epsilon):
        '''Estimate alpha, beta using fixed point iteration.

        Runs at most ``iter_num`` iterations and stops early once both
        parameters change by less than ``epsilon``. On early stop the values
        from the previous iteration are kept.
        '''
        for _ in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta)
            if abs(new_alpha-self.alpha)<epsilon and abs(new_beta-self.beta)<epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, tries, success, alpha, beta):
        '''Perform one fixed-point update and return (new_alpha, new_beta).

        ``tries`` and ``success`` are read positionally as numeric arrays.
        The three digamma sums are computed over whole arrays instead of a
        Python-level loop over every observation.
        '''
        tries = np.asarray(tries, dtype=float)
        success = np.asarray(success, dtype=float)

        # Numerators for alpha / beta and the shared denominator.
        sumfenzialpha = (special.digamma(success + alpha) - special.digamma(alpha)).sum()
        sumfenzibeta = (special.digamma(tries - success + beta) - special.digamma(beta)).sum()
        sumfenmu = (special.digamma(tries + alpha + beta) - special.digamma(alpha + beta)).sum()

        return alpha*(sumfenzialpha/sumfenmu), beta*(sumfenzibeta/sumfenmu)

    def update_from_data_by_moment(self, tries, success):
        '''Estimate alpha, beta using moment estimation.

        The 0.000001 offsets keep the expressions finite when the mean is
        exactly 0 or 1 or the variance is 0.
        '''
        mean, var = self.__compute_moment(tries, success)
        #print 'mean and variance: ', mean, var
        #self.alpha = mean*(mean*(1-mean)/(var+0.000001)-1)
        self.alpha = (mean+0.000001) * ((mean+0.000001) * (1.000001 - mean) / (var+0.000001) - 1)
        #self.beta = (1-mean)*(mean*(1-mean)/(var+0.000001)-1)
        self.beta = (1.000001 - mean) * ((mean+0.000001) * (1.000001 - mean) / (var+0.000001) - 1)

    def __compute_moment(self, tries, success):
        '''Return (mean, sample variance) of the per-observation success/tries ratios.

        Raises ZeroDivisionError when any ``tries`` entry is 0 or when fewer
        than two observations are supplied (the variance divides by n - 1).
        '''
        ctr_list = [float(s)/t for s, t in zip(success, tries)]
        mean = sum(ctr_list)/len(ctr_list)
        var = 0.0
        for ctr in ctr_list:
            var += pow(ctr-mean, 2)

        return mean, var/(len(ctr_list)-1)

# reading



In [4]:
%%time
# Load the raw train/test samples and the ad / user feature tables, then
# build one combined frame (train rows first, test rows after).
df_train = pd.read_csv('../../tencent_ad/CJY/r2/input/train.csv')
# df_test = pd.read_csv('../T_r2/data/ori_data/test1.csv')
df_test = pd.read_csv('../../tencent_ad/CJY/r2/input/test2.csv')
df_user = pd.read_csv('../../tencent_ad/CJY/r2/input/userFeature.csv')
df_ad = pd.read_csv('../../tencent_ad/CJY/r2/input/adFeature.csv')
# df_test1['label'] = -1
# Test rows get the sentinel label -2.
df_test['label'] = -2
# df_test = pd.concat((df_test1,df_test2))
# Training labels equal to -1 are recoded to 0; other values are kept.
df_train['label'] = df_train['label'].map(lambda x:0 if x == -1 else x)
df_all = pd.concat([df_train,df_test],axis=0,copy=False).reset_index(drop=True)
# Any label still missing after the concat becomes -1.
df_all.label = df_all.label.fillna(-1)
# Left-join ad attributes on aid and user attributes on uid.
df_all = df_all.merge(df_ad,on='aid',how='left',copy=False)
df_all = df_all.merge(df_user,on='uid',how='left',copy=False)

CPU times: user 14min 7s, sys: 6min 57s, total: 21min 4s
Wall time: 23min 50s


In [5]:
# Only the last expression of a cell is echoed, so the column index is shown
# and the shape on the first line is evaluated but not displayed.
df_all.shape
df_all.columns

Index(['aid', 'uid', 'label', 'advertiserId', 'campaignId', 'creativeId',
       'creativeSize', 'adCategoryId', 'productId', 'productType', 'age',
       'gender', 'marriageStatus', 'education', 'consumptionAbility', 'LBS',
       'interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1',
       'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdInstall',
       'appIdAction', 'ct', 'os', 'carrier', 'house'],
      dtype='object')

In [6]:
# Replace every remaining missing value with the string placeholder '-1'.
df_all = df_all.fillna('-1')

In [7]:
# house becomes a 0/1 flag: 0 where it holds the '-1' placeholder, 1 otherwise.
df_all['house'] = df_all['house'].ne('-1').astype('int64')
# LBS is cast to integer.
df_all['LBS'] = df_all['LBS'].astype('int')

In [8]:
# Columns whose cells hold ';'-separated value lists; each is expanded into 0/1 indicator columns below.
multihots = ['interest1','interest2','interest3','interest4','interest5','os','ct','marriageStatus']

In [9]:
# Expand every multi-valued column into one int8 indicator column per distinct
# token, named '<feature>_<index>' with indices following the sorted token order.
for feature in tqdm(multihots):
    gc.collect()
    # First pass: collect the vocabulary of ';'-separated tokens.
    unique_set = set()
    for r in df_all[feature]:
        unique_set.update(r.split(';'))
#     if feature == 'marriageStatus':
#         unique_set.remove('8')# pitfall!
    unique_list = sorted(unique_set)
    unique_dict = {each: idx for idx, each in enumerate(unique_list)}
    # Second pass: fill a preallocated int8 matrix directly rather than
    # building one float64 row per sample and stacking a Python list of them.
    multihot = np.zeros((len(df_all), len(unique_list)), dtype='int8')
    for row_idx, r in enumerate(df_all[feature]):
        for w in r.split(';'):
            # Every token is in unique_dict because the vocabulary came from this same column.
            multihot[row_idx, unique_dict[w]] = 1
    gc.collect()
    for c in range(multihot.shape[1]):
        df_all[feature+'_%d'%c] = multihot[:,c]

100%|██████████| 8/8 [1:01:50<00:00, 463.76s/it]


In [12]:
# 这里是用来造转化率的特征列，后面加的是新造的multihots列
ctr_cols = ['aid', 'advertiserId', 'campaignId', 'creativeId',
       'creativeSize', 'adCategoryId', 'productId', 'productType', 'age',
       'gender', 'education', 'consumptionAbility', 'LBS','carrier', 'house'] + df_all.columns.tolist()[33:]

In [13]:
# merge 5折划分
# Attach the 5-fold split assignment by left-joining on (aid, uid).
df_all = df_all.merge(pd.read_csv('kflod5_auid_r2(10).csv'),on=['aid','uid'],how='left',copy=False)

In [14]:
# 测试集标记其kflod=-1
# Rows without a fold assignment (the test set) are marked with kflod = -1.
df_all['kflod'] = df_all['kflod'].fillna(-1)

In [15]:
# Sanity check of (rows, columns) after the multi-hot expansion and fold merge.
df_all.shape

(57267004, 421)

In [16]:
# Persist the combined frame (without the index) so later cells can re-read it in pieces.
df_all.to_csv('df_all_beforebayes_b.csv', index=None)

# bayes

In [4]:
# Read a single data row only: from here on df_all serves as a source of column names.
df_all = pd.read_csv('df_all_beforebayes_b.csv', nrows=1)

df_all.shape

(1, 421)

In [5]:
# 这里是用来造转化率的特征列，后面加的是新造的multihots列
ctr_cols = ['aid', 'advertiserId', 'campaignId', 'creativeId',
       'creativeSize', 'adCategoryId', 'productId', 'productType', 'age',
       'gender', 'education', 'consumptionAbility', 'LBS','carrier', 'house'] + df_all.columns.tolist()[33:]

In [6]:
# The only ad-side key crossed with everything is aid.
ad_feature = ['aid']
# Everything from index 8 of ctr_cols onward ('age' and later, including the appended columns).
user_feature = ctr_cols[8:]

In [7]:
import itertools

In [8]:
# aid - user+multicols交叉
# Pair aid with every user-side column (profile fields plus the appended multi-hot columns).
cross_cols = [(ad_col, user_col) for ad_col in ad_feature for user_col in user_feature]

In [9]:
# 广告 - 用户交叉
# Ad attribute x user profile crosses, placed in front of the crosses already collected.
cross_cols = [
    (ad_col, user_col)
    for ad_col in ['advertiserId', 'campaignId', 'creativeId',
                   'creativeSize', 'adCategoryId', 'productId', 'productType']
    for user_col in ['age', 'gender', 'education', 'consumptionAbility',
                     'LBS', 'carrier', 'house']
] + cross_cols

In [10]:
# 用户 - 用户交叉
# User x user crosses: every ordered pair of two different profile columns.
_profile_cols = ['age', 'gender', 'education', 'consumptionAbility', 'LBS','carrier', 'house']
for w1 in _profile_cols:
    cross_cols.extend((w1, w2) for w2 in _profile_cols if w2 != w1)

In [11]:
# Number of single-column and cross-column rate features that will be looked up.
len(ctr_cols),len(cross_cols)

(403, 486)

## merge

In [12]:
# Hard-coded row counts of the train and test parts of df_all_beforebayes_b.csv
# (they sum to the 57267004 rows reported before the file was written).
train_len = 45539700
test_len = 11727304

In [14]:
# Raw multi-valued text columns that are left out when the CSV is re-read.
feature_col = ['marriageStatus', 'interest1', 'interest2','interest3', 'interest4', 'interest5', 'kw1', 
               'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdInstall','appIdAction', 'ct', 'os']
# Remaining columns to load; built through a set, so the list order is arbitrary.
cols = list(set(df_all.columns) - set(feature_col))

In [18]:
def _join_ctr_table(frame, key_cols):
    '''Left-join the precomputed rate table for ``key_cols`` onto ``frame``.

    The table is read from './ctr/<key1>_<key2>....csv' and matched on the key
    columns plus 'kflod'. Its '<keys>_bayes_rate' column is stored as float16.
    When the file does not exist, ``frame`` is returned unchanged.
    '''
    name = '_'.join(key_cols)
    path = './ctr/%s.csv' % name
    if not os.path.exists(path):
        return frame
    ctr = pd.read_csv(path)
    rate_col = name + '_bayes_rate'
    ctr[rate_col] = ctr[rate_col].astype("float16")
    keys = list(key_cols) + ['kflod']
    joined = frame.join(ctr.set_index(keys), on=keys)
    del ctr
    _ = gc.collect()
    return joined


# The test rows follow the train_len training rows in the CSV and are processed
# in n_chunks consecutive pieces to bound memory use.
n_chunks = 4
tmp_len = test_len // n_chunks

for k in range(1, n_chunks + 1):
    start = (k - 1) * tmp_len
    # The last chunk also takes any remainder so no test row is dropped when
    # test_len is not divisible by n_chunks.
    nrows = tmp_len if k < n_chunks else test_len - start
    # skiprows: header line (+1), all training rows, and the chunks already done.
    # NOTE(review): dtype='int8' is applied to every loaded column, including id
    # columns such as aid/uid that are also used as join keys - confirm their
    # values fit in int8, otherwise the joins below may not match.
    tmp = pd.read_csv('df_all_beforebayes_b.csv', skiprows=start+train_len+1, nrows=nrows, names=list(df_all.columns), dtype='int8', usecols=cols)

    # Single-column rate features.
    for single_col in tqdm(ctr_cols):
        tmp = _join_ctr_table(tmp, [single_col])

    # Cross-column rate features.
    for col_pair in tqdm(cross_cols):
        tmp = _join_ctr_table(tmp, col_pair)

    tmp.to_csv('./output_r2/test_bayes_b_' + str(k), index=None)
    print('finish kflod ' + str(k))
    del tmp
    _ = gc.collect()

100%|██████████| 403/403 [21:17<00:00,  3.17s/it]
100%|██████████| 486/486 [30:57<00:00,  3.82s/it] 


finish kflod 1


100%|██████████| 403/403 [23:23<00:00,  3.48s/it]
100%|██████████| 486/486 [49:17<00:00,  6.09s/it] 


finish kflod 2


100%|██████████| 403/403 [28:24<00:00,  4.23s/it]
100%|██████████| 486/486 [51:30<00:00,  6.36s/it] 


finish kflod 3


100%|██████████| 403/403 [14:48<00:00,  2.21s/it]
100%|██████████| 486/486 [46:46<00:00,  5.77s/it] 


finish kflod 4


In [None]:
def _stack_fold_files(prefix, n_files, columns):
    '''Read ``columns`` from PATH + prefix + '0' .. str(n_files - 1) and stack the rows.

    Each file is converted to a float32 array; the pieces are concatenated
    once at the end instead of re-copying the growing array after every file.
    NOTE(review): PATH is not defined in the visible part of the notebook - it
    must be set before this cell runs.
    '''
    parts = []
    for j in tqdm(range(n_files)):
        part = pd.read_csv(PATH + prefix + str(j), usecols=columns)
        # DataFrame.as_matrix() has been removed from pandas; .values is the equivalent.
        parts.append(part.values.astype(np.float32))
        del part
        _ = gc.collect()
    return np.concatenate(parts)


# Load the feature matrix in 5 column groups of `cnt` columns each (10 train
# files and 4 test files per group), then join the groups side by side.
# NOTE(review): cnt is not defined in the visible part of the notebook.
train_blocks = []
test_blocks = []
for i in range(5):
    block_cols = cols[i*cnt:(i+1)*cnt]
    train_blocks.append(_stack_fold_files('train_kflod', 10, block_cols))
    test_blocks.append(_stack_fold_files('test_kflod', 4, block_cols))
    _ = gc.collect()

train_all = np.concatenate(train_blocks, axis=1)
test_all = np.concatenate(test_blocks, axis=1)
del train_blocks
del test_blocks
_ = gc.collect()

In [None]:
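# Note:
# As shown below, some columns contain null values. I fill them with the column's mode (which generally equals the Bayesian-smoothed result for a conversion rate of 0).
#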

In [23]:
# Show dtype and non-null count for every column (max_cols is raised so the listing is not abbreviated).
df_all.info(null_counts=True,max_cols=9999)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11064803 entries, 0 to 11064802
Data columns (total 817 columns):
aid                              11064803 non-null int64
label                            11064803 non-null float64
uid                              11064803 non-null int64
advertiserId                     11064803 non-null int64
campaignId                       11064803 non-null int64
creativeId                       11064803 non-null int64
creativeSize                     11064803 non-null int64
adCategoryId                     11064803 non-null int64
productId                        11064803 non-null int64
productType                      11064803 non-null int64
age                              11064803 non-null int64
gender                           11064803 non-null int64
marriageStatus                   11064803 non-null object
education                        11064803 non-null int64
consumptionAbility               11064803 non-null int64
LBS                       

In [22]:
# Show dtype and non-null count for every column (max_cols is raised so the listing is not abbreviated).
df_all.info(null_counts=True,max_cols=9999)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11064693 entries, 0 to 11064692
Data columns (total 900 columns):
aid                                           11064693 non-null int64
uid                                           11064693 non-null int64
label                                         11064693 non-null int64
advertiserId                                  11064693 non-null int64
campaignId                                    11064693 non-null int64
creativeId                                    11064693 non-null int64
creativeSize                                  11064693 non-null int64
adCategoryId                                  11064693 non-null int64
productId                                     11064693 non-null int64
productType                                   11064693 non-null int64
age                                           11064693 non-null int64
gender                                        11064693 non-null int64
marriageStatus                              

In [23]:
#保存
usecols = ['aid','uid','label']
for col in df_all.columns:
    if 'bayes_rate' in col:
        usecols.append(col)
df_all.to_csv('bayes_rate.csv',index=False,columns=usecols,float_format='%.8f')

In [24]:
# Drop the reference to the frame now that it has been written to disk.
del df_all

In [37]:
# Show dtype and non-null count for every column of the test frame.
# NOTE(review): df_test2 is not created in the visible part of the notebook.
df_test2.info(null_counts=True,max_cols=9999)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2265879 entries, 8798814 to 11064692
Data columns (total 817 columns):
aid                              2265879 non-null int64
uid                              2265879 non-null int64
label                            2265879 non-null int64
advertiserId                     2265879 non-null int64
campaignId                       2265879 non-null int64
creativeId                       2265879 non-null int64
creativeSize                     2265879 non-null int64
adCategoryId                     2265879 non-null int64
productId                        2265879 non-null int64
productType                      2265879 non-null int64
age                              2265879 non-null int64
gender                           2265879 non-null int64
marriageStatus                   2265879 non-null object
education                        2265879 non-null int64
consumptionAbility               2265879 non-null int64
LBS                              22658

In [36]:
# 保存b榜测试集
usecols = ['aid','uid','label']
for col in df_test2.columns:
    if 'bayes_rate' in col:
        usecols.append(col)
df_test2.to_csv('../features/bayes_rate1_test2.csv',index=False,columns=usecols,float_format='%.8f')

In [38]:
# Drop the reference to the test frame after it has been saved.
del df_test2