In [1]:
#数据预处理
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelEncoder
from dummyPy import OneHotEncoder
import random
import pickle  # 存储临时变量

## 读文件
#file_path = '/home/admin/avazu/'
fp_train = 'train.csv'
fp_test  = 'test.csv'

## 下采样写文件
fp_sub_train_f = 'sub_train_f.csv'
fp_col_counts = 'col_counts'

## data after selecting features (LR_fun needed)
## and setting rare categories' value to 'other' (feature filtering)
fp_train_f = 'train_f.csv'
fp_test_f  = 'test_f.csv'

## 存储标签编码和one-hot编码
fp_lb_enc = 'lb_enc'
fp_oh_enc = 'oh_enc'

##==================== 数据预处理 ====================##
## 特征选择
cols = ['C1', 
        'banner_pos', 
        'site_id',
        'site_category',
        'app_id',
        'app_category', 
        'device_id',
        'device_model',
        'device_type', 
        'device_conn_type',
        'C14', 
        'C15',
        'C16',
        'C17', 
        'C18',
        'C19',
        'C20']

cols_train = ['id', 'click']
cols_test  = ['id']
cols_train.extend(cols)
cols_test.extend(cols)

## 数据加载
print('loading data...')
df_train_ini = pd.read_csv(fp_train, nrows = 10)
df_train_org = pd.read_csv(fp_train, chunksize = 1000000, iterator = True)
df_test_org  = pd.read_csv(fp_test,  chunksize = 1000000, iterator = True)

#----- 统计分类变量 数值个数 -----#
## 初始化
cols_counts = {}  # 统计每个特征的分类数量
for col in cols:
    cols_counts[col] = df_train_ini[col].value_counts()

## 统计训练集
for chunk in df_train_org:
    for col in cols:
        cols_counts[col] = cols_counts[col].append(chunk[col].value_counts())

## 统计测试集
for chunk in df_test_org:
    for col in cols:
        cols_counts[col] = cols_counts[col].append(chunk[col].value_counts())
        
## 统计
for col in cols:
    cols_counts[col] = cols_counts[col].groupby(cols_counts[col].index).sum()
    # sort the counts
    cols_counts[col] = cols_counts[col].sort_values(ascending=False)   

## 存储value_counting
pickle.dump(cols_counts, open(fp_col_counts, 'wb'))
'''
## 绘制分布
fig = plt.figure(1)
for i, col in enumerate(cols):
    ax = fig.add_subplot(4, 3, i+1)
    ax.fill_between(np.arange(len(cols_counts[col])), cols_counts[col].get_values())
    # ax.set_title(col)
plt.show()
'''
## 只保存前K个分类变量
k = 99
col_index = {}
for col in cols:
    col_index[col] = cols_counts[col][0: k].index

df_train_org = pd.read_csv(fp_train, dtype = {'id': str}, chunksize = 1000000, iterator = True)
df_test_org  = pd.read_csv(fp_test,  dtype = {'id': str}, chunksize = 1000000, iterator = True)

## 训练集
hd_flag = True  # add column names at 1-st row
for chunk in df_train_org:
    df = chunk.copy()
    for col in cols:
        df[col] = df[col].astype('object')
        # assign all the rare variables as 'other'
        df.loc[~df[col].isin(col_index[col]), col] = 'other'
    with open(fp_train_f, 'a') as f:
        df.to_csv(f, columns = cols_train, header = hd_flag, index = False)
    hd_flag = False

## 测试集
hd_flag = True  # 第一个chunk需要有header
for chunk in df_test_org:
    df = chunk.copy()
    for col in cols:
        df[col] = df[col].astype('object')
        # 设置其他不常用变量为other
        df.loc[~df[col].isin(col_index[col]), col] = 'other'
    with open(fp_test_f, 'a') as f:
        df.to_csv(f, columns = cols_test, header = hd_flag, index = False)      
    hd_flag = False    

## 对分类变量进行标签编码
lb_enc = {}
for col in cols:
    col_index[col] = np.append(col_index[col], 'other')

for col in cols:
    lb_enc[col] = LabelEncoder()
    lb_enc[col].fit(col_index[col])
    
## 存储标签编码
pickle.dump(lb_enc, open(fp_lb_enc, 'wb'))

## one-hot编码
oh_enc = OneHotEncoder(cols)

df_train_f = pd.read_csv(fp_train_f, index_col=None, chunksize=500000, iterator=True)
df_test_f  = pd.read_csv(fp_test_f, index_col=None, chunksize=500000, iterator=True)

for chunk in df_train_f:
    oh_enc.fit(chunk)
for chunk in df_test_f:
    oh_enc.fit(chunk)
    
## 存储one-hot编码
pickle.dump(oh_enc, open(fp_oh_enc, 'wb'))


# 计算总训练样本 约46M
n = sum(1 for line in open(fp_train_f)) - 1 
# 保存下采样训练样本 2M
s = 10000000

## 设置哪些行不需要读 skip，不需要读的行数为n-s
skip = sorted(random.sample(range(1, n+1), n-s)) 
df_train = pd.read_csv(fp_train_f, skiprows = skip)
df_train.columns = cols_train

## 存储下采样的结果
df_train.to_csv(fp_sub_train_f, index=False) 



loading data...


In [2]:
# 使用xDeepFM 模型对Avazu CTR进行预估
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from deepctr.models import xDeepFM
from deepctr.inputs import SparseFeat,get_feature_names
import pickle


##==================== 设置文件路径File-Path (fp) ====================##
#file_path = '/home/admin/avazu/'
fp_train_f = "sub_train_f.csv" #使用小样本进行训练

##==================== xDeepFM 训练 ====================##
data = pd.read_csv(fp_train_f, dtype={'id':str}, index_col=None)
print('data loaded')

#数据加载
sparse_features = ['C1', 
        'banner_pos', 
        'site_id',
        'site_category',
        'app_id',
        'app_category', 
        'device_id',
        'device_model',
        'device_type', 
        'device_conn_type',
        'C14', 
        'C15',
        'C16',
        'C17', 
        'C18',
        'C19',
        'C20']
target = ['click']

# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
#print(fixlen_feature_columns)
#print(feature_names)

# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}


model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )


history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=True, validation_split=0.2, )
# 使用xDeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

# 输出LogLoss
score = log_loss(test[target].values, pred_ans)
print("LogLoss", score)

data loaded
Train on 6400000 samples, validate on 1600000 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test RMSE 0.3535533905932738
LogLoss 0.39876091923294543


In [3]:
import xgboost as xgb

param = {'boosting_type':'gbdt',
                         'objective' : 'binary:logistic', 
                         'eval_metric' : 'logloss',
                         'eta' : 0.01,
                         'max_depth' : 15,
                         'colsample_bytree':0.8,
                         'subsample': 0.9,
                         'subsample_freq': 8,
                         'alpha': 0.6,
                         'lambda': 0,
        }

X_train, X_valid, y_train, y_valid = train_test_split(train.drop(['id','click'],axis=1), train['click'], test_size=0.2)

train_data = xgb.DMatrix(X_train, label=y_train)
valid_data = xgb.DMatrix(X_valid, label=y_valid)
test_data = xgb.DMatrix(test.drop(['id','click'],axis=1))

model_xgb = xgb.train(param, train_data, evals=[(train_data, 'train'), (valid_data, 'valid')], num_boost_round = 10000, early_stopping_rounds=200, verbose_eval=25)
predict = model_xgb.predict(test_data)

# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, predict), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

# 输出LogLoss
score = log_loss(test[target].values, predict)
print("LogLoss", score)

Parameters: { boosting_type, subsample_freq } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-logloss:0.68875	valid-logloss:0.68846
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 200 rounds.
[25]	train-logloss:0.59132	valid-logloss:0.59149
[50]	train-logloss:0.52880	valid-logloss:0.52930
[75]	train-logloss:0.48730	valid-logloss:0.48796
[100]	train-logloss:0.45914	valid-logloss:0.46009
[125]	train-logloss:0.43989	valid-logloss:0.44099
[150]	train-logloss:0.42642	valid-logloss:0.42784
[175]	train-logloss:0.41707	valid-logloss:0.41874
[200]	train-logloss:0.41048	valid-logloss:0.41242
[225]	train-logloss:0.40602	valid-logloss:0.40812
[250]	train-logloss:0.40285	valid-logloss:0.4050

In [4]:
test_sub = pd.read_csv('test_f.csv', dtype={'id':str})
for feature in sparse_features:
    lbe = LabelEncoder()
    test_sub[feature] = lbe.fit_transform(test_sub[feature])

predict_data = xgb.DMatrix(test_sub.drop(['id'],axis=1))
predict_xgb = model_xgb.predict(predict_data)

submission = pd.read_csv('sampleSubmission.csv', dtype={'id':str})
submission['click_xgb'] = predict_xgb

predict_model_input = {name:test_sub[name].values for name in feature_names}
predict_xdeepfm = model.predict(predict_model_input, batch_size=256)
submission['click_xdeepfm'] = predict_xdeepfm

submission['click'] = (submission['click_xdeepfm']+submission['click_xgb'])/2

submission = submission.drop(['click_xdeepfm','click_xgb'],axis =1)

submission.to_csv('submission.csv',index = 0)