In [9]:
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss, roc_auc_score

import gc 
from tqdm.autonotebook import tqdm 
from scipy import sparse 

import warnings
warnings.filterwarnings("ignore")

# 读取数据

In [3]:
tmp = pd.read_csv("../data/criteo/train.csv")

In [4]:
tmp.head()

Unnamed: 0,Id,Label,I1,I2,I3,I4,I5,I6,I7,I8,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,10000743,1,1.0,0,1.0,,227.0,1.0,173.0,18.0,...,3486227d,e88ffc9d,c393dc22,b1252a9d,57c90cd9,,bcdee96c,4d19a3eb,cb079c2d,456c12a0
1,10000159,1,4.0,1,1.0,2.0,27.0,2.0,4.0,2.0,...,07c540c4,92555263,,,242bb710,,3a171ecb,72c78f11,,
2,10001166,1,0.0,806,,,1752.0,142.0,2.0,0.0,...,07c540c4,25c88e42,21ddcdc9,b1252a9d,a0136dd2,,32c7478e,8fc66e78,001f3601,f37f3967
3,10000318,0,2.0,-1,42.0,14.0,302.0,38.0,25.0,38.0,...,e5ba7672,5aed7436,21ddcdc9,b1252a9d,c3abeb21,,423fab69,1793a828,e8b83407,5cef228f
4,10000924,1,0.0,57,2.0,1.0,2891.0,2.0,35.0,1.0,...,e5ba7672,642f2610,1d1eb838,b1252a9d,1640d50b,ad3062eb,423fab69,45ab94c8,2bf691b1,c84c4aec


In [5]:
tmp.shape

(1599, 41)

In [7]:
## 查看缺失值情况
pd.DataFrame(tmp.isna().sum())

Unnamed: 0,0
Id,0
Label,0
I1,695
I2,0
I3,357
I4,334
I5,74
I6,392
I7,77
I8,2


In [8]:
# 数据预处理
def preProcess():
    path = "../data/criteo/"
    print("读取数据")
    df_train = pd.read_csv(path + "train.csv")
    df_test = pd.read_csv(path + "test.csv")
    print("读取结束")
    df_train.drop(["Id"], axis=1, inplace=True)
    df_test.drop(["Id"], axis=1, inplace=True)
    
    df_test["Label"] = -1 
    
    data = pd.concat([df_train, df_test], axis=0, sort=False, ignore_index=True)
    data = data.fillna(-1)
    data.to_csv(path+"data.csv", index=False)
    return data 

# 使用LR进行预测

In [42]:
def lr_predict(df, category_fe, continuous_fe):
    data = df.copy()
    
    # 对连续型特征进行归一化
    print("开始归一化...")
    scaler= MinMaxScaler()
    for col in continuous_fe:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    print("归一化结束")
    
    # 离散型特征 one-hot编码
    print("开始one-hot...")
    for col in category_fe:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print("one-hot编码结束")
    
    
    ## 训练集、测试集分离
    train = data[data['Label'] != -1]
    target = train.pop("Label")
    test = data[data['Label'] == -1]
    test.drop(["Label"], axis=1, inplace=True)
    
    # 划分数据集
    print("划分数据集...")
    X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2019)
    print("开始训练...")
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(X_train)[:, 1])
    tr_auc_score = roc_auc_score(y_train, lr.predict_proba(X_train)[:, 1])
    print("tr_logloss：", tr_logloss, "\ntr_AUC：", tr_auc_score)
    val_logloss = log_loss(y_val, lr.predict_proba(X_val)[:, 1])
    val_auc_score = roc_auc_score(y_val, lr.predict_proba(X_val)[:, 1])
    print("val_logloss：", val_logloss, "\nval_AUC：", val_auc_score)
    '''
    print("开始预测...")
    y_pred = lr.predict_proba(test)[:, 1]
    print("写入结果...")
    res = pd.read_csv("../data/criteo/test.csv")
    submission = pd.DataFrame({"Id": res["Id"], "Label": y_pred})
    submission.to_csv(f"submisson_lr_trauc_{tr_auc_score}_valauc_{val_auc_score}.csv", index=False)
    '''

# 使用GBDT预测

In [51]:
def gbdt_predict(df, category_fe, continuous_fe):
    data = df.copy()
    # 树模型对缺失值和连续值是否归一化不敏感
    ## 对离散值进行one-hot
    print("开始one-hot...")
    for col in category_fe:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print("one-hot结束")
    
    train = data[data['Label'] != -1]
    target = train.pop("Label")
    test = data[data["Label"] == -1]
    test.drop(["Label"], axis=1, inplace=True)
    
    # 划分数据集
    print("划分数据集...")
    X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2019)
    
    print("开始训练...")
    gbm = lgb.LGBMClassifier(objective="binary",
                            subsample=0.8, 
                            min_child_weight=0.5, colsample_bytree=0.7,
                            num_leaves=100, max_depth=12,
                            learning_rate=0.01, 
                            n_estimators=10000,)
    
    gbm.fit(X_train, y_train, 
           eval_set=[(X_train, y_train), (X_val, y_val)],
           eval_names=["train", "val"],
           eval_metric="binary_logloss",
           early_stopping_rounds=100,
           verbose=100)
    print("训练集预测")
    tr_pred = gbm.predict_proba(X_train)[:, 1]
    tr_logloss = log_loss(y_train, tr_pred)
    tr_auc = roc_auc_score(y_train, tr_pred)
    print("tr_logloss: ", tr_logloss, "\ntr_AUC: ", tr_auc)
    print("验证集预测")
    val_pred = gbm.predict_proba(X_val)[:, 1]
    val_logloss = log_loss(y_val, val_pred)
    val_auc = roc_auc_score(y_val, val_pred)
    print("val_logloss: ", val_logloss, "\nval_AUC: ", val_auc)

# 使用gbdt+LR

In [63]:
def gbdt_lr_predict(df, category_fe, continuous_fe):
    data = df.copy()
    # 离散特征one-hot编码
    print("开始onbe-hot...")
    for col in category_fe:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print("one-hot编码结束")
    
    train = data[data["Label"] != -1]
    target = train.pop("Label")
    test = data[data["Label"] == -1]
    test.drop(["Label"], axis=1, inplace=True)
    
    # 划分数据集
    print("划分数据集")
    X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state = 2018)
    
    print("开始训练GBDT...")
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    
    lgb_params = {
        "boosting_type": "gbdt",
        "objective": "regression_l1",
        "metric": "binary_logloss",
        
        "subsample": 0.8, 
        "min_child_weight": 0.5, 
        "colsample_bytree": 0.7, 
        "num_leaves": 100, 
        "max_depth": 12, 
        "learning_rate": 0.05,
        "seed": 2019, 
        
        "device": "gpu",
        "gpu_platform_id": 1,
        "gpu_device_id": 0
    }
    
    
    model = lgb.train(lgb_params, lgb_train,
                      num_boost_round=10, 
                     valid_sets=[lgb_train, lgb_val],
                     valid_names=["train", "val"])
    
    # 得到训练得到的叶子数
    print("训练得到的叶子数")
    ## 设置pred_leaf=True，表示返回每个样本在每棵树上的叶子的ID
    gbdt_feats_train = model.predict(train, pred_leaf=True)
    gbdt_feats_test = model.predict(test, pred_leaf=True)
    ## 将得到的叶子节点的值作为特征组合
    gbdt_feats_name = [f"gbdt_left_{i}" for i in range(gbdt_feats_train.shape[1])]
    ## 生成新的组合特征
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)
    
    print("构造新的数据集...")
    train = pd.concat([train, df_train_gbdt_feats], axis=1)
    test = pd.concat([test, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, test])
    del train, test 
    gc.collect()
    
    # 对叶子特征就行one-hot
    print("开始One-hot...")
    for col in gbdt_feats_name:
        print("This is feature: ", col)
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print("one-hot结束")
    
    train = data[:train_len]
    test = data[train_len:]
    del data
    gc.collect()
    
    X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2019)
    # lr
    print("开始训练LR...")
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(X_train)[:, 1])
    tr_auc = roc_auc_score(y_train, lr.predict_proba(X_train)[:, 1])
    print("tr_logloss: ", tr_logloss, "\ntr_AUC: ", tr_auc)
    val_logloss = log_loss(y_val, lr.predict_proba(X_val)[:, 1])
    val_auc = roc_auc_score(y_val, lr.predict_proba(X_val)[:, 1])
    print("val_logloss: ", val_logloss, "\nval_AUC: ", val_auc)

# 对不同的模型训练得到结果

In [53]:
data = preProcess()

读取数据
读取结束


In [54]:
# 划分连续特征和类别特征
continuous_feature = [col for col in data.columns if col.startswith("I")]
category_feature = [col for col in data.columns if col.startswith("C")]

## lr模型训练和预测

In [55]:
lr_predict(data, category_fe=category_feature, continuous_fe=continuous_feature)

开始归一化...
归一化结束
开始one-hot...
one-hot编码结束
划分数据集...
开始训练...
tr_logloss： 0.12300658751702918 
tr_AUC： 0.9999506487787471
val_logloss： 0.5298239203825984 
val_AUC： 0.7139775668679896


## gbdt预测

In [57]:
gbdt_predict(data, category_feature, continuous_feature)

开始one-hot...
one-hot结束
划分数据集...
开始训练...
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.360697	train's binary_logloss: 0.360697	val's binary_logloss: 0.518256	val's binary_logloss: 0.518256
[200]	train's binary_logloss: 0.29057	train's binary_logloss: 0.29057	val's binary_logloss: 0.520195	val's binary_logloss: 0.520195
Early stopping, best iteration is:
[133]	train's binary_logloss: 0.332547	train's binary_logloss: 0.332547	val's binary_logloss: 0.517428	val's binary_logloss: 0.517428
训练集预测
tr_logloss:  0.33254691862577923 
tr_AUC:  0.9785094412682503
验证集预测
val_logloss:  0.517428421341816 
val_AUC:  0.6736410698878342


## gbdt-lr预测

In [64]:
gbdt_lr_predict(data, category_feature, continuous_feature)

开始onbe-hot...
one-hot编码结束
划分数据集
开始训练GBDT...
[1]	train's binary_logloss: 4.75227	val's binary_logloss: 6.86988
[2]	train's binary_logloss: 4.22737	val's binary_logloss: 6.18712
[3]	train's binary_logloss: 3.47397	val's binary_logloss: 5.58418
[4]	train's binary_logloss: 2.94258	val's binary_logloss: 5.37
[5]	train's binary_logloss: 2.38065	val's binary_logloss: 4.87179
[6]	train's binary_logloss: 2.26809	val's binary_logloss: 4.86803
[7]	train's binary_logloss: 2.13636	val's binary_logloss: 4.86333
[8]	train's binary_logloss: 2.1016	val's binary_logloss: 4.86398
[9]	train's binary_logloss: 1.91367	val's binary_logloss: 4.31777
[10]	train's binary_logloss: 1.68647	val's binary_logloss: 4.12217
训练得到的叶子数
(1599, 10)
[ 2 32 15  1 16  1  2  2  2 29]
构造新的数据集...
开始One-hot...
This is feature:  gbdt_left_0
This is feature:  gbdt_left_1
This is feature:  gbdt_left_2
This is feature:  gbdt_left_3
This is feature:  gbdt_left_4
This is feature:  gbdt_left_5
This is feature:  gbdt_left_6
This is featu