In [1]:
import lightgbm as lgb  # model library (LightGBM)
import pandas as pd  # data processing package
import numpy as np  # data processing package
from sklearn import metrics  # evaluation metrics (original note: confusion matrix)
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split  # stratified k-fold cross-validation, grid search for the best parameters, data splitting
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix  # accuracy, ROC curve, AUC, confusion matrix
import matplotlib.pyplot as plt  # plotting package
import itertools  # original note: used when handling the confusion matrix
import gc  # garbage-collector interface (original note: for clearing cached memory)
import warnings  # used to silence ordinary warnings so less is printed
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif']=['SimHei']  # use the SimHei font so figures can display Chinese text
plt.rcParams['axes.unicode_minus']=False  # draw minus signs with an ASCII hyphen rather than the Unicode minus glyph

In [2]:
data = pd.read_csv('data/train_set.csv')  # read the training data
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,user_id,sku_id,cate,action_before_3_1.0_x,action_before_3_2.0_x,action_before_3_3.0_x,action_before_3_4.0_x,action_before_3_5.0_x,action_before_3_6.0_x,action_before_3_1.0_y,...,cate_action_5_mean,cate_action_6_mean,has_bad_comment,bad_comment_rate,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4,label
0,202633.0,12564.0,8.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,...,20.866667,5167.6,1.0,0.026,0.0,0.0,0.0,0.0,1.0,1.0
1,218498.0,149854.0,8.0,4.0,0.0,0.0,0.0,0.0,4.0,2.0,...,20.866667,5167.6,1.0,0.0403,0.0,0.0,0.0,0.0,1.0,1.0
2,221842.0,75877.0,8.0,3.0,0.0,0.0,0.0,0.0,5.0,79.0,...,20.866667,5167.6,1.0,0.0245,0.0,0.0,0.0,0.0,1.0,1.0
3,222886.0,154636.0,8.0,20.0,1.0,0.0,0.0,0.0,26.0,10.0,...,20.866667,5167.6,1.0,0.0208,0.0,0.0,0.0,0.0,1.0,1.0
4,235240.0,38222.0,8.0,30.0,1.0,0.0,0.0,0.0,28.0,55.0,...,20.866667,5167.6,1.0,0.0166,0.0,0.0,0.0,0.0,1.0,1.0


In [3]:
# Split the training frame column-wise into target and features.
# train_y keeps only the 'label' column (as a one-column DataFrame);
# train_x keeps every other column.
train_y = data.loc[:, data.columns.isin(['label'])]
train_x = data.loc[:, ~data.columns.isin(['label'])]
train_x.head()

Unnamed: 0,user_id,sku_id,cate,action_before_3_1.0_x,action_before_3_2.0_x,action_before_3_3.0_x,action_before_3_4.0_x,action_before_3_5.0_x,action_before_3_6.0_x,action_before_3_1.0_y,...,cate_action_4_mean,cate_action_5_mean,cate_action_6_mean,has_bad_comment,bad_comment_rate,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4
0,202633.0,12564.0,8.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,...,8.4,20.866667,5167.6,1.0,0.026,0.0,0.0,0.0,0.0,1.0
1,218498.0,149854.0,8.0,4.0,0.0,0.0,0.0,0.0,4.0,2.0,...,8.4,20.866667,5167.6,1.0,0.0403,0.0,0.0,0.0,0.0,1.0
2,221842.0,75877.0,8.0,3.0,0.0,0.0,0.0,0.0,5.0,79.0,...,8.4,20.866667,5167.6,1.0,0.0245,0.0,0.0,0.0,0.0,1.0
3,222886.0,154636.0,8.0,20.0,1.0,0.0,0.0,0.0,26.0,10.0,...,8.4,20.866667,5167.6,1.0,0.0208,0.0,0.0,0.0,0.0,1.0
4,235240.0,38222.0,8.0,30.0,1.0,0.0,0.0,0.0,28.0,55.0,...,8.4,20.866667,5167.6,1.0,0.0166,0.0,0.0,0.0,0.0,1.0


In [4]:
# Preview the first rows of the label frame.
train_y.head()

Unnamed: 0,label
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [5]:
# Remove the user_id and sku_id columns from the feature frame.
# drop(..., errors='ignore') with reassignment replaces the original `del`
# statements so that the cell is idempotent: running it a second time (when the
# columns are already gone) no longer raises KeyError.
train_x = train_x.drop(columns=['user_id', 'sku_id'], errors='ignore')

train_x.head()

Unnamed: 0,cate,action_before_3_1.0_x,action_before_3_2.0_x,action_before_3_3.0_x,action_before_3_4.0_x,action_before_3_5.0_x,action_before_3_6.0_x,action_before_3_1.0_y,action_before_3_2.0_y,action_before_3_3.0_y,...,cate_action_4_mean,cate_action_5_mean,cate_action_6_mean,has_bad_comment,bad_comment_rate,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4
0,8.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,...,8.4,20.866667,5167.6,1.0,0.026,0.0,0.0,0.0,0.0,1.0
1,8.0,4.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,...,8.4,20.866667,5167.6,1.0,0.0403,0.0,0.0,0.0,0.0,1.0
2,8.0,3.0,0.0,0.0,0.0,0.0,5.0,79.0,0.0,0.0,...,8.4,20.866667,5167.6,1.0,0.0245,0.0,0.0,0.0,0.0,1.0
3,8.0,20.0,1.0,0.0,0.0,0.0,26.0,10.0,0.0,0.0,...,8.4,20.866667,5167.6,1.0,0.0208,0.0,0.0,0.0,0.0,1.0
4,8.0,30.0,1.0,0.0,0.0,0.0,28.0,55.0,0.0,0.0,...,8.4,20.866667,5167.6,1.0,0.0166,0.0,0.0,0.0,0.0,1.0


In [7]:
# Dimensions (rows, columns) of the feature frame.
train_x.shape

(14619, 234)

In [8]:
# NOTE(review): this rebinds `data`, which previously held the training frame;
# after this cell the raw training frame is only reachable through train_x / train_y.
data = pd.read_csv('data/val_set.csv')  # read the validation data
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,user_id,sku_id,cate,action_before_3_1.0_x,action_before_3_2.0_x,action_before_3_3.0_x,action_before_3_4.0_x,action_before_3_5.0_x,action_before_3_6.0_x,action_before_3_1.0_y,...,cate_action_4_mean,cate_action_5_mean,cate_action_6_mean,has_bad_comment,bad_comment_rate,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4
0,200005.0,67444.0,4.0,2.0,0.0,0.0,0.0,0.0,3.0,26.0,...,73.4,169.366667,48251.0,1.0,0.0821,0.0,0.0,0.0,0.0,1.0
1,200005.0,72967.0,4.0,26.0,1.0,0.0,1.0,0.0,30.0,2.0,...,73.4,169.366667,48251.0,1.0,0.0196,0.0,0.0,0.0,0.0,1.0
2,200007.0,26229.0,9.0,2.0,0.0,0.0,0.0,0.0,2.0,12.0,...,20.766667,56.7,12937.7,1.0,0.0198,0.0,0.0,0.0,0.0,1.0
3,200007.0,63315.0,9.0,4.0,0.0,0.0,0.0,0.0,3.0,10.0,...,20.766667,56.7,12937.7,1.0,0.0476,0.0,0.0,0.0,0.0,1.0
4,200007.0,126404.0,9.0,4.0,0.0,0.0,0.0,0.0,3.0,10.0,...,20.766667,56.7,12937.7,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Split the validation frame column-wise into target and features.
# val_y keeps only the 'label' column (as a DataFrame); val_x keeps every other column.
# NOTE(review): the val_set.csv preview shown earlier in this notebook ends at
# comment_num_4 with no visible 'label' column; if the file has none, val_y will
# be a frame with zero columns — confirm before using it for evaluation.
val_y = data.loc[:, data.columns.isin(['label'])]
val_x = data.loc[:, ~data.columns.isin(['label'])]
val_x.head()