# 数据处理

## 导入相应模型

In [2]:
import gc # 垃圾回收
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#导入分析库
from sklearn.model_selection import train_test_split # 数据拆分
from sklearn.model_selection import StratifiedKFold # 同分布数据拆分，交叉验证
# import lightgbm as lgb # 微软
import xgboost as xgb

## 数据加载

In [3]:
%%time
# 加载数据
# 用户行为日志
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
# 用户画像
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
# 训练数据和测试数据
train_data = pd.read_csv('./data_format1/train_format1.csv')
test_data = pd.read_csv('./data_format1/test_format1.csv')

CPU times: user 15.5 s, sys: 3.18 s, total: 18.7 s
Wall time: 18.9 s


## 查看数据

In [4]:
print('---data shape---')     
for data in [user_log, user_info, train_data, test_data]:
    print(data.shape)

---data shape---
(54925330, 7)
(424170, 3)
(260864, 3)
(261477, 3)


In [5]:
print('---data info ---')
for data in [user_log, user_info, train_data, test_data]:
    print(data.info())

---data info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   cat_id       int64  
 3   seller_id    int64  
 4   brand_id     float64
 5   time_stamp   object 
 6   action_type  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 2.9+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int64  
 1   age_range  421953 non-null  float64
 2   gender     417734 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 9.7 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   u

In [6]:
display(user_info.head())

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [7]:
display(train_data.head(),test_data.head())

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,
1,360576,1581,
2,98688,1964,
3,98688,3645,
4,295296,3361,


## 数据集成

In [9]:
train_data['origin'] = 'train'
test_data['origin'] = 'test'
# 集成
all_data = pd.concat([train_data, test_data], ignore_index=True, sort=False)
# prob测试数据中特有的一列
all_data.drop(['prob'], axis=1, inplace=True) # 删除概率这一列
display(all_data.head(),all_data.shape)

Unnamed: 0,user_id,merchant_id,label,origin
0,34176,3906,0.0,train
1,34176,121,0.0,train
2,34176,4356,1.0,train
3,34176,2217,0.0,train
4,230784,4818,0.0,train


(522341, 4)

In [10]:
all_data.tail()

Unnamed: 0,user_id,merchant_id,label,origin
522336,228479,3111,,test
522337,97919,2341,,test
522338,97919,3971,,test
522339,32639,3536,,test
522340,32639,3319,,test


In [11]:
# 连接user_info表，通过user_id关联
all_data = all_data.merge(user_info, on='user_id', how='left')
display(all_data.shape,all_data.head())

(522341, 6)

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender
0,34176,3906,0.0,train,6.0,0.0
1,34176,121,0.0,train,6.0,0.0
2,34176,4356,1.0,train,6.0,0.0
3,34176,2217,0.0,train,6.0,0.0
4,230784,4818,0.0,train,0.0,0.0


In [12]:
# 使用 merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)

In [13]:
del train_data,test_data,user_info
gc.collect()

9363

## 数据类型转换

In [14]:
%%time
display(user_log.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   cat_id       int64  
 3   merchant_id  int64  
 4   brand_id     float64
 5   time_stamp   object 
 6   action_type  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 2.9+ GB


None

CPU times: user 14.8 ms, sys: 3.42 ms, total: 18.2 ms
Wall time: 16.7 ms


In [15]:
%%time
display(user_log.head())

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


CPU times: user 14.2 ms, sys: 1.88 ms, total: 16.1 ms
Wall time: 14.4 ms


In [16]:
%%time
# 用户行为数据类型转换
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
user_log['action_type'] = user_log['action_type'].astype('int32')
display(user_log.info(),user_log.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int32         
 1   item_id      int32         
 2   cat_id       int32         
 3   merchant_id  int32         
 4   brand_id     int32         
 5   time_stamp   datetime64[ns]
 6   action_type  int32         
dtypes: datetime64[ns](1), int32(6)
memory usage: 1.6 GB


None

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661,1900-01-01 08:29:00,0
1,328862,844400,1271,2882,2661,1900-01-01 08:29:00,0
2,328862,575153,1271,2882,2661,1900-01-01 08:29:00,0
3,328862,996875,1271,2882,2661,1900-01-01 08:29:00,0
4,328862,1086186,1271,1253,1049,1900-01-01 08:29:00,0


CPU times: user 6.22 s, sys: 1.68 s, total: 7.91 s
Wall time: 7.91 s


In [17]:
display(all_data.isnull().sum())

user_id             0
merchant_id         0
label          261477
origin              0
age_range        2578
gender           7545
dtype: int64

In [18]:
# 缺失值填充
all_data['age_range'].fillna(0, inplace=True)
all_data['gender'].fillna(2, inplace=True)
all_data.isnull().sum()

user_id             0
merchant_id         0
label          261477
origin              0
age_range           0
gender              0
dtype: int64

In [19]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522341 entries, 0 to 522340
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      522341 non-null  int64  
 1   merchant_id  522341 non-null  int64  
 2   label        260864 non-null  float64
 3   origin       522341 non-null  object 
 4   age_range    522341 non-null  float64
 5   gender       522341 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 27.9+ MB


In [20]:
all_data['age_range'] = all_data['age_range'].astype('int8')
all_data['gender'] = all_data['gender'].astype('int8')
all_data['label'] = all_data['label'].astype('str')
all_data['user_id'] = all_data['user_id'].astype('int32')
all_data['merchant_id'] = all_data['merchant_id'].astype('int32')
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522341 entries, 0 to 522340
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      522341 non-null  int32 
 1   merchant_id  522341 non-null  int32 
 2   label        522341 non-null  object
 3   origin       522341 non-null  object
 4   age_range    522341 non-null  int8  
 5   gender       522341 non-null  int8  
dtypes: int32(2), int8(2), object(2)
memory usage: 16.9+ MB


## 用户特征工程(5min)

In [21]:
%%time
##### 特征处理
##### User特征处理
groups = user_log.groupby(['user_id'])

# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
all_data = all_data.merge(temp, on='user_id', how='left')

# 细分
# 使用 agg 基于列的聚合操作，统计唯一值个数 item_id, cat_id, merchant_id, brand_id
# 用户，交互行为：点了多少商品呢？
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')

# 用户，交互行为，具体统计：类目多少
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')

temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')

temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')


# 购物时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('B_time', 'max')]).reset_index()
temp['u6'] = (temp['B_time'] - temp['F_time']).dt.seconds/3600
all_data = all_data.merge(temp[['user_id', 'u6']], on='user_id', how='left')


# 统计操作类型为0，1，2，3的个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
    columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
all_data = all_data.merge(temp, on='user_id', how='left')

del temp,groups
gc.collect()

CPU times: user 41.9 s, sys: 1.31 s, total: 43.2 s
Wall time: 43.2 s


19

In [22]:
all_data.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10
0,34176,3906,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
1,34176,121,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
2,34176,4356,1.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
3,34176,2217,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
4,230784,4818,0.0,train,0,0,54,31,17,20,19,5.166667,47.0,,7.0,


## 店铺特征工程(5min)

In [23]:
%%time
##### 商家特征处理
groups = user_log.groupby(['merchant_id'])

# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
all_data = all_data.merge(temp, on='merchant_id', how='left')

# 统计商家被交互的 user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(
    columns={
    'user_id':'m2',
    'item_id':'m3', 
    'cat_id':'m4', 
    'brand_id':'m5'})
all_data = all_data.merge(temp, on='merchant_id', how='left')

# 统计商家被交互的 action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(  
    columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
all_data = all_data.merge(temp, on='merchant_id', how='left')

del temp
gc.collect()

CPU times: user 37.6 s, sys: 1.99 s, total: 39.6 s
Wall time: 39.7 s


0

In [24]:
display(all_data.tail())

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,...,u10,m1,m2,m3,m4,m5,m6,m7,m8,m9
522336,228479,3111,,test,6,0,2004,1173,71,278,...,208.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0
522337,97919,2341,,test,8,1,55,29,14,17,...,1.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0
522338,97919,3971,,test,8,1,55,29,14,17,...,1.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0
522339,32639,3536,,test,0,0,72,46,24,33,...,1.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0
522340,32639,3319,,test,0,0,72,46,24,33,...,1.0,25959,7927,952,175,85,21737.0,34.0,2700.0,1488.0


## 用户和店铺联合特征工程(4min)

In [25]:
%%time
##### 用户+商户特征
groups = user_log.groupby(['user_id', 'merchant_id'])

# 用户在不同商家交互统计
temp = groups.size().reset_index().rename(columns={0:'um1'})
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 统计用户在不同商家交互的 item_id, cat_id, brand_id 唯一值
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(
    columns={
    'item_id':'um2',
    'cat_id':'um3',
    'brand_id':'um4'})
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 统计用户在不同商家交互的 action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
    columns={
    0:'um5',
    1:'um6',
    2:'um7',
    3:'um8'})
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 统计用户在不同商家购物时间间隔特征 um9 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('B_time', 'max')]).reset_index()
temp['um9'] = (temp['B_time'] - temp['F_time']).dt.seconds/3600
all_data = all_data.merge(temp[['user_id','merchant_id','um9']], on=['user_id', 'merchant_id'], how='left')

del temp,groups
gc.collect()

CPU times: user 57.6 s, sys: 4.9 s, total: 1min 2s
Wall time: 1min 2s


19

In [26]:
display(all_data.head())

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,...,m9,um1,um2,um3,um4,um5,um6,um7,um8,um9
0,34176,3906,0.0,train,6,0,451,256,45,109,...,961.0,39,20,6,1,36.0,,1.0,2.0,0.85
1,34176,121,0.0,train,6,0,451,256,45,109,...,2699.0,14,1,1,1,13.0,,1.0,,0.05
2,34176,4356,1.0,train,6,0,451,256,45,109,...,196.0,18,2,1,1,12.0,,6.0,,0.016667
3,34176,2217,0.0,train,6,0,451,256,45,109,...,4150.0,2,1,1,1,1.0,,1.0,,0.0
4,230784,4818,0.0,train,0,0,54,31,17,20,...,1959.0,8,1,1,1,7.0,,1.0,,0.05


## 购买点击比

In [27]:
all_data['r1'] = all_data['u9']/all_data['u7'] # 用户购买点击比
all_data['r2'] = all_data['m8']/all_data['m6'] # 商家购买点击比
all_data['r3'] = all_data['um7']/all_data['um5'] #不同用户不同商家购买点击比
display(all_data.head())

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,...,um3,um4,um5,um6,um7,um8,um9,r1,r2,r3
0,34176,3906,0.0,train,6,0,451,256,45,109,...,6,1,36.0,,1.0,2.0,0.85,0.082927,0.027572,0.027778
1,34176,121,0.0,train,6,0,451,256,45,109,...,1,1,13.0,,1.0,,0.05,0.082927,0.066145,0.076923
2,34176,4356,1.0,train,6,0,451,256,45,109,...,1,1,12.0,,6.0,,0.016667,0.082927,0.158024,0.5
3,34176,2217,0.0,train,6,0,451,256,45,109,...,1,1,1.0,,1.0,,0.0,0.082927,0.071243,1.0
4,230784,4818,0.0,train,0,0,54,31,17,20,...,1,1,7.0,,1.0,,0.05,0.148936,0.063164,0.142857


## 空数据填充

In [28]:
display(all_data.isnull().sum())

user_id             0
merchant_id         0
label               0
origin              0
age_range           0
gender              0
u1                  0
u2                  0
u3                  0
u4                  0
u5                  0
u6                  0
u7                360
u8             484162
u9                  0
u10            227482
m1                  0
m2                  0
m3                  0
m4                  0
m5                  0
m6                  0
m7               4052
m8                  0
m9                  0
um1                 0
um2                 0
um3                 0
um4                 0
um5             59408
um6            512947
um7                 0
um8            425790
um9                 0
r1                360
r2                  0
r3              59408
dtype: int64

In [29]:
all_data.fillna(0, inplace=True)

In [30]:
all_data.isnull().sum()

user_id        0
merchant_id    0
label          0
origin         0
age_range      0
gender         0
u1             0
u2             0
u3             0
u4             0
u5             0
u6             0
u7             0
u8             0
u9             0
u10            0
m1             0
m2             0
m3             0
m4             0
m5             0
m6             0
m7             0
m8             0
m9             0
um1            0
um2            0
um3            0
um4            0
um5            0
um6            0
um7            0
um8            0
um9            0
r1             0
r2             0
r3             0
dtype: int64

## 年龄性别类别型转换

In [31]:
all_data['age_range']

0         6
1         6
2         6
3         6
4         0
         ..
522336    6
522337    8
522338    8
522339    0
522340    0
Name: age_range, Length: 522341, dtype: int8

In [32]:
%%time
# 修改age_range字段名称为 age_0, age_1, age_2... age_8
# 独立编码
temp = pd.get_dummies(all_data['age_range'], prefix='age')
display(temp.head(10))
all_data = pd.concat([all_data, temp], axis=1)

Unnamed: 0,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,1,0,0,0
7,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,1,0,0,0
9,0,0,0,0,1,0,0,0,0


CPU times: user 68.3 ms, sys: 4.9 ms, total: 73.2 ms
Wall time: 69.6 ms


In [33]:
# 性别转换
temp = pd.get_dummies(all_data['gender'], prefix='g')
all_data = pd.concat([all_data, temp], axis=1) # 列进行合并

# 删除原数据
all_data.drop(['age_range', 'gender'], axis=1, inplace=True)

del temp
gc.collect()

0

In [34]:
all_data.head()

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,...,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1,g_2
0,34176,3906,0.0,train,451,256,45,109,108,5.833333,...,0,0,0,0,1,0,0,1,0,0
1,34176,121,0.0,train,451,256,45,109,108,5.833333,...,0,0,0,0,1,0,0,1,0,0
2,34176,4356,1.0,train,451,256,45,109,108,5.833333,...,0,0,0,0,1,0,0,1,0,0
3,34176,2217,0.0,train,451,256,45,109,108,5.833333,...,0,0,0,0,1,0,0,1,0,0
4,230784,4818,0.0,train,54,31,17,20,19,5.166667,...,0,0,0,0,0,0,0,1,0,0


# 数据存储

In [35]:
%%time
# train_data、test-data
train_data = all_data[all_data['origin'] == 'train'].drop(['origin'], axis=1)
test_data = all_data[all_data['origin'] == 'test'].drop(['label', 'origin'], axis=1)

train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')

CPU times: user 7.33 s, sys: 99.6 ms, total: 7.43 s
Wall time: 7.44 s


# 算法建模预测

In [116]:
# 训练数据和目标值
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

# 数据拆分保留20%作为测试数据
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2)

## LGB 模型

In [None]:
# def lgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
#     model_lgb = lgb.LGBMClassifier(
#         max_depth=10, # 8 # 树最大的深度
#         n_estimators=5000, # 集成算法，树数量
#         min_child_weight=100, 
#         colsample_bytree=0.7, # 特征筛选
#         subsample=0.9,  # 样本采样比例
#         learning_rate=0.1) # 学习率

#     model_lgb.fit(
#         X_train, 
#         y_train,
#         eval_metric='auc',
#         eval_set=[(X_train, y_train), (X_valid, y_valid)],
#         verbose=verbose, # 是否打印输出训练过程
#         early_stopping_rounds=10) # 早停，等10轮决策，评价指标不在变化，停止

#     print(model_lgb.best_score_['valid_1']['auc'])
#     return model_lgb

In [None]:
# model_lgb = lgb_train(X_train.values, y_train, X_valid.values, y_valid, verbose=True)

In [None]:
# %%time
# prob = model_lgb.predict_proba(test_data.values) # 预测

# submission = pd.read_csv('./data_format1/test_format1.csv')

# # 复购的概率
# submission['prob'] = pd.Series(prob[:,1]) # 预测数据赋值给提交数据

# display(submission.head())

# submission.to_csv('submission_lgb.csv', index=False)

# del submission
# gc.collect()

## XGB 模型

In [91]:
def xgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=5000,
        min_child_weight=300, 
        colsample_bytree=0.7, 
        subsample=0.9, 
        learing_rate=0.1)
    
    model_xgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=verbose,
        early_stopping_rounds=10)# 早停法，如果auc在10epoch没有进步就stop
    print(model_xgb.best_score)
    return model_xgb

In [104]:
y_train

185218    0.0
66391     0.0
120761    1.0
156057    0.0
173632    0.0
         ... 
40556     0.0
39727     0.0
57854     0.0
62709     0.0
132381    0.0
Name: label, Length: 208691, dtype: object

In [101]:
y_train.astype(int)

ValueError: invalid literal for int() with base 10: '0.0'

In [108]:
y_train = y_train.astype(float)
y_train

185218    0.0
66391     0.0
120761    1.0
156057    0.0
173632    0.0
         ... 
40556     0.0
39727     0.0
57854     0.0
62709     0.0
132381    0.0
Name: label, Length: 208691, dtype: float64

In [109]:
y_valid = y_valid.astype(float)
y_valid

73713     0.0
205864    0.0
41950     0.0
57755     0.0
41947     0.0
         ... 
82266     0.0
160245    0.0
48738     0.0
84190     0.0
121841    0.0
Name: label, Length: 52173, dtype: float64

In [110]:
model_xgb = xgb_train(X_train,y_train, X_valid, y_valid,verbose=False)

Parameters: { "learing_rate" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


0.6796199966978188


In [87]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
'''
https://blog.csdn.net/Soft_Po/article/details/120372703
'min_child_weight': 5,孩子节点中最小的样本权重和。
如果一个叶子节点的样本权重和小于min_child_weight则拆分过程结束。即调大这个参数能够控制过拟合。
gamma = 0.1,# 树的叶子节点上做进一步分区所需的最小损失减少，越大越保守，一般0.1 0.2这样子
scale_pos_weight =10 # 如果取值大于0的话，在类别样本不平衡的情况下有助于快速收敛，平衡正负权重
'eta': 0.1, # 如同学习率'''
param = {'n_estimators': 5000, 'max_depth': 10, 'min_child_weight': 300, 'gamma': 0.1, 
         'subsample': 0.9,'colsample_bytree': 0.9, 'eta': 0.1,
         'objective': 'binary:logistic','eval_metric':'auc'}

num_round = param['n_estimators']
evallist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.64299	eval-auc:0.62560
[1]	train-auc:0.65554	eval-auc:0.63511
[2]	train-auc:0.65845	eval-auc:0.63673
[3]	train-auc:0.66186	eval-auc:0.63862
[4]	train-auc:0.66333	eval-auc:0.63968
[5]	train-auc:0.66593	eval-auc:0.64067
[6]	train-auc:0.66888	eval-auc:0.64192
[7]	train-auc:0.66946	eval-auc:0.64217
[8]	train-auc:0.67077	eval-auc:0.64231
[9]	train-auc:0.67171	eval-auc:0.64358
[10]	train-auc:0.67270	eval-auc:0.64402
[11]	train-auc:0.67388	eval-auc:0.64426
[12]	train-auc:0.67531	eval-auc:0.64425
[13]	train-auc:0.67578	eval-auc:0.64515
[14]	train-auc:0.67639	eval-auc:0.64519
[15]	train-auc:0.67685	eval-auc:0.64554
[16]	train-auc:0.67704	eval-auc:0.64543
[17]	train-

[192]	train-auc:0.73677	eval-auc:0.67587
[193]	train-auc:0.73691	eval-auc:0.67587


In [88]:
X_test_DMatrix = xgb.DMatrix(test_data)
prob = bst.predict(X_test_DMatrix)
print(prob)
# prob = bst.predict(test_data)
submission = pd.read_csv('./data_format1/test_format1.csv')
submission['prob'] = pd.Series(prob) 
submission.to_csv('submission_xgb.csv', index=False) 
display(submission.head()) 
del submission 
gc.collect()

[0.06208067 0.10710361 0.04364111 ... 0.12833077 0.0376034  0.05607484]


Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,0.062081
1,360576,1581,0.107104
2,98688,1964,0.043641
3,98688,3645,0.039762
4,295296,3361,0.064959


9165

In [52]:
X_train

Unnamed: 0,user_id,merchant_id,u1,u2,u3,u4,u5,u6,u7,u8,...,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1,g_2
76383,117345,1150,310,169,40,56,54,5.916667,292.0,0.0,...,0,0,1,0,0,0,0,1,0,0
59155,5166,1760,202,128,29,66,66,5.816667,195.0,0.0,...,0,1,0,0,0,0,0,1,0,0
7249,326037,4368,192,106,31,45,48,4.783333,170.0,0.0,...,0,0,0,0,0,0,0,1,0,0
6420,421266,3476,69,47,27,22,21,5.833333,55.0,0.0,...,0,0,1,0,0,0,0,1,0,0
251866,248165,4976,94,78,17,50,51,5.150000,92.0,0.0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189410,308399,361,32,19,10,8,9,4.983333,30.0,0.0,...,0,0,0,0,0,0,0,0,1,0
27695,315090,4318,22,12,10,12,12,5.833333,15.0,0.0,...,1,0,0,0,0,0,0,1,0,0
131147,160258,524,71,51,26,24,23,5.716667,63.0,0.0,...,0,0,0,1,0,0,0,1,0,0
9241,322203,1346,238,193,25,58,61,5.750000,235.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [53]:
X_train.values

array([[1.17345e+05, 1.15000e+03, 3.10000e+02, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.16600e+03, 1.76000e+03, 2.02000e+02, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.26037e+05, 4.36800e+03, 1.92000e+02, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [1.60258e+05, 5.24000e+02, 7.10000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.22203e+05, 1.34600e+03, 2.38000e+02, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.23229e+05, 4.37000e+02, 2.53000e+02, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [47]:
y_train.unique()

array(['0.0', '1.0'], dtype=object)

In [48]:
y_valid.unique()

array(['0.0', '1.0'], dtype=object)

In [63]:
print(np.array(y_train))

[0         0.0
 1         0.0
 2         1.0
 3         0.0
 4         0.0
          ...
 260859    0.0
 260860    0.0
 260861    0.0
 260862    0.0
 260863    0.0
 Name: label, Length: 247820, dtype: object
 0         0.0
 1         0.0
 2         1.0
 3         0.0
 4         0.0
          ...
 260858    0.0
 260859    0.0
 260860    0.0
 260862    0.0
 260863    0.0
 Name: label, Length: 247820, dtype: object
 0         0.0
 1         0.0
 2         1.0
 3         0.0
 4         0.0
          ...
 260859    0.0
 260860    0.0
 260861    0.0
 260862    0.0
 260863    0.0
 Name: label, Length: 247820, dtype: object
 0         0.0
 1         0.0
 2         1.0
 3         0.0
 4         0.0
          ...
 260859    0.0
 260860    0.0
 260861    0.0
 260862    0.0
 260863    0.0
 Name: label, Length: 247820, dtype: object
 1         0.0
 2         1.0
 4         0.0
 5         0.0
 6         0.0
          ...
 260859    0.0
 260860    0.0
 260861    0.0
 260862    0.0
 260863    0.0
 Nam

# 交叉验证多轮建模

In [111]:
# 构造训练集和测试集
def get_train_test_datas(train_df,label_df):
    skv = StratifiedKFold(n_splits=50, shuffle=True)
    trainX = []
    trainY = []
    testX = []
    testY = []
    # 索引：训练数据索引train_index,目标值的索引test_index
    for train_index, test_index in skv.split(X=train_df, y=label_df):# 10轮for循环
        
        train_x, train_y, test_x, test_y = train_df.iloc[train_index, :], label_df.iloc[train_index], \
                                            train_df.iloc[test_index, :], label_df.iloc[test_index]

        trainX.append(train_x)
        trainY.append(train_y)
        testX.append(test_x)
        testY.append(test_y)
    return trainX, testX, trainY, testY

## LGB 模型（1min）

In [121]:
%%time
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

# 拆分为10份训练数据和验证数据
X_train, X_valid, y_train, y_valid = get_train_test_datas(train_X, train_y)
y_valid = y_valid.astype(float)
y_valid = y_valid.astype(float)
y_valid = y_valid.astype(float)
print('----训练数据，长度',len(X_train))
print('----验证数据，长度',len(X_valid))

pred_lgbms = [] # 列表，接受目标值，10轮，平均值

for i in range(10):
    print('\n============================LGB training use Data {}/10============================\n'.format(i+1))
    model_lgb = lgb.LGBMClassifier(
        max_depth=10, # 8
        n_estimators=1000,
        min_child_weight=100,
        colsample_bytree=0.7,
        subsample=0.9,
        learning_rate=0.05)

    model_lgb.fit(
        X_train[i].values, 
        y_train[i],
        eval_metric='auc',
        eval_set=[(X_train[i].values, y_train[i]), (X_valid[i].values, y_valid[i])],
        verbose=False,
        early_stopping_rounds=10)

    print(model_lgb.best_score_['valid_1']['auc'])

    pred = model_lgb.predict_proba(test_data.values)
    
    pred = pd.DataFrame(pred[:,1]) # 将预测概率（复购）去处理，转换成DataFrame
    
    pred_lgbms.append(pred)

# 求10轮平均值生成预测结果，保存
# 每一轮的结果，作为一列，进行了添加
pred_lgbms = pd.concat(pred_lgbms, axis=1) # 级联，列进行级联

# 加载提交数据
submission = pd.read_csv('./data_format1/test_format1.csv')

submission['prob'] = pred_lgbms.mean(axis=1) # 10轮训练的平均值

submission.to_csv('submission_KFold_lgb.csv', index=False)

AttributeError: 'list' object has no attribute 'astype'

In [None]:
pred_lgbms

## XGB 模型

In [112]:
# 构造训练集和测试集
def get_train_test_datas(train_df,label_df):
    skv = StratifiedKFold(n_splits=50, shuffle=True)
    trainX = []
    trainY = []
    testX = []
    testY = []
    # 索引：训练数据索引train_index,目标值的索引test_index
    for train_index, test_index in skv.split(X=train_df, y=label_df):# 10轮for循环
        
        train_x, train_y, test_x, test_y = train_df.iloc[train_index, :], label_df.iloc[train_index], \
                                            train_df.iloc[test_index, :], label_df.iloc[test_index]

        trainX.append(train_x)
        trainY.append(train_y)
        testX.append(test_x)
        testY.append(test_y)
    return trainX, testX, trainY, testY

In [125]:
%%time

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

# 拆分为20份训练数据和验证数据
X_train, X_valid, y_train, y_valid = get_train_test_datas(train_X, train_y)
X_train = np.array(X_train)
X_valid= np.array(X_valid)
y_train = np.array(y_train).astype(np.float32)
y_valid = np.array(y_valid).astype(np.float32)
print('------数据长度',len(X_train),len(y_train))

pred_xgbs = []
for i in range(50):
    print('\n============================XGB training use Data {}/50============================\n'.format(i+1))
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=5000,
        min_child_weight=200, 
        colsample_bytree=0.7, 
        subsample=0.9,
        learning_rate = 0.1)

    model_xgb.fit(
        X_train[i], 
        y_train[i],
        eval_metric='auc',
        eval_set=[(X_train[i], y_train[i]), (X_valid[i], y_valid[i])],
        verbose=False,
        early_stopping_rounds=10 # 早停法，如果auc在10epoch没有进步就stop
    )    

    print(model_xgb.best_score)

    pred = model_xgb.predict_proba(test_data)
    pred = pd.DataFrame(pred[:,1])
    pred_xgbs.append(pred)

# 求20轮平均值生成预测结果，保存
pred_xgbs = pd.concat(pred_xgbs, axis=1)
submission = pd.read_csv('./data_format1/test_format1.csv')
submission['prob'] = pred_xgbs.mean(axis=1)
submission.to_csv('submission_KFold_xgb.csv', index=False)

ValueError: setting an array element with a sequence.

In [120]:
X_train = np.array(X_train).astype(dtype=int)
X_valid= np.array(X_valid).astype(dtype=int)
y_train = np.array(y_train).astype(dtype=int)
y_valid = np.array(y_valid).astype(dtype=int)

ValueError: setting an array element with a sequence.

In [123]:
X_train = np.array(X_train)
X_valid= np.array(X_valid)
y_train = np.array(y_train)
y_valid = np.array(y_valid)