In [1]:
import gc
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# 用户行为，使用format1进行加载
# 加载全量样本

user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_log_new = pd.read_csv('./data_format1/user_log_format1_new.csv')
user_log['time_stamp_new'] = user_log_new['time_stamp_new']

user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
submission = pd.read_csv('./data_format1/test_format1.csv')

"""
# 加载小样本
user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('./data_format1_small/train.csv')
submission = pd.read_csv('./data_format1_small/test.csv')
"""
train_data = pd.read_csv('./data_format2/train_format2.csv')

train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)
#print(matrix)

# 使用merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
user_log['time_stamp_new'] = user_log['time_stamp_new'].astype('float32')

# 对离散特征做LabelEncoder
lbe_merchant_id=LabelEncoder()
lbe_merchant_id.fit(np.r_[0,user_log['merchant_id'].values])
user_log['merchant_id']=lbe_merchant_id.transform(user_log['merchant_id'])
matrix['merchant_id']=lbe_merchant_id.transform(matrix['merchant_id'])

lbe_user_id=LabelEncoder()
user_log['user_id']=lbe_user_id.fit_transform(user_log['user_id'])
user_info['user_id']=lbe_user_id.transform(user_info['user_id'])
matrix['user_id']=lbe_user_id.transform(matrix['user_id'])

lbe_item_id=LabelEncoder()
user_log['item_id']=lbe_item_id.fit_transform(user_log['item_id'])
lbe_cat_id=LabelEncoder()
user_log['cat_id']=lbe_cat_id.fit_transform(user_log['cat_id'])
lbe_brand_id=LabelEncoder()
user_log['brand_id']=lbe_brand_id.fit_transform(user_log['brand_id'])

user_log['merchant_id'].max(),user_log['user_id'].max()
matrix = matrix.merge(user_info, on='user_id', how='left')

# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1
gc.collect()
#print(matrix)

# User特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值的个数 item_id, cat_id, merchant_id, brand_id
#temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id':'u4', 'brand_id':'u5'})
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计action_type为0，1，2，3的个数（原始操作，没有补0）
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')
#print(matrix)

# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
#print(matrix)

# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
#print(temp)
#print('-'*100)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔
#print(matrix)

#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 
#商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] 
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)
#print(matrix)


In [2]:
lbe_action_type={0:1,1:2,2:3,3:4}
user_log['action_type']=user_log['action_type'].map(lbe_action_type)
# 用户行为sequence
user_log_1 = pd.DataFrame(user_log.sort_values('time_stamp_new', ascending=True).groupby('user_id')['merchant_id','action_type','time_stamp_new'].agg(lambda x:list(x)))
split_pot = list()
for session in user_log_1['time_stamp_new']:
    temp_list = list()
    for session_num in range(1,len(session)):
        time_num = (session[session_num] - session[session_num-1])
        if time_num >= 0.5 :
            temp_list.append(session_num)
    split_pot.append(temp_list)

In [3]:
max_len = 0
for count_1 in split_pot:
    temp = len(count_1)
    if temp > max_len:
        max_len = temp
print(max_len)

7


In [9]:
import time
from tqdm import tqdm
n = len(user_log_1['merchant_id'])
merchant_id_list_all = pd.DataFrame(columns = [range(3)])

for count_num in tqdm(range(n)):
    merchant_id_temp = user_log_1['merchant_id'][count_num]
    merchant_id_list_one = []
    split_pot_n = split_pot[count_num]
    if split_pot_n == []:
        merchant_id_list_one.append(merchant_id_temp)
    else:
        if len(split_pot_n) > 2:
            for count_2 in range(3):
                if count_2 == 0:
                    merchant_id_list_one.append(merchant_id_temp[0:split_pot_n[count_2]])
                else:
                    merchant_id_list_one.append(merchant_id_temp[split_pot_n[count_2-1]:split_pot_n[count_2]])
        else:
            for count_2 in range(len(split_pot_n)):
                if count_2 == 0:
                    merchant_id_list_one.append(merchant_id_temp[0:split_pot_n[count_2]])
                else:
                    merchant_id_list_one.append(merchant_id_temp[split_pot_n[count_2-1]:split_pot_n[count_2]])
                if count_2 == len(split_pot_n)-1:
                    merchant_id_list_one.append(merchant_id_temp[split_pot_n[count_2]:])                    
    while len(merchant_id_list_one) < 3:
        merchant_id_list_one.append([0])    
    merchant_id_list_one_temp = pd.Series(merchant_id_list_one)
    merchant_id_list_all = merchant_id_list_all.append(merchant_id_list_one_temp,ignore_index=True)

#print(merchant_id_list_all)
merchant_id_list_all.to_csv('merchant_id_list_all.csv',index = 0)

100%|████████████████████████████████████████████████████████████████████████| 424170/424170 [8:48:56<00:00, 13.37it/s]


In [15]:
n = len(user_log_1['action_type'])
action_type_list_all = pd.DataFrame(columns = [range(3)])

for count_num1 in tqdm(range(n)):
    action_type_temp = user_log_1['action_type'][count_num1]
    action_type_list_one = []
    split_pot_n = split_pot[count_num1]
    if split_pot_n == []:
        action_type_list_one.append(action_type_temp)
    else:
        if len(split_pot_n) > 2:
            for count_3 in range(3):
                if count_3 == 0:
                    action_type_list_one.append(action_type_temp[0:split_pot_n[count_3]])
                else:
                    action_type_list_one.append(action_type_temp[split_pot_n[count_3-1]:split_pot_n[count_3]])
        else:
            for count_4 in range(len(split_pot_n)):
                if count_4 == 0:
                    action_type_list_one.append(action_type_temp[0:split_pot_n[count_4]])
                else:
                    action_type_list_one.append(action_type_temp[split_pot_n[count_4-1]:split_pot_n[count_4]])
                if count_4 == len(split_pot_n)-1:
                    action_type_list_one.append(action_type_temp[split_pot_n[count_4]:])                    
    while len(action_type_list_one) < 3:
        action_type_list_one.append([0])    
    action_type_list_one_temp = pd.Series(action_type_list_one)
    action_type_list_all = action_type_list_all.append(action_type_list_one_temp,ignore_index=True)
    #action_type_list_all = pd.concat([action_type_list_all,action_type_list_one_temp],axis = 0 ,ignore_index=True)

#print(merchant_id_list_all)
action_type_list_all.to_csv('action_type_list_all.csv',index = 0)

100%|████████████████████████████████████████████████████████████████████████| 424170/424170 [8:50:32<00:00, 13.32it/s]


In [17]:
#print(action_type_list_all)
for count_2 in range(3):
    user_log_1['sess_' + str(count_2) + '_merchant_id'] = merchant_id_list_all[count_2]
    user_log_1['sess_' + str(count_2) + '_action_type'] = action_type_list_all[count_2]
#print(user_log_1)

In [18]:
matrix = matrix.merge(user_log_1, on='user_id', how='left')
#print(matrix)

In [19]:
matrix.to_csv('matrix.csv',index = 0)

In [1]:
import gc
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
matrix = pd.read_csv('matrix.csv')

In [2]:
for num in range(3):
    temp_1 = matrix['sess_'+ str(num) + '_merchant_id']
    temp_2 = matrix['sess_'+ str(num) + '_action_type']
    temp_3 = 'sess_'+ str(num) + '_merchant_id'
    temp_4 = 'sess_'+ str(num) + '_action_type'
    for num_1 in tqdm(range(len(temp_1))):
        temp = temp_1[num_1]
        #print(temp)
        merchant_id_list = temp[1:-1].strip().split(', ')
        #print(merchant_id_list)
        matrix.at[num_1,temp_3] = merchant_id_list

    for num_2 in tqdm(range(len(temp_2))):
        temp = temp_2[num_2]
        action_type_list = temp[1:-1].strip().split(', ')
        matrix.at[num_2, temp_4] = action_type_list

100%|███████████████████████████████████████████████████████████████████████| 522341/522341 [00:08<00:00, 61608.51it/s]
100%|███████████████████████████████████████████████████████████████████████| 522341/522341 [00:08<00:00, 61784.57it/s]
100%|███████████████████████████████████████████████████████████████████████| 522341/522341 [00:09<00:00, 54698.62it/s]
100%|███████████████████████████████████████████████████████████████████████| 522341/522341 [00:09<00:00, 57142.60it/s]
100%|███████████████████████████████████████████████████████████████████████| 522341/522341 [00:10<00:00, 48059.78it/s]
100%|███████████████████████████████████████████████████████████████████████| 522341/522341 [00:08<00:00, 60760.89it/s]


In [3]:
M=500

for num in tqdm(range(3)):
    temp_1 = matrix['sess_'+ str(num) + '_merchant_id']
    temp_2 = matrix['sess_'+ str(num) + '_action_type']
    temp_3 = 'sess_'+ str(num) + '_merchant_id'
    temp_4 = 'sess_'+ str(num) + '_action_type'
    for merchant_id_num1 in range(len(temp_1)):
        temp_5 = temp_1[merchant_id_num1]
        #print(temp_5)
        if len(temp_5)>M:
            matrix.at[merchant_id_num1,temp_3] = temp_5[:M]
        else:
            while len(temp_5)<M:
                temp_5.append('0')
            matrix.at[merchant_id_num1,temp_3] = temp_5
    for action_type_num1 in range(len(temp_1)):
        temp_6 = temp_2[action_type_num1]
        if len(temp_6)>M:
            matrix.at[action_type_num1,temp_4] = temp_6[:M]
        else:
            while len(temp_6)<M:
                temp_6.append('0')
            matrix.at[action_type_num1,temp_4] = temp_6            


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:42<00:00, 94.21s/it]


In [4]:
matrix = matrix.drop('merchant_id_y',axis = 1)
matrix = matrix.drop('action_type',axis = 1)
matrix = matrix.drop('time_stamp_new',axis = 1)
matrix = matrix.rename(columns={'merchant_id_x':'merchant_id'})

In [5]:
for num in tqdm(range(len(matrix))):
    for num_1 in range(3):
        temp_1 = matrix['sess_'+ str(num_1) + '_merchant_id']
        temp_2 = temp_1[num]
        temp_3 = temp_2[0]
        if temp_3 == '0':
            matrix.loc[num,'sess_length'] = num_1
            break
        else:
            if num_1 == 2 :
                matrix.loc[num,'sess_length'] = 3

#print(matrix)

100%|█████████████████████████████████████████████████████████████████████████| 522341/522341 [48:44<00:00, 178.60it/s]


In [6]:
# 分割训练数据和测试数据
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

In [7]:
# 使用DSIN模型
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from deepctr.inputs import SparseFeat,VarLenSparseFeat,DenseFeat,get_feature_names
from deepctr.models import DIN, DIEN, DSIN
from sklearn.metrics import classification_report

train_X['action_type']=3
feature_columns = []
for column in train_X.columns:
    session_text = 'sess'
    if session_text not in column :
        #print(column)
        num = train_X[column].nunique()
        if num > 10000:
            dim = 10
        else:
            if num > 1000:
                dim = 8
            else:
                dim = 4
        #print(num)
        if column  == 'user_id':
            feature_columns += [SparseFeat(column, 424169+1, embedding_dim=dim, use_hash=True)]
        elif column  == 'merchant_id':
            feature_columns += [SparseFeat(column, 4994+1, embedding_dim=dim, use_hash=True)]
        elif column  == 'action_type':
            feature_columns += [SparseFeat(column, 4+1, embedding_dim=dim, use_hash=True)]
        else:
            feature_columns += [DenseFeat(column, 1)]

print('M=', M)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


M= 500


In [8]:
#M = 500
# maxlen为历史信息的长度，vocabulary_size为onehot的长度
feature_columns += [VarLenSparseFeat(SparseFeat('sess_0_merchant_id', vocabulary_size=424169 + 1, embedding_dim=10, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    VarLenSparseFeat(SparseFeat('sess_0_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]
feature_columns += [VarLenSparseFeat(SparseFeat('sess_1_merchant_id', vocabulary_size=424169 + 1, embedding_dim=10, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    VarLenSparseFeat(SparseFeat('sess_1_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]
feature_columns += [VarLenSparseFeat(SparseFeat('sess_2_merchant_id', vocabulary_size=424169 + 1, embedding_dim=10, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    VarLenSparseFeat(SparseFeat('sess_2_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]
#feature_columns += [VarLenSparseFeat(SparseFeat('sess_3_merchant_id', vocabulary_size=19111 + 1, embedding_dim=8, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    #VarLenSparseFeat(SparseFeat('sess_3_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]
#feature_columns += [VarLenSparseFeat(SparseFeat('sess_4_merchant_id', vocabulary_size=19111 + 1, embedding_dim=8, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    #VarLenSparseFeat(SparseFeat('sess_4_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]
#feature_columns += [VarLenSparseFeat(SparseFeat('sess_5_merchant_id', vocabulary_size=19111 + 1, embedding_dim=8, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    #VarLenSparseFeat(SparseFeat('sess_5_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]
#feature_columns += [VarLenSparseFeat(SparseFeat('sess_6_merchant_id', vocabulary_size=19111 + 1, embedding_dim=8, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    #VarLenSparseFeat(SparseFeat('sess_6_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]
#feature_columns += [VarLenSparseFeat(SparseFeat('sess_7_merchant_id', vocabulary_size=19111 + 1, embedding_dim=8, use_hash=True, embedding_name='merchant_id'),maxlen=M),
                    #VarLenSparseFeat(SparseFeat('sess_7_action_type', vocabulary_size=4 + 1, embedding_dim=4, use_hash=True, embedding_name='cate_id'),maxlen=M)]


In [9]:
del matrix
gc.collect()
hist_features=['merchant_id','action_type']
# 使用DSIN模型
model=DSIN(feature_columns, hist_features, sess_max_count=3, att_embedding_size=3, att_head_num=4)
# 使用Adam优化器，二分类的交叉熵
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])

# 组装train_model_input，得到feature names，将train_X转换为字典格式
feature_names=list(train_X.columns)
train_model_input = {name:train_X[name].values for name in feature_names}
# histroy输入必须是二维数组
for fea in ['sess_0_merchant_id','sess_0_action_type',
           'sess_1_merchant_id','sess_1_action_type',
           'sess_2_merchant_id','sess_2_action_type']:
           #'sess_3_merchant_id','sess_3_action_type',
           #'sess_4_merchant_id','sess_4_action_type',
           #'sess_5_merchant_id','sess_5_action_type',
           #'sess_6_merchant_id','sess_6_action_type',
           #'sess_7_merchant_id','sess_7_action_type']:
    l = []
    for i in tqdm(train_model_input[fea]):
        l.append(i)
    train_model_input[fea]=np.array(l)
history = model.fit(train_model_input, train_y, verbose=True, epochs=3, validation_split=0.2,batch_size=32)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
dim is deprecated, use axis instead


100%|█████████████████████████████████████████████████████████████████████| 260864/260864 [00:00<00:00, 2289720.45it/s]
100%|█████████████████████████████████████████████████████████████████████| 260864/260864 [00:00<00:00, 1750222.62it/s]
100%|█████████████████████████████████████████████████████████████████████| 260864/260864 [00:00<00:00, 1809679.23it/s]
100%|█████████████████████████████████████████████████████████████████████| 260864/260864 [00:00<00:00, 1309024.53it/s]
100%|█████████████████████████████████████████████████████████████████████| 260864/260864 [00:00<00:00, 1253524.84it/s]
100%|██████████████████████████████████████████████████████████████████████| 260864/260864 [00:00<00:00, 659919.71it/s]


Train on 208691 samples, validate on 52173 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
# 转换test__model_input
test_data['action_type']=3
test_model_input = {name:test_data[name].values for name in feature_names}
from tqdm import tqdm
for fea in ['sess_0_merchant_id','sess_0_action_type',
           'sess_1_merchant_id','sess_1_action_type',
           'sess_2_merchant_id','sess_2_action_type']:
    l = []
    for i in tqdm(test_model_input[fea]):
        l.append(i)
    test_model_input[fea]=np.array(l)


# 得到预测结果
prob = model.predict(test_model_input)
submission = pd.read_csv('./data_format1/test_format1.csv')
submission['prob'] = prob
submission.to_csv('prediction.csv', index=False)


100%|██████████████████████████████████████████████████████████████████████| 261477/261477 [00:00<00:00, 931405.40it/s]
100%|██████████████████████████████████████████████████████████████████████| 261477/261477 [00:00<00:00, 494921.57it/s]
100%|██████████████████████████████████████████████████████████████████████| 261477/261477 [00:00<00:00, 792807.80it/s]
100%|█████████████████████████████████████████████████████████████████████| 261477/261477 [00:00<00:00, 1101479.03it/s]
100%|██████████████████████████████████████████████████████████████████████| 261477/261477 [00:00<00:00, 907992.36it/s]
100%|█████████████████████████████████████████████████████████████████████| 261477/261477 [00:00<00:00, 1253353.08it/s]
