### 调库导入模型

In [1]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

### 读取数据填充空值

In [2]:
# Load the Criteo sample from the working directory.
data = pd.read_csv('./criteo_sample.txt')

# Column groups: categorical fields C1..C26, numeric fields I1..I13,
# and the prediction target 'label'.
sparse_features = [f'C{i}' for i in range(1, 27)]
dense_features = [f'I{i}' for i in range(1, 14)]
target = ['label']

# Missing categorical values become the placeholder string '-1';
# missing numeric values become 0.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)

print(data.shape)
data.head()

Please check the latest version manually on https://pypi.org/project/deepctr-torch/#history
(200, 40)


Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,3,260.0,0.0,17668.0,0.0,0.0,33.0,0.0,...,e5ba7672,87c6f83c,-1,-1,0429f84b,-1,3a171ecb,c0d61a5c,-1,-1
1,0,0.0,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,-1,-1,5155d8a3,-1,be7c41b4,ded4aac9,-1,-1
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,-1,-1,2e01979f,-1,bcdee96c,6d5d1302,-1,-1
3,0,0.0,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,-1,-1,e587c466,-1,32c7478e,3b183c5c,-1,-1
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,-1,32c7478e,0d4a6d1a,001f3601,92c878de


### 对类别特征进行标签编码，对数值特征进行简单变换

In [3]:
# Replace each categorical column with integer codes; a fresh LabelEncoder
# is fitted per column so the codes of different columns are independent.
for column in sparse_features:
    lbe = LabelEncoder()
    lbe.fit(data[column])
    data[column] = lbe.transform(data[column])

# Rescale all numeric columns to the [0, 1] range with a single scaler.
mms = MinMaxScaler(feature_range=(0, 1)).fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,0.001332,0.092362,0.0,0.034825,0.0,0.0,0.673469,0.0,...,8,66,0,0,3,0,1,96,0,0
1,0,0.0,0.0,0.00675,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,...,7,52,0,0,47,0,7,112,0,0
2,0,0.0,0.000333,0.00071,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,...,8,49,0,0,25,0,6,53,0,0
3,0,0.0,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,...,8,37,0,0,156,0,0,32,0,0
4,0,0.0,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,...,8,14,5,3,9,0,0,5,1,47


### 统计每个类别字段的类别数，并记录数值特征字段名称

In [4]:
# One SparseFeat per categorical column: the vocabulary size is the number of
# distinct values found in that column, and each gets a 4-dimensional embedding.
sparse_columns = [
    SparseFeat(name, data[name].nunique(), embedding_dim=4)
    for name in sparse_features
]
# One DenseFeat of dimension 1 per numeric column.
dense_columns = [DenseFeat(name, 1) for name in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear part and the DNN part share the very same column list.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Names of the input features, used to build the model input dicts.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
fixlen_feature_columns

[SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'),
 SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C2', group_name='default_group'),
 SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C3', group_name='default_group'),
 SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C4', group_name='default_group'),
 SparseFeat(name='C5', vocabulary_size=12, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C5', group_name='default_group'),
 SparseFeat(name='C6', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C6', group_name='default_group'),
 SparseFeat(name='C7', vocabulary_size=183, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C7', group_name='default_group'),
 SparseFeat

### 划分训练集测试集，生成模型输入数据

In [7]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and therefore the reported test metrics, reproducible when the
# notebook is re-run from a fresh kernel.
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# Model inputs are dicts keyed by feature name, each value being that
# feature's column from the corresponding split.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
type(train_model_input)

dict

### 定义模型

In [8]:
# Use the first CUDA device when it is both requested and available;
# otherwise stay on the CPU.
use_cuda = True
cuda_ready = use_cuda and torch.cuda.is_available()
device = 'cuda:0' if cuda_ready else 'cpu'
if cuda_ready:
    print('cuda ready...')

# DeepFM for binary classification, with L2 regularisation on the embeddings.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)

# Adagrad optimizer, binary cross-entropy loss; report cross-entropy and AUC.
model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)
model

cuda ready...


DeepFM(
  (embedding_dict): ModuleDict(
    (C1): Embedding(27, 4)
    (C2): Embedding(92, 4)
    (C3): Embedding(172, 4)
    (C4): Embedding(157, 4)
    (C5): Embedding(12, 4)
    (C6): Embedding(7, 4)
    (C7): Embedding(183, 4)
    (C8): Embedding(19, 4)
    (C9): Embedding(2, 4)
    (C10): Embedding(142, 4)
    (C11): Embedding(173, 4)
    (C12): Embedding(170, 4)
    (C13): Embedding(166, 4)
    (C14): Embedding(14, 4)
    (C15): Embedding(170, 4)
    (C16): Embedding(168, 4)
    (C17): Embedding(9, 4)
    (C18): Embedding(127, 4)
    (C19): Embedding(44, 4)
    (C20): Embedding(4, 4)
    (C21): Embedding(169, 4)
    (C22): Embedding(6, 4)
    (C23): Embedding(10, 4)
    (C24): Embedding(125, 4)
    (C25): Embedding(20, 4)
    (C26): Embedding(90, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (C1): Embedding(27, 1)
      (C2): Embedding(92, 1)
      (C3): Embedding(172, 1)
      (C4): Embedding(157, 1)
      (C5): Embedding(12, 1)
      (C6): Embedding(7

### 训练，查看训练效果

In [10]:
# Fit on the whole training split; validation_split=0.0 means no rows are
# set aside for validation during training.
model.fit(
    train_model_input,
    train[target].values,
    batch_size=32,
    epochs=10,
    verbose=2,
    validation_split=0.0,
)

# Predict on the held-out test rows and report log loss and AUC against the
# true labels, rounded to four decimals.
test_labels = test[target].values
pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test_labels, pred_ans), 4))
print("test AUC", round(roc_auc_score(test_labels, pred_ans), 4))

cuda:0
Train on 160 samples, validate on 0 samples, 5 steps per epoch
Epoch 1/10
1s - loss:  0.6167 - binary_crossentropy:  0.6167 - auc:  0.5404
Epoch 2/10
0s - loss:  0.4851 - binary_crossentropy:  0.4851 - auc:  0.9784
Epoch 3/10
0s - loss:  0.3628 - binary_crossentropy:  0.3628 - auc:  0.9865
Epoch 4/10
0s - loss:  0.2150 - binary_crossentropy:  0.2150 - auc:  0.9991
Epoch 5/10
0s - loss:  0.1201 - binary_crossentropy:  0.1201 - auc:  1.0000
Epoch 6/10
0s - loss:  0.0769 - binary_crossentropy:  0.0769 - auc:  1.0000
Epoch 7/10
0s - loss:  0.0513 - binary_crossentropy:  0.0513 - auc:  1.0000
Epoch 8/10
0s - loss:  0.0362 - binary_crossentropy:  0.0362 - auc:  1.0000
Epoch 9/10
0s - loss:  0.0272 - binary_crossentropy:  0.0272 - auc:  1.0000
Epoch 10/10
0s - loss:  0.0210 - binary_crossentropy:  0.0210 - auc:  1.0000

test LogLoss 0.9136
test AUC 0.49
