In [56]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-may-2022/train.csv
/kaggle/input/tabular-playground-series-may-2022/test.csv


# Pytorch开发深度学习模型一般步骤

## 1.定义DataSet
- 理解数据原始形式
- 理解数据编码方式
- 理解如何进行数据I/O

## 2.定义模型
- 定义各个子模块
- 将子模块合并成最终的模型

## 3.完成Train Pipeline/Valid Pipeline
- Pytorch一般的Train Pipeline/Valid Pipeline书写

In [57]:
# 导入库
import torch
from torch.utils.data import Dataset
import torch.utils.data as D
from torch import nn
import copy
import os
from sklearn.metrics import roc_auc_score, log_loss
from tqdm.notebook import tqdm
from collections import defaultdict  # 该模块提供高阶数据类型（有默认值的字典）

In [58]:
# 参数
config = {
    'train_path': '/kaggle/input/tabular-playground-series-may-2022/train.csv', 
    'test_path': '/kaggle/input/tabular-playground-series-may-2022/test.csv',
    "sparse_cols":['f_07','f_08','f_09','f_10','f_11','f_12','f_13','f_14','f_15','f_16','f_17','f_18','f_29','f_30']+[f'ch_{i}' for i in range(10)],
    "dense_cols": ['f_00','f_01','f_02','f_03','f_04','f_05','f_06','f_19','f_20','f_21','f_22','f_23','f_24','f_25','f_26','f_28',],
    "debug_mode": False,
    "epoch": 5,
    "batch": 2048,
    "lr": 0.001,
    "device": 0,
}

In [59]:
train_df = pd.read_csv(config['train_path'])
if config['debug_mode']:
    train_df = train_df[:1000]
test_df = pd.read_csv(config['test_path'])

df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

for i in tqdm(range(10)):
    df[f'ch_{i}'] = df['f_27'].str.get(i).apply(ord) - ord('A')

  0%|          | 0/10 [00:00<?, ?it/s]

In [60]:
# 特征编码函数
def get_enc_dict(df, config):
    enc_dict = defaultdict(dict)  # 元素数据类型为dict
    for f in tqdm(config['sparse_cols']):  # 离散型数据
        map_dict = dict(zip(df[f].unique(), range(1, df[f].nunique()+1)))
        enc_dict[f] = map_dict
        enc_dict[f]['vocab_size'] = df[f].unique()+1
        
    for f in tqdm(config['dense_cols']):  # 连续型数据
        enc_dict[f]['min'] = df[f].min()
        enc_dict[f]['max'] = df[f].max()
        enc_dict[f]['std'] = df[f].std()
        
    return enc_dict

In [61]:
enc_dict = get_enc_dict(df, config)
print(enc_dict)

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

defaultdict(<class 'dict'>, {'f_07': {1: 1, 3: 2, 6: 3, 2: 4, 5: 5, 4: 6, 0: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 'vocab_size': array([ 2,  4,  7,  3,  6,  5,  1,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])}, 'f_08': {5: 1, 3: 2, 0: 3, 2: 4, 1: 5, 7: 6, 6: 7, 4: 8, 9: 9, 8: 10, 10: 11, 11: 12, 12: 13, 13: 14, 16: 15, 14: 16, 'vocab_size': array([ 6,  4,  1,  3,  2,  8,  7,  5, 10,  9, 11, 12, 13, 14, 17, 15])}, 'f_09': {1: 1, 4: 2, 2: 3, 0: 4, 3: 5, 5: 6, 7: 7, 6: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 16: 16, 'vocab_size': array([ 2,  5,  3,  1,  4,  6,  8,  7,  9, 10, 11, 12, 13, 14, 15, 17])}, 'f_10': {3: 1, 0: 2, 6: 3, 4: 4, 2: 5, 7: 6, 1: 7, 5: 8, 10: 9, 8: 10, 9: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 'vocab_size': array([ 4,  1,  7,  5,  3,  8,  2,  6, 11,  9, 10, 12, 13, 14, 15, 16])}, 'f_11': {3: 1, 2: 2, 6: 3, 1: 4, 4: 5, 5: 6, 0: 7, 9: 8, 7: 9, 8: 10, 10: 11, 12: 12, 11: 13, 13: 14, 14: 15, 'vocab_size': array([ 4, 

In [62]:
# 构造dataset
class BaseDataset(Dataset):
    def __init__(self, df, config, enc_dict=None):
        self.df = df
        self.config = config
        self.enc_dict = enc_dict
        self.dense_cols = list(set(self.config['dense_cols']))
        self.sparse_cols = list(set(self.config['sparse_cols']))
        self.feature_name = self.dense_cols + self.sparse_cols + ['label']
        self.enc_data()  # 自动执行数据编码函数，得到self.enc_df
        
    def enc_dense_data(self, col):
        return (self.df[col] - self.enc_dict[col]['min']) / (self.enc_dict[col]['max'] - self.enc_dict[col]['min'])
    
    def enc_sparse_data(self, col):
        return self.df[col].apply(lambda x: self.enc_dict[col].get(x,0))
    
    def enc_data(self):
        self.enc_df = copy.deepcopy(self.df)  # 深拷贝
        for col in self.dense_cols:
            self.enc_df[col] = self.enc_dense_data(col)
        for col in self.sparse_cols:
            self.enc_df[col] = self.enc_sparse_data(col)
            
    def __getitem__(self, index):  # 根据index取数据
        data = dict()
        for col in self.feature_name:
            if col in self.dense_cols:
                data[col] = torch.Tensor([self.enc_df[col].iloc[index]]).squeeze(-1)
            elif col in self.sparse_cols:
                data[col] = torch.Tensor([self.enc_df[col].iloc[index]]).long().squeeze(-1)
            
        if 'target' in self.enc_df.columns:
            data['target'] = torch.Tensor([self.enc_df['target'].iloc[index]]).squeeze(-1)
        return data
    
    def __len__(self):
        return len(self.enc_df)

In [63]:
train_df = df[df['target'].notna()].reset_index(drop=True)
test_df = df[df['target'].isna()].reset_index(drop=True)

train_num = int(len(train_df)*0.8)  # 按照8:2划分训练集和验证集

valid_df = train_df[train_num:].reset_index(drop=True)
train_df = train_df[:train_num].reset_index(drop=True)

In [64]:
train_dataset = BaseDataset(train_df, config, enc_dict=enc_dict)
valid_dataset = BaseDataset(valid_df, config, enc_dict=enc_dict)
test_dataset = BaseDataset(test_df, config, enc_dict=enc_dict)

In [65]:
# 可以抽取出来看看
train_dataset.__getitem__(5)

{'f_21': tensor(0.4961),
 'f_22': tensor(0.4136),
 'f_00': tensor(0.5766),
 'f_03': tensor(0.4239),
 'f_01': tensor(0.6185),
 'f_06': tensor(0.3839),
 'f_20': tensor(0.6430),
 'f_23': tensor(0.4478),
 'f_26': tensor(0.5356),
 'f_05': tensor(0.5364),
 'f_02': tensor(0.5420),
 'f_28': tensor(0.5187),
 'f_19': tensor(0.5284),
 'f_25': tensor(0.5545),
 'f_04': tensor(0.6201),
 'f_24': tensor(0.3972),
 'ch_7': tensor(5),
 'f_15': tensor(1),
 'ch_8': tensor(5),
 'f_13': tensor(3),
 'ch_1': tensor(4),
 'f_09': tensor(2),
 'f_07': tensor(3),
 'f_08': tensor(4),
 'ch_3': tensor(3),
 'f_11': tensor(4),
 'ch_5': tensor(2),
 'f_18': tensor(4),
 'ch_9': tensor(4),
 'ch_2': tensor(1),
 'f_14': tensor(2),
 'f_16': tensor(5),
 'f_29': tensor(1),
 'f_17': tensor(4),
 'f_30': tensor(3),
 'f_12': tensor(3),
 'f_10': tensor(5),
 'ch_0': tensor(2),
 'ch_6': tensor(1),
 'ch_4': tensor(1),
 'target': tensor(0.)}

In [66]:
# 定义模型基本层
class EmbeddingLayer(nn.Module):
    def __init__(self,
                 enc_dict = None,
                 embedding_dim = None):
        super(EmbeddingLayer, self).__init__()
        self.enc_dict = enc_dict
        self.embedding_dim = embedding_dim
        self.embedding_layer = nn.ModuleDict()

        self.emb_feature = []

        for col in self.enc_dict.keys():
            if 'vocab_size' in self.enc_dict[col].keys():
                self.emb_feature.append(col)
                self.embedding_layer.update({col : nn.Embedding(
                    self.enc_dict[col]['vocab_size'],
                    self.embedding_dim,
                )})

    def forward(self, X):
        #对所有的sparse特征挨个进行embedding
        feature_emb_list = []
        for col in self.emb_feature:
            inp = X[col].long().view(-1, 1)
            feature_emb_list.append(self.embedding_layer[col](inp))
        feature_emb = torch.stack(feature_emb_list, dim=1)
        return feature_emb

In [67]:
# 定义多层感知机模型
class MLP_Layer(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim=None,
                 hidden_units=[],
                 hidden_activations="ReLU",
                 final_activation=None,
                 dropout_rates=0,
                 batch_norm=False,
                 use_bias=True):
        super(MLP_Layer, self).__init__()
        dense_layers = []
        if not isinstance(dropout_rates, list):
            dropout_rates = [dropout_rates] * len(hidden_units)
        if not isinstance(hidden_activations, list):
            hidden_activations = [hidden_activations] * len(hidden_units)
        hidden_activations = [set_activation(x) for x in hidden_activations]
        hidden_units = [input_dim] + hidden_units
        for idx in range(len(hidden_units) - 1):
            dense_layers.append(nn.Linear(hidden_units[idx], hidden_units[idx + 1], bias=use_bias))
            if batch_norm:
                dense_layers.append(nn.BatchNorm1d(hidden_units[idx + 1]))
            if hidden_activations[idx]:
                dense_layers.append(hidden_activations[idx])
            if dropout_rates[idx] > 0:
                dense_layers.append(nn.Dropout(p=dropout_rates[idx]))
        if output_dim is not None:
            dense_layers.append(nn.Linear(hidden_units[-1], output_dim, bias=use_bias))
        if final_activation is not None:
            dense_layers.append(set_activation(final_activation))
        self.dnn = nn.Sequential(*dense_layers)  # * used to unpack list

    def forward(self, inputs):
        return self.dnn(inputs)

In [68]:
# 相关函数
def set_device(gpu=-1):
    if gpu >= 0 and torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        device = torch.device(f"cuda:{gpu}")
    else:
        device = torch.device("cpu")
    return device
    
def set_activation(activation):
    if isinstance(activation, str):
        if activation.lower() == "relu":
            return nn.ReLU()
        elif activation.lower() == "sigmoid":
            return nn.Sigmoid()
        elif activation.lower() == "tanh":
            return nn.Tanh()
        else:
            return getattr(nn, activation)()
    else:
        return activation
    
def get_dnn_input_dim(enc_dict, embedding_dim):
    num_sparse = 0
    num_dense = 0
    for col in enc_dict.keys():
        if 'min' in enc_dict[col].keys():
            num_dense+=1
        elif 'vocab_size' in enc_dict[col].keys():
            num_sparse+=1
    return num_sparse * embedding_dim + num_dense

def get_linear_input(enc_dict, data):
    res_data = []
    for col in enc_dict.keys():
        if 'min' in enc_dict[col].keys():
            res_data.append(data[col])
    res_data = torch.stack(res_data,axis=1)
    return res_data

In [69]:
# 定义我们的模型
class TPSModel(nn.Module):
    def __init__(self,
                embedding_dim=16,
                hidden_units=[64,32,16],
                enc_dict=None,
                hidden_activations='relu',
                dropout_rates=0,
                loss_fun='torch.nn.BCELoss()'):
        super(TPSModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        self.enc_dict = enc_dict
        self.hidden_activations = hidden_activations
        self.dropout_rates = dropout_rates
        self.loss_fun = eval(loss_fun)
        
        self.embedding_layer = EmbeddingLayer(enc_dict=self.enc_dict, 
                                              embedding_dim=self.embedding_dim)
        
        self.dnn_input_dim = get_dnn_input_dim(enc_dict=self.enc_dict,
                                              embedding_dim=self.embedding_dim)
        
        self.dnn = MLP_Layer(input_dim=self.dnn_input_dim,
                            output_dim=1,
                            hidden_units=self.hidden_units,
                            hidden_activations=self.hidden_activations,
                            dropout_rates=self.dropout_rates)
        
    def forward(self,data):
        sparse_embedding = self.embedding_layer(data)
        sparse_embedding = torch.flatten(sparse_embedding, start_dim=1)

        dense_input = get_linear_input(enc_dict=self.enc_dict, data=data)
        dnn_input = torch.cat([sparse_embedding, dense_input], axis=1)

        y_pred = self.dnn(dnn_input).sigmoid()
        loss = self.loss_fun(y_pred.squeeze(-1), data['target'])
        output_dict = {'pred':y_pred, 'loss':loss}
        return output_dict

In [70]:
# 训练、验证、测试的pipeline
def train_model(model, train_loader, optimizer, device, metric_list=['roc_auc_score','log_loss']):
    model.train()
    pred_list = []
    label_list = []
    pbar = tqdm(train_loader)
    for data in pbar:

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']
        loss = output['loss']

        loss.backward()
        optimizer.step()
        model.zero_grad()

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['target'].squeeze(-1).cpu().detach().numpy())
        pbar.set_description("Loss {}".format(loss))

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list, pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list, pred_list)

    return res_dict


def valid_model(model, valid_loader, device, metric_list=['roc_auc_score','log_loss']):
    model.eval()
    pred_list = []
    label_list = []

    for data in (valid_loader):

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['target'].squeeze(-1).cpu().detach().numpy())

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list, pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list, pred_list)

    return res_dict


def test_model(model, test_loader, device):
    model.eval()
    pred_list = []

    for data in tqdm(test_loader):

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']
        pred_list.extend(pred.squeeze().cpu().detach().numpy())

    return np.array(pred_list)

In [71]:
# dataloader
train_loader = D.DataLoader(train_dataset, batch_size=config['batch'], shuffle=True, num_workers=0)
valid_loader = D.DataLoader(valid_dataset, batch_size=config['batch'], shuffle=False, num_workers=0)
test_loader = D.DataLoader(test_dataset, batch_size=config['batch'], shuffle=False, num_workers=0)

In [74]:
model = TPSModel(enc_dict=enc_dict)
device = set_device(config['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

model = model.to(device)

TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [75]:
# 训练
for i in range(config['epoch']):
    #模型训练
    train_metirc = train_model(model,train_loader,optimizer=optimizer,device=device)
    #模型验证
    valid_metric = valid_model(model,valid_loader,device)

    print("Train Metric:")
    print(train_metirc)
    print("Valid Metric:")
    print(valid_metric)

NameError: name 'model' is not defined

In [76]:
# 测试
y_pre = test_model(model, test_loader, device)

NameError: name 'model' is not defined

In [None]:
# 写入结果
