In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm.notebook import tqdm
from collections import Counter
import json
import math
import copy
import os
import pickle
import random

import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from sklearn.metrics import f1_score, roc_auc_score
# import wandb


## 参数

In [2]:
class CFG:
    # 数据文件
    train_files = [
        '../data/rec_data/train-dataset.csv',
        '../data/rec_data/newTrain-dataset.csv',
    ]
    test_file = '../data/rec_data/test-dataset.csv'
    recommend_content_entity_paths = ['/home/zyj/sohu/docker/data/rec_test/recommend_content_entity_0317_初赛.txt',
                                     '/home/zyj/sohu/docker/data/rec_test/recommend_content_entity_复赛_训练.txt',
                                     '/home/zyj/sohu/docker/data/rec_test/recommend_content_entity_复赛_测试.txt',]
    history_len = 15
    entity_len = 10
    his_entity_len = 20
    emtion_feature_path = './tmp/sentiment.dic'
    output_dir = './checkpoint/tmp/'
    # 模型参数
    heads = 4
    layers = 4
    dim = 128

    # 训练参数
    device=torch.device('cuda:0')
    epochs=3
    learning_rate = 1e-4
    batch_size=1024
    eval_epoch = 1
    apex = False
    seed=42 

    # scheduler参数
    scheduler='cosine'                   # ['linear', 'cosine'] # lr scheduler 类型
    last_epoch=-1                        # 从第 last_epoch +1 个epoch开始训练
    batch_scheduler=True                 # 是否每个step结束后更新 lr scheduler
    weight_decay=0.01
    num_warmup_steps = 0
    num_cycles=0.5                    # 如果使用 cosine lr scheduler， 该参数决定学习率曲线的形状，0.5代表半个cosine曲线

    # log参数
    log_step = 5
    wandb = False
    key_metrics = 'auc'


    
#=======设置全局seed保证结果可复现====
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

## 数据预处理

### 处理原始数据

In [3]:
def read_raw_data(path):
    print(f'read {path}')
    data = pd.read_csv(path)
    data['time'] = pd.to_datetime(data['logTs'],unit='ms',origin=pd.to_datetime('1970-01-01 08:00:00'))
    data['Hour'] = data['time'].dt.hour
    data['Min'] = data['time'].dt.hour*60+data['time'].dt.minute
    data['seq'] = data['userSeq'].str.split('[;:]').fillna(0)
    
    logTs_min = data.groupby('pvId')['logTs'].min()
    logTs_min = pd.DataFrame({"logTs_min": logTs_min}).reset_index()
    data = pd.merge(data, logTs_min, how='left', on='pvId')
    data['logTs_gap'] = (data['logTs'] - data['logTs_min'])/1000
    data['logTs_gap'] = data['logTs_gap'].astype(int)
    del data['logTs_min']
    del data['userSeq']
    # print(len(data), len(data.columns))
    return data

### 生成feat_mapper

In [4]:
# def gen_feat_mapper(feature):
#     feat_names = ['itemId', 'Hour', 'operator', 'browserType', 'deviceType', 'osType', 'province', 'city', 'logTs_gap']
#     ret = {}
#     for name in feat_names:
#         dic = {'unk':0}
#         idx = 1
#         for label in feature[name].unique():
#             dic[label] = idx
#             idx += 1
#         ret[name] = dic
#     entity_dic = {'unk':0}
#     entity_map = {}
#     entity_idx = 1
#     for path in CFG.recommend_content_entity_paths:
#         with open(path) as f:
#             for line in f:
#                 if len(line.strip()):
#                     js = json.loads(line)
#                     entity_map[int(js['id'])] = js['entity']
#                     for entity in js['entity']:
#                         if entity not in entity_dic:
#                             entity_dic[entity] = entity_idx
#                             entity_idx += 1
#     ret['entity'] = entity_dic

#     his_logTs_dic = {'unk':0}
#     for row in tqdm(feature[['seq', 'logTs']].itertuples()):
#         history = row.seq
#         if history ==0 or len(history)==0:
#             continue
#         else:
#             assert len(history)%2==0
#             his_logTs = []
#             for i in range(0,len(history),2):
#                 itemId = int(history[i])
#                 logTs = int(history[i+1])
#                 his_logTs.append(logTs)
#                 if itemId not in ret['itemId']:
#                     ret['itemId'][itemId] = len(ret['itemId'])
#             his_logTs = (row.logTs - np.array(his_logTs))//1000//3600 # 历史到当前query的时间间隔（小时数）
#             for each in his_logTs:
#                 each = int(each)
#                 if each not in his_logTs_dic:
#                     his_logTs_dic[each] = len(his_logTs_dic)
#     ret['his_logTs'] = his_logTs_dic

#     ret['itemId']['noHistory'] = len(ret['itemId']) # 无历史标签
    
#     ret['emotion'] = {'pad':0, -2:1, -1:2, 0:3, 1:4, 2:5}
        
#     for k, v in ret.items():
#         print(f'name: {k}\tlen: {len(v)}')
#     with open('./tmp/feat_mapper.pkl', 'wb') as f:
#         pickle.dump(ret, f)
#     print('save feat_mapper at ./tmp/feat_mapper.pkl')

In [5]:
# features = []
# for path in CFG.train_files + [CFG.test_file]:
#     features.append(read_raw_data(path))
# gen_feat_mapper(pd.concat(features))

### Dataset

In [6]:
class TrainDataset(Dataset):
    def __init__(self, feature:pd.DataFrame=None, feature_name:str=None):
        if feature is None:
            path = f'./tmp/{feature_name}.csv'
            assert os.path.exists(path)
            self.df = pd.read_csv(path)
        else:
            print(f'{feature_name} feature len = {len(feature)}')
            df = {}
            # 直接特征
            with open('./tmp/feat_mapper.pkl', 'rb') as f:
                feat_mapper = pickle.load(f)
            names = ['itemId', 'Hour', 'operator', 'browserType', 'deviceType', 'osType', 'province', 'city', 'logTs_gap']
            for name in names:
                df[name] = feature[name].map(feat_mapper[name], na_action=0).values
            
            emotionDic = {}
            with open(CFG.emtion_feature_path, 'r', encoding='utf-8') as f:
                for line in f:
                    arr = line.strip().split('\t')
                    if arr[0] == 'id': continue
                    emotionDic[int(arr[0])] = json.loads(arr[1])
            print(len(emotionDic))


            
            print('deal history feature...')
            entitys = []
            emotions = []
            history_itemId = []
            history_logTs = []
            history_entitys = []
            history_emotions = []
            for row in tqdm(feature[['itemId', 'seq', 'logTs']].itertuples()):
                # 实体和情感特征
                itemId= row.itemId
                entity_feature = [0] * CFG.entity_len
                emotion_feature = [0] * CFG.entity_len
                assert itemId in emotionDic
                entity_emotion_dic = emotionDic[itemId]
                # 情感强烈的实体排前面
                sorted_emotions = sorted([[x,y] for x,y in entity_emotion_dic.items()], key=lambda pair:-abs(pair[1]))
                for idx, (x,y) in enumerate(sorted_emotions[:CFG.entity_len]):
                    if x in feat_mapper['entity']:
                        entity_feature[idx] = feat_mapper['entity'][x] 
                    else:
                        entity_feature[idx] = feat_mapper['entity']['unk'] 
                    assert y in feat_mapper['emotion']
                    emotion_feature[idx] = feat_mapper['emotion'][y] 

                assert len(entity_feature) == CFG.entity_len
                assert len(emotion_feature) == CFG.entity_len
                entitys.append(entity_feature)
                emotions.append(emotion_feature)

                # 历史itmeId和logTs特征
                history = row.seq
                history_itemId_feature = [0] * CFG.history_len
                history_logTs_feature = [0] * CFG.history_len
                history_entitys_feature = [0] * CFG.his_entity_len
                history_emotions_feature = [0] * CFG.his_entity_len
                history_entity_idx = 0
                cmp = set(entity_feature)
                if history==0 or len(history)==0:
                    history_itemId.append(history_itemId_feature)
                    history_logTs.append(history_logTs_feature)
                    history_entitys.append(history_entitys_feature)
                    history_emotions.append(history_emotions_feature)
                    continue
                else:
                    assert len(history)%2==0
                    his_itemId = []
                    his_logTs = []
                    for i in range(0,len(history),2):
                        itemId = int(history[i])
                        his_itemId.append(itemId)
                        logTs = int(history[i+1])
                        his_logTs.append(logTs)
                    his_logTs = (row.logTs - np.array(his_logTs))//1000//3600 # 历史到当前query的时间间隔（小时数）
                    # logTs大的排前面（取最近的历史信息）
                    sortedHis = sorted([[x,y] for x,y in zip(his_itemId,his_logTs)], key=lambda pair:-pair[1])
                    for idx, (x,y) in enumerate(sortedHis[:CFG.history_len]):
                        if x in feat_mapper['itemId']:
                            history_itemId_feature[idx] = feat_mapper['itemId'][x] 
                        else:
                            history_itemId_feature[idx] = 0 
                        if y in feat_mapper['his_logTs']:
                            history_logTs_feature[idx] = feat_mapper['his_logTs'][y] 
                        else:
                            history_logTs_feature[idx] = 0
                        if history_entity_idx < CFG.his_entity_len and x in emotionDic:
                            for entity, emotion in list(emotionDic[x].items()):
                                if entity in feat_mapper['entity'] and feat_mapper['entity'][entity] in cmp:
                                    history_entitys_feature[history_entity_idx] = feat_mapper['entity'][entity] 
                                    history_emotions_feature[history_entity_idx] = feat_mapper['emotion'][emotion] 
                                    history_entity_idx += 1
                                    if history_entity_idx >= CFG.his_entity_len:
                                        break

                assert len(history_logTs_feature) == CFG.history_len
                assert len(history_itemId_feature) == CFG.history_len
                history_itemId.append(history_itemId_feature)
                history_logTs.append(history_logTs_feature)
                history_entitys.append(history_entitys_feature)
                history_emotions.append(history_emotions_feature)
            df['history_itemId'] = history_itemId
            df['history_logTs'] = history_logTs
            df['entitys'] = entitys
            df['emotions'] = emotions
            df['history_entitys'] = history_entitys
            df['history_emotions'] = history_emotions

            if 'label' in feature.columns:
                df['label'] = feature['label'].values
            df = pd.DataFrame(df)
            print(df.columns)
            df.to_csv(f'./tmp/{feature_name}.csv', index=False)
            print(f'save at ./tmp/{feature_name}.csv')

            
            self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        feature = self.df.iloc[item].to_dict()
        for k,v in feature.items():
            if isinstance(feature[k], str):
                feature[k] = json.loads(v)
        return feature


## Model

### Transformer

In [7]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=None, dropout = 0.1):
        super().__init__() 
        if d_ff == None:
            d_ff = d_model * 4
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    
    def forward(self, x):
        x = self.dropout(F.gelu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

class SelfAttentionLayer(nn.Module):
    def __init__(self, d_model, heads, hidden_dim=None, dropout=0.1):
        super().__init__()

        # MultiheadAttention layer
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
        self.attn = nn.MultiheadAttention(d_model, heads, dropout=dropout)

        # FeedForward layer
        self.ff = FeedForward(d_model, hidden_dim, dropout=dropout)

        self.apply(self.init_weight)

    def init_weight(self, module):
        if isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)
    
    def forward(self, x):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,need_weights=False)[0])
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x

### AutoregressiveModel

In [8]:
class AutoregressiveModel(nn.Module):
    def __init__(self, dim, heads, layer_num, device, hidden_dim=None, dropout=0.1):
        super().__init__()
        self.simple_feature = ['itemId', 'logTs_gap', 'Hour', 'operator', 'browserType', 
                                'deviceType', 'osType', 'province', 'city']

        with open('./tmp/feat_mapper.pkl', 'rb') as f:
            feat_mapper = pickle.load(f)
            # for each in feat_mapper:
            #     print(each, len(feat_mapper[each]))
        self.feat_emb = {
            name:nn.Embedding(len(dic), dim).to(device)
            for name, dic in feat_mapper.items()
        }
        
        self.device = device

        # # Transformers
        # self.history_itemId_transformer = nn.Sequential(*[SelfAttentionLayer(dim, heads//2, hidden_dim, dropout)
        #                                 for i in range(layer_num//2)])
        # self.history_logTs_transformer = nn.Sequential(*[SelfAttentionLayer(dim, heads//2, hidden_dim, dropout)
        #                                 for i in range(layer_num//2)])
        # self.entity_transformer = nn.Sequential(*[SelfAttentionLayer(dim, heads//2, hidden_dim, dropout)
        #                                 for i in range(layer_num//2)])
        # self.emotion_transformer = nn.Sequential(*[SelfAttentionLayer(dim, heads//2, hidden_dim, dropout)
        #                                 for i in range(layer_num//2)])
                                        
        self.feature_transformer = nn.Sequential(*[SelfAttentionLayer(dim, heads, hidden_dim, dropout)
                                        for i in range(layer_num)])

        # # linears
        # self.history_linear = nn.Linear(dim*CFG.history_len, dim)
        # self.drop1 = nn.Dropout(dropout)

        # self.entity_linear = nn.Linear(dim*CFG.entity_len, dim)
        # self.drop2 = nn.Dropout(dropout)

        # classify the whole feature
        feat_num = len(self.simple_feature) + CFG.entity_len + CFG.history_len + CFG.his_entity_len
        self.linear = nn.Linear(dim*feat_num, 2)

        # 随机初始化权重
        self.apply(self.init_weight)
        for emb in self.feat_emb.values():
            torch.nn.init.normal_(emb.weight, mean=0.0, std=0.02)

        self.loss = nn.CrossEntropyLoss()

    def init_weight(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)


    def gen_seq_input(self, batch:dict, batch_name:str, mapper_name:str):
        inputs = []
        for input in batch[batch_name]: # [seq_len, batch_size]
            feature_ids = torch.LongTensor(input).to(self.device)
            inputs.append(self.feat_emb[mapper_name](feature_ids))
        # return torch.stack(inputs) # [seq_len, batch, dim]
        return inputs # [seq_len, batch, dim]


    def forward(self, batch:dict):
        # batch_size = batch['itemId'].size()
        feature_seq = []
        # 普通特征
        for name in self.simple_feature:
            feature_ids = torch.LongTensor(batch[name]).to(self.device)
            feature_seq.append(self.feat_emb[name](feature_ids)) # [batch, dim]
        # 当前文章实体+情感特征
        entitys_seq = self.gen_seq_input(batch, 'entitys', 'entity')
        emotion_seq = self.gen_seq_input(batch, 'emotions', 'emotion')
        for a,b in zip(entitys_seq, emotion_seq): # CFG.entity_len
            feature_seq.append((a+b)/2)
        # 历史文章+时间特征
        his_itemId_seq = self.gen_seq_input(batch, 'history_itemId', 'itemId')
        his_logTs_seq = self.gen_seq_input(batch, 'history_logTs', 'his_logTs')
        for a,b in zip(his_itemId_seq, his_logTs_seq): # CFG.history_len
            feature_seq.append((a+b)/2)
        # 历史文章实体+情感特征
        his_entitys_seq = self.gen_seq_input(batch, 'history_entitys', 'entity')
        his_emotions_seq = self.gen_seq_input(batch, 'history_emotions', 'emotion')
        for a,b in zip(his_entitys_seq, his_emotions_seq): # CFG.history_len
            feature_seq.append((a+b)/2)

        # entitys_feature = self.entity_transformer(entitys_seq) # [seq_len, batch_size, dim]
        # emotion_feature = self.emotion_transformer(emotion_seq) # [seq_len, batch_size, dim]
        # # [batch_size, seq*dim]
        # et_em_feature = entitys_feature.permute(1,0,2).flatten(-2) + \
        #                     emotion_feature.permute(1,0,2).flatten(-2)
        # feature_seq.append(self.drop2(self.entity_linear(et_em_feature)))
        
        # his_itemId_feature = self.history_itemId_transformer(his_itemId_seq) # [seq_len, batch_size, dim]
        
        # his_logTs_feature = self.history_logTs_transformer(his_logTs_seq) # [seq_len, batch_size, dim]
        # his_feature = his_itemId_feature.permute(1,0,2).flatten(-2) + \
        #                     his_logTs_feature.permute(1,0,2).flatten(-2)
        # feature_seq.append(self.drop1(self.history_linear(his_feature)))

        final_feature = self.feature_transformer(torch.stack(feature_seq)) # [feat_num, batch_size, dim]
        out = self.linear(final_feature.permute(1,0,2).flatten(-2)) # [batch_size, 2]

        if 'label' in batch:
            label = torch.LongTensor(batch['label']).to(self.device)
            loss = self.loss(out, label)
            return loss, out
        else:
            return out

## eval

In [9]:
def get_metrics(preds, logits, labels): # pre
    metrics = {}
    metrics['f1'] = f1_score(preds, labels, average='macro')
    metrics['auc'] = roc_auc_score(labels, logits[:,1])
    return metrics

def evaluate(model, valid_dataloader, device):
    model.eval()
    labels = []
    preds = []
    logits = []
    tk0 = tqdm(enumerate(valid_dataloader),total=len(valid_dataloader))
    total_loss = 0
    for step, batch in tk0:            
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                loss, batch_logits = model(batch)
        total_loss += loss.item()
        labels.append(batch['label'].detach().cpu())
        preds.append(batch_logits.argmax(-1).detach().cpu())
        logits.append(F.softmax(batch_logits, dim=-1).detach().cpu()) # softmax
        
    metrics = get_metrics(preds=torch.cat(preds).numpy(),
                          logits=torch.cat(logits).numpy(),
                          labels=torch.cat(labels).numpy())
    metrics['eval_loss'] = total_loss / len(valid_dataloader)
    return metrics


## train loop

In [10]:
def train_eval(model, train_dataloader, valid_dataloader, save_path):
    device = CFG.device
    best_score = 0
    total_step = 0
    model.to(device)
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    if not len(train_dataloader):
        raise EOFError("Empty train_dataloader.")

    # 过滤掉冻结的权重
    param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # 设置权重decay
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
    
    num_train_steps = int(len(train_dataloader) * CFG.epochs)
    if CFG.scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
                    optimizer, 
                    num_warmup_steps=CFG.num_warmup_steps, 
                    num_training_steps=num_train_steps, 
                    num_cycles=CFG.num_cycles, 
#                     last_epoch = ((CFG.last_epoch+1)/CFG.epochs)*num_train_steps
                )
    else:
        scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=num_train_steps
            )

    metrics = evaluate(model, valid_dataloader, device)
    print(f"eval at begin metrics = ")
    pprint(metrics)
    if CFG.wandb:
        wandb.log(metrics, step=total_step)

    for cur_epc in range(int(CFG.epochs)):
        training_loss = 0
        print("Epoch: {}".format(cur_epc))
        model.train()
        tk0 = tqdm(enumerate(train_dataloader),total=len(train_dataloader))
        for step, batch in tk0:
            total_step += 1
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                loss, logits = model(batch)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
            training_loss += loss.item()
            tk0.set_postfix(Epoch=cur_epc, Loss=training_loss/(step+1))
            if CFG.wandb and (step + 1) % CFG.log_step == 0:
                wandb.log({'train_loss':loss, 'lr':optimizer.param_groups[0]["lr"], 'epoch': cur_epc},
                          step=total_step)
        if cur_epc % CFG.eval_epoch == 0:
            metrics = evaluate(model, valid_dataloader, device)
            print(f"eval at epoch {cur_epc} metrics = ")
            pprint(metrics)
            if CFG.wandb:
                wandb.log(metrics, step=total_step)
            if cur_epc > 0 and metrics[CFG.key_metrics] >= best_score:
                best_score = metrics[CFG.key_metrics]
                # model_save_path = os.path.join(save_path,f'epoch{cur_epc}.pt') # 保留所有checkpoint
                model_save_path = os.path.join(save_path,f'best_checkpoint.pt') # 保留最优checkpoint
                torch.save(model, model_save_path)
                print(f'save at {model_save_path}')
        
    torch.cuda.empty_cache()     

## 读取数据

### 预处理

In [11]:
# train_df = []
# for path in CFG.train_files:
#     train_df.append(read_raw_data(path))
# train_df = pd.concat(train_df)
# # 划分训练测试集
# pvId_list = list(set(train_df['pvId']))
# random.shuffle(pvId_list)
# length = len(pvId_list)
# train_pvId = pvId_list[:int(length*0.95)]
# valid_pvId = pvId_list[int(length*0.95):]
# train = train_df.loc[train_df['pvId'].isin(train_pvId)]
# valid = train_df.loc[train_df['pvId'].isin(valid_pvId)]
# test = read_raw_data(CFG.test_file)

# # 预处理
# train_dataset = TrainDataset(train, 'train')
# valid_dataset = TrainDataset(valid, 'valid')
# test_dataset = TrainDataset(test, 'test')

## 主程序

In [12]:
if __name__ == '__main__':
    seed_everything(seed=42)
    if not os.path.exists(CFG.output_dir):
        os.makedirs(CFG.output_dir)
    with open(os.path.join(CFG.output_dir, 'config.txt'), 'w') as f:
        for k,v in CFG.__dict__.items():
            f.write(f'{k}: {v}\n')

    # 加载数据
    train_dataset = TrainDataset(feature_name='train')
    valid_dataset = TrainDataset(feature_name='valid')
    test_dataset = TrainDataset(feature_name='test')
    train_dataloader = DataLoader(train_dataset, batch_size=CFG.batch_size)
    valid_dataloader = DataLoader(valid_dataset, batch_size=CFG.batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size)

    # 加载模型
    model = AutoregressiveModel(CFG.dim, CFG.heads, CFG.layers, CFG.device)    
    if CFG.wandb:
        wandb.init(project='sohu-2022-Transformer推荐', 
                   name=f'dim{CFG.dim}-head{CFG.heads}-layer{CFG.layers}')
    
    # 训练
    train_eval(model, train_dataloader, valid_dataloader, CFG.output_dir)




  0%|          | 0/275 [00:00<?, ?it/s]

eval at begin metrics = 
{'auc': 0.48506335314483695,
 'eval_loss': 0.5589232290874828,
 'f1': 0.4649077099712953}
Epoch: 0


  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/275 [00:00<?, ?it/s]

eval at epoch 0 metrics = 
{'auc': 0.6171822044691363,
 'eval_loss': 0.5225819716670297,
 'f1': 0.4481230365087934}
Epoch: 1


  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/275 [00:00<?, ?it/s]

eval at epoch 1 metrics = 
{'auc': 0.6225705959561388,
 'eval_loss': 0.5110003240541978,
 'f1': 0.4496319671878873}
save at ./checkpoint/tmp/best_checkpoint.pt
Epoch: 2


  0%|          | 0/5227 [00:00<?, ?it/s]