In [1]:
import paddle
from paddle import nn

import json
import pandas as pd
import numpy as np
from scipy.special import erfinv 
import warnings
warnings.filterwarnings("ignore")

  from collections import Callable
  'nearest': Image.NEAREST,
  'bilinear': Image.BILINEAR,
  'bicubic': Image.BICUBIC,
  'box': Image.BOX,
  'lanczos': Image.LANCZOS,
  'hamming': Image.HAMMING


In [2]:
# Load the training set. get_df later iterates it as a mapping of
# architecture id -> record holding an 'arch' string and the rank labels.
with open('./data/CVPR_2022_NAS_Track2_train.json', 'r') as f:
    train_data = json.load(f)

In [3]:
# Names of the eight rank labels read from every record by get_df.
target_cols = ['cplfw_rank', 'market1501_rank', 'dukemtmc_rank', 'msmt17_rank', 'veri_rank', 'vehicleid_rank', 'veriwild_rank', 'sop_rank']

def get_df(train_data):
    """Flatten the architecture mapping into one DataFrame row per entry.

    Every character of a record's ``arch`` string becomes its own ``col{i}``
    column, followed by the labels named in the module-level ``target_cols``,
    the mapping key (``id``) and the untouched ``arch`` string.  ``col0`` holds
    a letter code ('l', 'j' or 'k') that is mapped to 1/2/3, after which every
    column except ``id`` and ``arch`` is cast to float.

    Args:
        train_data: mapping of id -> record with an ``arch`` string and one
            entry per name in ``target_cols``.

    Returns:
        pandas.DataFrame with columns ``col0..colN-1``, ``target_cols``,
        ``id`` and ``arch``.
    """
    records = []
    for arch_id, entry in train_data.items():
        arch_chars = list(entry['arch'])
        labels = [entry[name] for name in target_cols]
        records.append(arch_chars + labels + [arch_id, entry['arch']])

    # The number of feature columns comes from the last record visited, so all
    # arch strings are expected to share one length.
    feature_names = [f'col{pos}' for pos in range(len(arch_chars))]
    frame = pd.DataFrame(records, columns=feature_names + target_cols + ['id', 'arch'])

    frame['col0'] = frame['col0'].map({'l': 1, 'j': 2, 'k': 3})
    numeric_cols = [name for name in frame.columns if name not in ('id', 'arch')]
    frame[numeric_cols] = frame[numeric_cols].astype(float)
    return frame

train_df = get_df(train_data)

# Architecture-encoding features are the columns whose name starts with 'col'.
base_cols = [x for x in train_df.columns if x[:3]=='col']
len(base_cols)

37

In [4]:
# erfinv trans
# Rank-to-Gaussian target transform: every label is rescaled to a fraction u
# and passed through sqrt(2) * erfinv(2u - 1), which is the standard normal
# quantile function.  Results are stored in new '<target>_trans_y' columns;
# the original label columns are left untouched.
for c in target_cols:
    train_y = train_df[c]
    # The +1 offsets keep u strictly inside (0, 1) for non-negative labels,
    # so erfinv never returns +/-inf at the smallest or largest rank.
    mmin = np.min(train_y)+1
    mmax = np.max(train_y)+1
    train_y = np.sqrt(2) * erfinv(2 * (train_y+mmin)/(mmin+mmax)-1)
    train_df[c+'_trans_y'] = train_y

In [5]:
def train_ohe(train_df, use_cols):
    """Add 0/1 indicator columns to ``train_df`` in place.

    For each column in ``use_cols`` and each distinct value found in it (taken
    in ascending order), a column named ``ohe_{column}_{int(value)}`` is
    written that is 1 where the row equals that value and 0 elsewhere.

    Args:
        train_df: DataFrame to extend; it is modified, nothing is returned.
        use_cols: names of the columns to encode.
    """
    for col in use_cols:
        levels = list(train_df[col].unique())
        levels.sort()
        for level in levels:
            indicator = np.where(train_df[col] == level, 1, 0)
            train_df[f'ohe_{col}_{int(level)}'] = indicator

In [6]:
train_ohe(train_df, base_cols)

In [7]:
len(base_cols)

37

In [8]:
# Collect the indicator columns written by train_ohe (names containing 'ohe').
ohe_cols = [x for x in train_df.columns if 'ohe' in x]

In [9]:
len(ohe_cols)

93

In [10]:
from sklearn.model_selection import KFold
from tqdm import tqdm

# Dataset definition
class MyDataset(paddle.io.Dataset):
    """Row-wise dataset over a DataFrame, yielding dicts of float32 tensors.

    With ``is_val == 0`` every item carries both ``"input"`` (the ``use_cols``
    values of the row) and ``"y"`` (the ``target_cols`` values); with any other
    ``is_val`` only ``"input"`` is produced and the target columns are never
    read.  ``show == 1`` prints the array shapes once at construction.
    """

    def __init__(self, df, use_cols, target_cols, show=0, is_val=0):
        self.df = df
        self.show = show
        self.use_cols = use_cols
        self.target_cols = target_cols
        self.is_val = is_val

        self.prepare_data()

    def __len__(self):
        return len(self.df)

    def prepare_data(self):
        """Materialise the feature (and, if labelled, target) arrays."""
        labelled = self.is_val == 0

        self.inputs = self.df[self.use_cols].values
        if labelled:
            self.y = self.df[self.target_cols].values

        if self.show != 1:
            return
        print('inputs_shape', self.inputs.shape)
        if labelled:
            print('y_shape', self.y.shape)

    def __getitem__(self, idx):
        sample = {"input": paddle.to_tensor(self.inputs[idx], dtype='float32')}
        if self.is_val == 0:
            sample["y"] = paddle.to_tensor(self.y[idx], dtype='float32')
        return sample

In [11]:
# Model inputs: the one-hot indicator columns only.
use_cols = ohe_cols

In [12]:
len(use_cols)

93

In [13]:
# Point target_cols at the transformed label columns for training.
# NOTE: this rebinds the global from its own previous value, so running the
# cell a second time appends the '_trans_y' suffix twice.
target_cols = [x+'_trans_y' for x in target_cols]
target_cols

['cplfw_rank_trans_y',
 'market1501_rank_trans_y',
 'dukemtmc_rank_trans_y',
 'msmt17_rank_trans_y',
 'veri_rank_trans_y',
 'vehicleid_rank_trans_y',
 'veriwild_rank_trans_y',
 'sop_rank_trans_y']

In [14]:
import gc
import os
import time
import random
import numpy as np
import paddle
from paddle import nn
from paddle.io import DataLoader

def seed_everything(seed):
    """Seed Python's ``random``, NumPy and Paddle with the same value.

    Also exports ``PYTHONHASHSEED`` (as a string) into the process environment.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (np.random.seed, paddle.seed):
        seeder(seed)
    
def count_parameters(model, all=False):
    """Return the number of scalar weights in ``model`` as a plain ``int``.

    The count is derived from each parameter's ``shape`` rather than from
    ``numel()``: summing ``numel()`` results yields a framework tensor, which
    is what made the training log print ``Tensor(shape=[1], ...)`` instead of
    a number.

    Args:
        model: object exposing ``parameters()``; every parameter must provide
            ``shape`` and ``stop_gradient``.
        all: when True count every parameter; otherwise only those whose
            ``stop_gradient`` is False (i.e. the trainable ones).

    Returns:
        int: total number of elements over the selected parameters.
    """
    params = model.parameters()
    if not all:
        params = [p for p in params if not p.stop_gradient]
    # np.prod of an empty shape is 1, which is the right size for a scalar.
    return sum(int(np.prod(p.shape)) for p in params)


def save_model_weights(model, modelpath, filename):
    """Save ``model``'s state dict and return a log line naming the file.

    The destination is the plain concatenation ``modelpath + filename``, so
    ``modelpath`` must already end with a path separator.
    """
    destination = modelpath + filename
    paddle.save(model.state_dict(), destination)
    return f"\n -> Save weights to {destination}\n"

from paddle.optimizer.lr import LRScheduler
# Custom warmup schedule: ramp up linearly to lr during warmup, then decay linearly
class LinearWarmup(LRScheduler):
    """Learning-rate schedule: linear warmup followed by linear decay.

    With ``t`` being the scheduler's step counter (``last_epoch``):

    * ``t < warmup_steps``: the rate rises linearly from ``start_lr`` to
      ``learning_rate``;
    * afterwards it falls linearly from ``learning_rate`` back to
      ``start_lr``, which is reached at ``t == num_training_steps`` and held
      from then on.

    Args:
        learning_rate: peak rate reached at the end of the warmup.
        warmup_steps (int): number of warmup steps; must be positive.
        num_training_steps (int): step at which the decay reaches ``start_lr``.
        end_lr: not used by the schedule. The peak is always ``learning_rate``;
            the argument is kept only so existing calls keep working.
        start_lr: rate at step 0 and at the end of the decay.
        last_epoch, verbose: forwarded to ``LRScheduler.__init__``.

    NOTE(review): an ``LRScheduler`` instance passes the type check and is
    handled by ``state_dict``/``set_state_dict``, but ``get_lr`` does arithmetic
    on ``learning_rate`` directly, so presumably only numeric rates are really
    supported — confirm before passing a scheduler.
    """

    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 num_training_steps,
                 end_lr=0.1,
                 start_lr=0.,
                 last_epoch=-1,
                 verbose=False):
        type_check = isinstance(learning_rate, (float, int, LRScheduler))
        if not type_check:
            # Report the offending type (the message used to show the value).
            raise TypeError(
                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".
                format(type(learning_rate)))
        self.learning_rate = learning_rate
        assert warmup_steps > 0 and isinstance(
            warmup_steps, int), " 'warmup_steps' must be a positive integer."
        self.warmup_steps = warmup_steps
        self.start_lr = start_lr
        self.end_lr = learning_rate
        # Validate the peak that is actually used, not the ignored `end_lr`
        # argument (whose default made the old check pass unconditionally).
        if not isinstance(learning_rate, LRScheduler):
            assert self.end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
                self.end_lr, start_lr)
        # Must be set before the base constructor runs: it performs an initial
        # step() and therefore calls get_lr(), which reads this attribute once
        # last_epoch is past the warmup.
        self.num_training_steps = num_training_steps
        super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose)

    def state_dict(self):
        """
        Returns the state of the LinearWarmup scheduler as a :class:`dict`.
        It is a subset of ``self.__dict__`` .
        """
        state_dict = super(LinearWarmup, self).state_dict()
        if isinstance(self.learning_rate, LRScheduler):
            state_dict["LinearWarmup_LR"] = self.learning_rate.state_dict()
        return state_dict

    def set_state_dict(self, state_dict):
        """
        Loads state_dict for LinearWarmup scheduler.
        """
        super(LinearWarmup, self).set_state_dict(state_dict)
        if isinstance(self.learning_rate, LRScheduler):
            self.learning_rate.set_state_dict(state_dict["LinearWarmup_LR"])

    def get_lr(self):
        """Return the learning rate for the current ``last_epoch``."""
        span = self.end_lr - self.start_lr
        if self.last_epoch < self.warmup_steps:
            return span * float(self.last_epoch) / float(self.warmup_steps) + self.start_lr

        # Fraction of the decay phase still ahead, clamped at 0 once training
        # runs past num_training_steps.
        decay_steps = max(1, self.num_training_steps - self.warmup_steps)
        remaining = max(
            0.0, float(self.num_training_steps - self.last_epoch) / float(decay_steps))
        # `+ start_lr` keeps the curve continuous at the end of warmup when
        # start_lr is non-zero; with the default start_lr=0 nothing changes.
        return span * remaining + self.start_lr

In [15]:
import scipy

def compute_metric(pred, y):
    """Kendall's tau between predictions and targets, one value per column.

    Column ``i`` of ``pred`` is compared with column ``i`` of ``y`` for every
    column of ``pred``.  The previous version only handled exactly 8 columns
    correctly: 3-7 columns raised IndexError, more than 8 were silently
    truncated, and 2 columns were flattened into a single correlation.

    Args:
        pred: array-like of shape (n_samples, n_targets) or (n_samples,).
        y: array-like with the same layout as ``pred``.

    Returns:
        numpy.ndarray of shape (n_targets,) holding the tau statistics.
    """
    # Public entry point; `scipy.stats.stats` is a deprecated private namespace.
    from scipy.stats import kendalltau

    pred = np.asarray(pred)
    y = np.asarray(y)
    # Treat a flat vector as a single target column.
    if pred.ndim == 1:
        pred = pred[:, None]
    if y.ndim == 1:
        y = y[:, None]

    corr = [kendalltau(pred[:, i], y[:, i])[0] for i in range(pred.shape[1])]
    return np.array(corr)

In [16]:
def train(model, 
        train_dataset, 
        val_dataset, 
        verbose=20, 
        fold_=0,
        modelname='MLP_base',
        modelpath=r'./model'+'//',
        input='input',
        y='y',
        early_stop_round=60,
        debug=False):
    """Fit ``model`` on one fold and keep the weights of the best epoch.

    After every epoch the validation predictions are scored with
    ``compute_metric`` (Kendall's tau); whenever the mean score improves, the
    weights are saved via ``save_model_weights``.  Training stops early once
    ``early_stop_round`` epochs pass without improvement.

    Reads the notebook-level settings ``BATCH_SIZE``, ``BATCH_SIZE_TEST``,
    ``EPOCHS``, ``LR``, ``optim``, ``loss_fct`` and ``target_cols``.

    Args:
        model: network to train; updated in place.
        train_dataset: dataset whose items are dicts with the ``input`` and
            ``y`` keys named below.
        val_dataset: same item layout; its ``df`` attribute must contain the
            columns in ``target_cols``.
        verbose: print a progress line every ``verbose`` epochs.
        fold_: fold index, used in the checkpoint file name.
        modelname: checkpoint name prefix.
        modelpath: checkpoint directory prefix (concatenated, not joined).
        input: batch key that is fed to the model.
        y: batch key of the regression target.  ``'y'`` scores all of
            ``target_cols``; any other value is parsed as ``'y<i>'`` and only
            ``target_cols[i]`` is scored.
        early_stop_round: patience, in epochs.
        debug: unused.

    Returns:
        tuple ``(bst_preds, bst_list)``: validation predictions of the best
        epoch and its score — the per-target array when ``y == 'y'``,
        otherwise the scalar mean.

    Raises:
        ValueError: if the training set yields no full batch.
        RuntimeError: if no epoch produced a comparable validation score
            (e.g. ``EPOCHS`` is 0 or every score was NaN).
    """
    print(f'Model parameters count: {count_parameters(model)}')
    #data loader
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        drop_last=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE_TEST,
        shuffle=False,
        drop_last=False,
    )
    print(f'train batch num: {len(train_loader)}')
    print(f'val batch num: {len(val_loader)}')
    if len(train_loader) == 0:
        # drop_last=True discards a trailing partial batch, so a training set
        # smaller than BATCH_SIZE would silently train on nothing.
        raise ValueError(
            f'training set yields no full batch of size {BATCH_SIZE}')

    # Scheduler
    # At least one warmup step: LinearWarmup rejects warmup_steps == 0.
    num_warmup_steps = max(1, int(0.1 * EPOCHS * len(train_loader)))
    num_training_steps = int(EPOCHS * len(train_loader))
    
    scheduler = LinearWarmup(
        learning_rate=LR,
        warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    # Optimizer
    optimizer = getattr(paddle.optimizer, optim)(learning_rate=scheduler, parameters=model.parameters())
    print(f'optim: {optim}, lr: {LR}, warmup_steps: {num_warmup_steps}, training steps: {num_training_steps}')
    
    print(f'early stopping round: {early_stop_round}\n')

    # Ground truth for scoring; it does not change between epochs.
    if y=='y':
        val_targets = val_dataset.df[target_cols].values
    else:
        val_targets = val_dataset.df[[target_cols[int(y.replace('y',''))]]].values

    #train
    bst_epoch=0
    # Kendall's tau can be negative, so start below any reachable score; the
    # best-epoch trackers are pre-bound so the summary below cannot hit an
    # unbound name.
    score_best=-np.inf
    bst, bst_list, bst_preds = '', None, None
    first_epoch_eval=0
    for epoch in range(EPOCHS):
        if epoch > early_stop_round and (epoch - bst_epoch > early_stop_round):
            print(f'early stopping.')
            break
        
        model.train()
        optimizer.clear_grad()
        start_time = time.time()

        avg_loss = 0
        for data in train_loader:
            pred = model(data[input])

            loss = loss_fct(
                pred,
                data[y]
            ).mean()

            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            optimizer.step()
            # The schedule is stepped once per batch, not once per epoch.
            scheduler.step()

            optimizer.clear_grad()

        #VAL
        model.eval()
        avg_val_loss = 0
        preds = []

        with paddle.no_grad():
            for data in val_loader:
                pred = model(data[input])

                loss = loss_fct(
                    pred,
                    data[y]
                ).mean()

                avg_val_loss += loss.item() / len(val_loader)

                preds.append(pred.numpy())

        preds = np.concatenate(preds, 0)
        corr_per_target = compute_metric(preds, val_targets)
        val_corr = corr_per_target.mean()
        
        elapsed_time = time.time() - start_time
        if (epoch + 1) % verbose == 0:
            # Estimate for the whole reporting window from the latest epoch.
            elapsed_time = elapsed_time * verbose
            lr = scheduler.get_lr()
            
            print(
                f"Epoch {epoch + 1:02d}/{ EPOCHS:02d} \t lr={lr:.1e}\t t={elapsed_time:.0f}s \t"
                f"loss={avg_loss:.4f}",
                end="\t",
            )

            if (epoch + 1 >= first_epoch_eval) or (epoch + 1 == EPOCHS):
                print(f"val_loss={avg_val_loss:.4f}\tcorr={val_corr:.4f}")
            else:
                print("")
                
        # Keep the best model (a NaN score never compares as better).
        if val_corr>score_best:
            bst = save_model_weights(model, modelpath, f'{modelname}_{fold_}.pt')
            score_best = val_corr
            bst_epoch = epoch
            # Per-target scores for the joint model, scalar for a single target.
            bst_list = corr_per_target if y=='y' else val_corr
            bst_preds = preds

    if bst_preds is None:
        raise RuntimeError(
            'no epoch produced a valid validation score; nothing was saved')

    print(f'best score {score_best}, best epoch: {bst_epoch}, {bst} ' )
    print(np.mean(bst_list),bst_list,'\n\n')
    del (val_loader, train_loader, loss, data, pred)
    gc.collect()
    paddle.device.cuda.empty_cache()
    
    return bst_preds, bst_list

In [17]:
class CVPRModel(nn.Layer):
    """Single linear layer mapping the input features to ``num_classes`` outputs."""

    def __init__(self, input_dim=3, num_classes=8):
        super().__init__()
        # The attribute name is part of the saved state-dict keys.
        self.SuperLinear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        return self.SuperLinear(x)

In [18]:
# Training configuration; train() reads BATCH_SIZE, BATCH_SIZE_TEST, EPOCHS,
# LR, optim and loss_fct as globals.
seed = 666
BATCH_SIZE = 400
BATCH_SIZE_TEST = 128
EPOCHS = 300
LR = 0.1
optim = "Adam"
paddle.set_device("gpu")
# Checkpoint prefix; it is concatenated with the file name, hence the trailing slash.
modelpath = r'./model/'

# K-fold CV: all 8 transformed targets are trained jointly on the one-hot features.
loss_fct = nn.MSELoss()
k=5
scoref = []
# shuffle=False: folds are contiguous slices of train_df in its current row order.
skf = KFold(n_splits=k, shuffle=False)
for index, (train_index, test_index) in enumerate(skf.split(train_df)):   
    print(f'FOLD {index}')
    train0 = train_df.iloc[train_index]
    val0 = train_df.iloc[test_index]   
    train_dataset = MyDataset(train0, use_cols, target_cols)
    val_dataset = MyDataset(val0, use_cols, target_cols)
    print(f'train size: {len(train0)}, val size: {len(val0)}')

    modelname = f'paddle_superlinear_ranker'
    # Re-seed before building the model so each fold starts from the same RNG state.
    seed_everything(seed)
    model = CVPRModel(input_dim=len(use_cols),
                    num_classes=8,
                   )
    # `_` receives the per-target Kendall tau of the fold's best epoch.
    preds,_ = train(model, 
                train_dataset, 
                val_dataset, 
                verbose=20, 
                fold_=index,
                modelname=modelname,
                modelpath=modelpath,
                input='input',
                y='y',
                early_stop_round=60,
                debug=False
                 )
    scoref.append(_)
scoreff = scoref
# Overall mean score, followed by the per-target mean across folds.
np.round(np.array(scoreff).mean(1).mean(), 5), [round(x, 5) for x in np.array(scoreff).mean(0)]

FOLD 0
train size: 400, val size: 100
Model parameters count: Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=False,
       [752])
train batch num: 1
val batch num: 1
optim: Adam, lr: 0.1, warmup_steps: 30, training steps: 300
early stopping round: 60

Epoch 20/300 	 lr=6.7e-02	 t=1s 	loss=0.3333	val_loss=0.4005	corr=0.6499
Epoch 40/300 	 lr=9.6e-02	 t=1s 	loss=0.1816	val_loss=0.2423	corr=0.7374
Epoch 60/300 	 lr=8.9e-02	 t=1s 	loss=0.1364	val_loss=0.1754	corr=0.7774
Epoch 80/300 	 lr=8.1e-02	 t=1s 	loss=0.1311	val_loss=0.1725	corr=0.7855
Epoch 100/300 	 lr=7.4e-02	 t=1s 	loss=0.1305	val_loss=0.1709	corr=0.7886
Epoch 120/300 	 lr=6.7e-02	 t=1s 	loss=0.1304	val_loss=0.1707	corr=0.7878
Epoch 140/300 	 lr=5.9e-02	 t=1s 	loss=0.1304	val_loss=0.1706	corr=0.7885
Epoch 160/300 	 lr=5.2e-02	 t=1s 	loss=0.1304	val_loss=0.1706	corr=0.7881
early stopping.
best score 0.788888888888889, best epoch: 106, 
 -> Save weights to ./model/paddle_superlinear_ranker_0.pt
 
0.788888888888889

(0.78087,
 [0.26432, 0.8678, 0.89648, 0.96024, 0.89228, 0.66602, 0.91327, 0.78651])

In [19]:
np.round(np.array(scoreff).mean(1).mean(), 5), [round(x, 5) for x in np.array(scoreff).mean(0)]

(0.78087,
 [0.26432, 0.8678, 0.89648, 0.96024, 0.89228, 0.66602, 0.91327, 0.78651])

Load test data
______________________________________________________________________________________________________

In [20]:
with open('./data/CVPR_2022_NAS_Track2_test.json', 'r') as f:
    test_data = json.load(f)
    
# Restore the raw label names: In [13] rebound target_cols to the '_trans_y'
# names, but get_df reads the untransformed keys from each record.
target_cols = ['cplfw_rank', 'market1501_rank', 'dukemtmc_rank', 'msmt17_rank', 'veri_rank', 'vehicleid_rank', 'veriwild_rank', 'sop_rank']
test_df = get_df(test_data)

In [21]:
len(base_cols)

37

In [22]:
# NOTE(review): the indicator columns are derived from the values present in
# test_df itself, so they line up with the training `use_cols` only if both
# frames contain the same set of values in every base column. The column
# count in the next cell is a partial check of that assumption.
train_ohe(test_df, base_cols)

In [23]:
len([x for x in test_df if x[:3]=='ohe'])

93

In [24]:
def pred_cuda(test_df, model):
    """Run ``model`` over every row of ``test_df`` and stack the outputs.

    Features are taken from the notebook-level ``use_cols``.  The dataset is
    built in inference mode so that the frame needs no label columns and no
    label tensor is created per row.

    Args:
        test_df: DataFrame containing the ``use_cols`` feature columns.
        model: network to evaluate; it is switched to eval mode.

    Returns:
        numpy.ndarray with one row of model outputs per row of ``test_df``.
    """
    # Get predictions. is_val=1 makes MyDataset yield only the 'input' tensor.
    test_dataset = MyDataset(test_df, use_cols, target_cols, show=0, is_val=1)
    test_loader = DataLoader(
        test_dataset,
        batch_size=1024*128,
        shuffle=False,
        drop_last=False
    )

    preds = []
    model.eval()
    with paddle.no_grad():
        for data in tqdm(test_loader):
            pred = model(data['input'])
            preds.append(pred.numpy())
    preds = np.concatenate(preds, 0)
    print(preds.shape)
    
    # Drop the local references before asking the allocator to release memory.
    del test_dataset, test_loader, model
    gc.collect()
    paddle.device.cuda.empty_cache()
    return preds

def predict_all(test_df, k=5, modelname = 'paddle_7937_LSTM_2layer_hardtanh'):
    """Ensemble the ``k`` fold checkpoints into one rank column per target.

    For every fold the checkpoint ``modelpath + f'{modelname}_{fold}.pt'`` is
    loaded and its predictions are stored in ``'{target}_{fold}'`` columns.
    Those columns are converted to ranks, averaged across folds per target and
    ranked again; the result overwrites the ``target_cols`` columns.

    Reads the notebook-level ``use_cols``, ``target_cols`` and ``modelpath``.

    Args:
        test_df: DataFrame holding the ``use_cols`` features; modified in place.
        k: number of fold checkpoints to load.
        modelname: checkpoint name prefix.

    Returns:
        The same ``test_df`` object, for convenience (it used to return None).
    """
    print(f'Model {modelname}')
    #pred
    cols = []
    for fold_ in range(k):
        # Size the layer from the feature/target lists that pred_cuda and the
        # column assignment below actually use, instead of a hard-coded 93.
        model = CVPRModel(input_dim=len(use_cols),
                    num_classes=len(target_cols),
                   )
        state_dict = paddle.load(modelpath+f'{modelname}_{fold_}.pt')
        model.set_state_dict(state_dict)
        
        pred_ = pred_cuda(test_df, model)
        tmp_c = [f'{target}_{fold_}' for target in target_cols]
        test_df[tmp_c] = pred_
        cols += tmp_c
    #get rank
    print(cols)
    # Rank within each fold first so that folds are averaged on a common scale.
    test_df[cols] = test_df[cols].rank()
    for c in target_cols:
        test_df[c] = test_df[[f'{c}_{fold_}' for fold_ in range(k)]].mean(axis=1)
        test_df[c] = test_df[c].rank()
    return test_df

In [25]:
k, modelname

(5, 'paddle_superlinear_ranker')

In [26]:
model_dt = predict_all(test_df, k=k, modelname='paddle_superlinear_ranker')

Model paddle_superlinear_ranker


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.26s/it]


(99500, 8)


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.71s/it]


(99500, 8)


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.68s/it]


(99500, 8)


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.72s/it]


(99500, 8)


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.82s/it]


(99500, 8)
['cplfw_rank_0', 'market1501_rank_0', 'dukemtmc_rank_0', 'msmt17_rank_0', 'veri_rank_0', 'vehicleid_rank_0', 'veriwild_rank_0', 'sop_rank_0', 'cplfw_rank_1', 'market1501_rank_1', 'dukemtmc_rank_1', 'msmt17_rank_1', 'veri_rank_1', 'vehicleid_rank_1', 'veriwild_rank_1', 'sop_rank_1', 'cplfw_rank_2', 'market1501_rank_2', 'dukemtmc_rank_2', 'msmt17_rank_2', 'veri_rank_2', 'vehicleid_rank_2', 'veriwild_rank_2', 'sop_rank_2', 'cplfw_rank_3', 'market1501_rank_3', 'dukemtmc_rank_3', 'msmt17_rank_3', 'veri_rank_3', 'vehicleid_rank_3', 'veriwild_rank_3', 'sop_rank_3', 'cplfw_rank_4', 'market1501_rank_4', 'dukemtmc_rank_4', 'msmt17_rank_4', 'veri_rank_4', 'vehicleid_rank_4', 'veriwild_rank_4', 'sop_rank_4']


In [27]:
# Convert the 1-based ranks written by predict_all into 0-based integers.
# NOTE(review): astype(int) truncates, and DataFrame.rank() averages ties by
# default, so tied rows would receive fractional ranks that collapse onto the
# same integer — confirm ties cannot occur. The cell is not idempotent:
# running it again subtracts 1 a second time.
test_df[target_cols] = test_df[target_cols].astype(int)-1

In [28]:
# to_sub
def to_sub(test_df, test_data, name='CVPR_2022_lgb_score'):
    """Copy the predicted ranks into ``test_data`` and dump it as JSON.

    For every row of ``test_df`` the values of the notebook-level
    ``target_cols`` are written into ``test_data[row id]`` under the same
    names; ``test_data`` is modified in place and then saved to
    ``./sub/{name}.json``.
    """
    rows = test_df[['id']+target_cols].values
    for row in tqdm(rows):
        arch_id, ranks = row[0], row[1:]
        for target, rank in zip(target_cols, ranks):
            test_data[arch_id][target] = rank

    with open(f'./sub/{name}.json', 'w') as f:
        json.dump(test_data, f)

In [29]:
to_sub(test_df, test_data, name='CVPR_2022_paddle_superliner_score')

100%|████████████████████████████████████████████████████████████████████████| 99500/99500 [00:00<00:00, 705032.22it/s]
