In [1]:
import os
import json
import torch
import scipy
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from scipy.special import erfinv

In [2]:
# Directory (relative to the working directory) that holds the input JSON files.
path="./data/"

In [3]:
# Read the train and test dictionaries from the data directory configured in
# `path` instead of repeating the directory literal. JSON text is read as
# UTF-8 explicitly so the result does not depend on the platform's default
# encoding.
train_json_path = os.path.join(path, 'CVPR_2022_NAS_Track2_train.json')
test_json_path = os.path.join(path, 'CVPR_2022_NAS_Track2_test.json')

with open(train_json_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(test_json_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

In [4]:
def get_df(train_data, label_cols=None):
    """Flatten the ``{id: entry}`` dictionary into a DataFrame.

    Each entry must provide an ``'arch'`` sequence and one value per label
    column. The architecture is split element by element into ``col0``,
    ``col1``, ... columns, followed by the label columns, the entry key
    (``'id'``) and the untouched ``'arch'`` value.

    Parameters
    ----------
    train_data : dict
        Mapping from an identifier to a dict holding ``'arch'`` and the labels.
    label_cols : list of str, optional
        Label keys to copy from every entry. Defaults to the module-level
        ``target_cols`` so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per entry. Every column except ``'id'`` and ``'arch'`` is cast
        to float; ``col0`` is first mapped ``'j' -> 0, 'k' -> 1, 'l' -> 2``
        (any other value becomes NaN).

    Raises
    ------
    ValueError
        If ``train_data`` is empty or the ``'arch'`` values differ in length
        (which would otherwise shift label values into feature columns).
    """
    if label_cols is None:
        label_cols = target_cols
    label_cols = list(label_cols)

    if not train_data:
        raise ValueError("get_df() received no entries")

    arch_len = None
    records = []
    for key, entry in train_data.items():
        arch = entry['arch']
        if arch_len is None:
            arch_len = len(arch)
        elif len(arch) != arch_len:
            raise ValueError(
                f"entry {key!r} has an arch of length {len(arch)}, expected {arch_len}"
            )
        labels = [entry[c] for c in label_cols]
        records.append(list(arch) + labels + [key, arch])

    columns = [f'col{i}' for i in range(arch_len)] + label_cols + ['id', 'arch']
    retf = pd.DataFrame(records, columns=columns)

    # The first architecture element is a letter code; encode it numerically.
    retf['col0'] = retf['col0'].map({'l': 2, 'j': 0, 'k': 1})

    numeric_cols = [c for c in retf.columns if c not in ('id', 'arch')]
    retf[numeric_cols] = retf[numeric_cols].astype(float)
    return retf

In [5]:
# Names of the rank columns that are read from every JSON entry and later
# predicted, one model ensemble per column.
target_cols = [
    'cplfw_rank',
    'market1501_rank',
    'dukemtmc_rank',
    'msmt17_rank',
    'veri_rank',
    'vehicleid_rank',
    'veriwild_rank',
    'sop_rank',
]

# Flatten both JSON dictionaries into DataFrames (train first, then test).
train, test = get_df(train_data), get_df(test_data)

In [6]:
# Feature columns: 'col0', then columns 1, 4, ..., 34, then 2, 5, ..., 35,
# then 3, 6, ..., 36 (three groups of twelve, grouped by offset).
fe_list = ['col0'] + [
    f'col{offset + 3 * block}'
    for offset in (1, 2, 3)
    for block in range(12)
]

In [7]:
# Sanity check on the feature count: 'col0' plus three groups of twelve columns.
len(fe_list)

37

In [8]:
# Train the LightGBM ensemble: for every target column, fit `n_bags` models,
# each on a different 75% sample of `train`, and keep them in
# `model_dic[target][bag_index]`.
params = {
        'num_leaves': 2,
        'objective': 'regression_l2',
        'max_depth': 10,
        'min_data_in_leaf': 1,
        'learning_rate': 0.8,
        'feature_fraction': 0.99,
        'bagging_fraction': 0.99,
        'bagging_freq': 1,
        'metric': 'mse',
        'num_threads': 32,
        'seed':2018
          }

# Boosting rounds per target, defined once up front instead of being
# reassigned on every loop iteration.
num_round = {
    "cplfw_rank": 50,
    "market1501_rank": 200,
    "dukemtmc_rank": 300,
    "msmt17_rank": 500,
    "veri_rank": 100,
    "vehicleid_rank": 100,
    "veriwild_rank": 250,
    "sop_rank": 100,
}
default_rounds = 300  # used for any target that is missing from the table above
n_bags = 20           # number of resampled models trained per target

model_dic = {}
for target in tqdm(target_cols):
    model_dic[target] = {}
    rounds = num_round.setdefault(target, default_rounds)

    for i in range(n_bags):
        # Bag index doubles as the sampling seed, so every bag is reproducible.
        bag = train.sample(frac=0.75, random_state=i).reset_index(drop=True)

        # Re-rank the label inside the sample so it starts at 0.
        bag[target] = (bag[target].rank() - 1).astype(int)
        train_y = bag[target]

        # Map the ranks to quantiles strictly inside (0, 1) and push them
        # through the standard normal quantile function, sqrt(2) * erfinv(2q - 1).
        # The +1 offsets keep the argument of erfinv away from -1 and 1.
        mmin = np.min(train_y) + 1
        mmax = np.max(train_y) + 1
        train_y = np.sqrt(2) * erfinv(2 * (train_y + mmin) / (mmin + mmax) - 1)

        # The 'seed' entry of `params` is overwritten for every bag.
        params["seed"] = i

        train_matrix = lgb.Dataset(bag[fe_list], label=train_y)
        model_dic[target][i] = lgb.train(params, train_matrix, rounds)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.29it/s]


In [9]:
def get_pred(model_dic, test, fe_list, target_cols):
    """Predict a rank column for every target and store it in ``test``.

    For each target the predictions of all models stored under
    ``model_dic[target]`` are averaged, written to ``test[target]`` and then
    replaced by their integer rank.

    Parameters
    ----------
    model_dic : dict
        ``{target: models}`` where ``models`` is a dict (or other iterable) of
        objects exposing ``predict(features)``. The ensemble may have any
        non-zero size.
    test : pandas.DataFrame
        Frame holding the ``fe_list`` columns; modified in place.
    fe_list : list of str
        Feature columns passed to every model.
    target_cols : list of str
        Targets to predict; each becomes a column of ``test``.

    Returns
    -------
    None
        The result is written into ``test``.

    Raises
    ------
    ValueError
        If no model is stored for one of the targets.
    """
    for l in tqdm(target_cols):
        stored = model_dic[l]
        members = list(stored.values()) if isinstance(stored, dict) else list(stored)
        if not members:
            raise ValueError(f"no models stored for target {l!r}")

        # Slice the feature block once per target rather than once per model.
        features = test[fe_list]
        sub_list = [model.predict(features) for model in members]

        test[l] = np.mean(np.array(sub_list), axis=0)
        # rank() averages tied positions; astype(int) then truncates them.
        test[l] = test[l].rank().astype(int)

In [10]:
# Adds one integer rank column per target to `test` (in place).
get_pred(
    model_dic=model_dic,
    test=test,
    fe_list=fe_list,
    target_cols=target_cols,
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.41s/it]


In [11]:
# to_sub
def to_sub(test_df, test_data, name='CVPR_2022_lgb_score'):
    for i in tqdm(test_df[['id']+target_cols].values):
        id_ = i[0]
        for k,v in enumerate(target_cols):
            k += 1
            test_data[id_][v] = i[k]
            
    with open(f'./sub/{name}.json', 'w') as f:
        json.dump(test_data, f)

In [12]:
# Merge the predicted ranks into `test_data` and write the submission JSON.
to_sub(
    test_df=test,
    test_data=test_data,
    name='CVPR_2022_lgb_score',
)

100%|████████████████████████████████████████████████████████████████████████| 99500/99500 [00:00<00:00, 715177.50it/s]


In [13]:
# Save the trained models.
import joblib

# Create the target directory first so the dump does not fail with
# FileNotFoundError when ./model/ is missing.
os.makedirs('./model', exist_ok=True)
joblib.dump(model_dic, './model/lgb_score_ranker.pkl')

['./model/lgb_score_ranker.pkl']