In [None]:
import sys
sys.path.append("../src")
import argparse
from torch.nn import DataParallel
from importlib import import_module
from argparse import Namespace
import lightgbm as lgb
from collections import Counter
from sklearn.model_selection import KFold
from timeit import default_timer as timer
try:
  import silence_tensorflow.auto
except ImportError:
  pass
from albumentations import Normalize
import copy
import shutil
import csv
import pandas as pd
from scipy import spatial
import gc
import operator
import torch
import torch.nn.functional as F
from tqdm import tqdm
from layers.normalization import L2N
from torch.autograd import Variable
from config.config import *
from config.en_config import *
from dataset.landmark_dataset import *
from utilities.vectors_utils import *
import struct
import faiss
import pydegensac
import tensorflow as tf
import PIL
from sklearn.cluster import DBSCAN as dbscan
from scipy.spatial import cKDTree
from skimage.transform import AffineTransform
from skimage.measure import ransac as _ransac
from utilities.superpointglue_util import read_image as spg_read_image
import pickle

In [None]:

parser = argparse.ArgumentParser(description='PyTorch Classification')
parser.add_argument('-f', default='', type=str)
parser.add_argument('--en_cfgs', type=str, default='en_m4_b7_b6_b5_r152_i800', help='')
parser.add_argument('--module', '-m', type=str, default='efficientnet_gem_fc_face', help='model ')
parser.add_argument('--model_name', type=str, default='class_efficientnet_b7_gem_fc_arcface2_1head', help='model name')
parser.add_argument('--gpus', default='0', type=str, help='use gpu (default: None (use cpu))')
parser.add_argument('--num_classes', default=81313, type=int, help='number of classes (default: 203094)')
parser.add_argument('--in_channels', default=3, type=int, help='in channels (default: 3)')
parser.add_argument('--img_size', default=800, type=int, help='image size (default: None)')
parser.add_argument('--scale', default=None, type=str, help='scale (default: None)')
parser.add_argument('--loss', default='SoftmaxLoss', type=str, help='loss function SoftmaxLoss')
parser.add_argument('--scheduler', default='Adam', type=str, help='scheduler name')
parser.add_argument('--out_dir', default='r101', type=str, help='output dir (default: None)')
parser.add_argument('--kaggle', default=0, type=int, help='0:local 1:kaggle')
parser.add_argument('--debug', default=0, type=int, help='is debug')
parser.add_argument('--overwrite', default=0, type=int, help='is overwrite feature cache')
parser.add_argument('--predict_epoch', default=None, type=str, help='number epoch to predict')
parser.add_argument('--batch_size', default=4, type=int)
parser.add_argument('--preprocessing', type=int, default=1)
parser.add_argument('--num_to_rerank', type=int, default=10)
parser.add_argument('--top_k', type=int, default=3)
parser.add_argument('--ransac', default=1, type=int)
parser.add_argument('--nolandmark_num', type=int, default=5000)
parser.add_argument('--valid_num', type=int, default=20000)
parser.add_argument('--do_train', type=int, default=1)
parser.add_argument('--do_valid', type=int, default=1)
parser.add_argument('--do_test', type=int, default=0)
parser.add_argument('--ransac_type', type=str, default='ssp')
parser.add_argument('--ransac_weight', type=float, default=1)
parser.add_argument('--store_keypoint', type=int, default=1)
parser.add_argument('--ransac_parts', default=1, type=int)
parser.add_argument('--ransac_part', default=0, type=int)
parser.add_argument('--lgb_model_dir', type=str, default='/kaggle/input/models')
args = parser.parse_args()


In [None]:
# Image count of the public training set.
NUM_PUBLIC_TRAIN_IMAGES = 1_580_470

# RANSAC / geometric-verification settings.
MAX_INLIER_SCORE = 70            # default cap on the inlier count used by get_total_score
MAX_REPROJECTION_ERROR = 4.0     # passed to pydegensac.findHomography
MAX_RANSAC_ITERATIONS = 1000     # passed to pydegensac.findHomography
HOMOGRAPHY_CONFIDENCE = 0.99     # passed to pydegensac.findHomography

def load_labelmap(TRAIN_LABELMAP_PATH):
  """Reads a label CSV and returns a dict mapping `id` -> `landmark_id`.

  The file must have a header row containing the columns `id` and
  `landmark_id`. Values are kept as the strings read from the file. When an id
  occurs more than once, the last row wins.
  """
  id_to_landmark = {}
  with open(TRAIN_LABELMAP_PATH, mode='r') as label_file:
    for record in csv.DictReader(label_file):
      id_to_landmark[record['id']] = record['landmark_id']
  return id_to_landmark

def save_submission_csv(args, DATASET_DIR, predictions=None):
  """Saves optional `predictions` as submission.csv.

  The csv has columns {id, landmarks}. The landmarks column is a string
  containing the label and score for the id, separated by a single space.

  If `predictions` is `None` (default), submission.csv is copied from
  sample_submission.csv in `DATASET_DIR` into the current directory.

  Args:
    args: Namespace; reads `args.kaggle` and, when not on Kaggle,
      `args.out_dir` (the file then goes to
      `{RESULT_DIR}/submissions/{args.out_dir}/submission.csv`).
    DATASET_DIR: Directory holding sample_submission.csv (used only for the
      dummy submission).
    predictions: Optional dict of image ids to dicts with keys {class, score}.

  Returns:
    The path of the written file, or `None` for the dummy submission.
  """

  if predictions is None:
    # Dummy submission!
    shutil.copyfile(
        os.path.join(DATASET_DIR, 'sample_submission.csv'), 'submission.csv')
    return

  if args.kaggle:
    submit_fname = 'submission.csv'
  else:
    submit_dir = f'{RESULT_DIR}/submissions/{args.out_dir}'
    os.makedirs(submit_dir, exist_ok=True)
    submit_fname = f'{submit_dir}/submission.csv'

  # newline='' lets the csv module control line endings itself; without it,
  # platforms that translate '\n' produce '\r\r\n' and blank rows.
  with open(submit_fname, 'w', newline='') as submission_csv:
    csv_writer = csv.DictWriter(submission_csv, fieldnames=['id', 'landmarks'])
    csv_writer.writeheader()
    for image_id, prediction in predictions.items():
      label = prediction['class']
      score = prediction['score']
      csv_writer.writerow({'id': image_id, 'landmarks': f'{label} {score}'})
  return submit_fname

In [None]:

class TestDataset(Dataset):
  """Dataset yielding preprocessed image tensors for the ids in `df[ID]`.

  Each item is a float tensor in channel-first layout (the image is transposed
  with `(2, 0, 1)`), resized to `args.img_size` x `args.img_size`, and either
  normalised with the fixed mean/std below (`args.preprocessing == 1`) or
  divided by 255.
  """

  def __init__(self, args, df, img_dir):
    self.args = args
    self.img_size = (args.img_size, args.img_size)
    self.img_dir = img_dir
    self.img_ids = df[ID].values
    print(f'img_size: {self.img_size}')
    self.norm = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0)

  def __len__(self):
    return len(self.img_ids)

  def _resolve_path(self, img_id):
    """Returns the path to try for `img_id`, applying the per-environment fallback."""
    img_dir = self.img_dir
    if self.args.kaggle:
      # Kaggle layout nests images under the first three characters of the id.
      fname = f'{img_dir}/{img_id[0]}/{img_id[1]}/{img_id[2]}/{img_id}.jpg'
      if not os.path.exists(fname):
        fname = f'{img_dir}/{img_id}.jpg'
    else:
      fname = f'{img_dir}/{img_id}.jpg'
      if not os.path.exists(fname):
        fname = f'{DATA_DIR}/images/test/{img_id}.jpg'
    return fname

  def __getitem__(self, idx):
    img_id = self.img_ids[idx]
    fname = self._resolve_path(img_id)
    image = cv2.imread(fname)
    if image is None:
      # cv2.imread signals a missing/unreadable file by returning None; fail
      # with the offending path instead of an opaque TypeError further down.
      raise FileNotFoundError(f'cannot read image: {fname}')
    # Reverse the channel axis (cv2.imread loads BGR, so this yields RGB).
    image = image[..., ::-1]

    if self.args.img_size is not None:
      if image.shape[:2] != self.img_size:
        image = cv2.resize(image, self.img_size)
    else:
      raise Exception()

    if self.args.preprocessing==1:
      image = self.norm(image=image)['image']
    else:
      image = image / 255.0
    image = np.transpose(image, (2, 0, 1))
    image = torch.from_numpy(image).float()
    return image

def create_dataset(args, df, img_dir):
  """Builds a sequential, non-shuffling DataLoader over `TestDataset(args, df, img_dir)`.

  Batches hold `args.batch_size` images; the last partial batch is kept.
  """
  test_dataset = TestDataset(args, df, img_dir)
  loader_options = dict(
    sampler=SequentialSampler(test_dataset),
    batch_size=args.batch_size,
    drop_last=False,
    num_workers=8,
    pin_memory=True,
    collate_fn=default_collate,
  )
  return DataLoader(test_dataset, **loader_options)

In [None]:
def get_prediction_map(test_ids, train_ids_labels_and_scores, top_k=3):
  """Picks one label per test id by summing neighbour scores per label.

  Args:
    test_ids: Sequence of test image ids.
    train_ids_labels_and_scores: For each test id (same order), a ranked list
      of `(train_id, label, score)` tuples.
    top_k: Number of leading neighbours to aggregate; values <= 0 use all.

  Returns:
    Dict `test_id -> {'score': summed score, 'class': label}` for the label
    with the highest summed score (first seen wins ties).
  """
  result = {}
  for position, query_id in enumerate(test_ids):
    ranked = train_ids_labels_and_scores[position]
    candidates = ranked[:top_k] if top_k > 0 else ranked
    totals = {}
    for _train_id, landmark, similarity in candidates:
      totals[landmark] = totals.get(landmark, 0) + float(similarity)
    best_label, best_score = max(totals.items(), key=operator.itemgetter(1))
    result[query_id] = {'score': best_score, 'class': best_label}
  return result

def extract_global_features(args, model, df, image_dir, dataset, return_prob=False):
  """Returns L2-normalised global embeddings for the images in `df`, using a disk cache.

  If the cached image-id file exists and `args.overwrite` is falsy, results are
  loaded from disk. Otherwise the model is run batch by batch (without autograd)
  and, when not on Kaggle, the results are written to the cache: embeddings as
  an .fvecs file (per vector: little-endian uint32 dimension followed by that
  many float32 values), ids / class indices / class probabilities as .npy.

  Args:
    args: Namespace; reads `kaggle`, `out_dir`, `scale`, `img_size`,
      `predict_epoch`, `overwrite` and, if present, `parts` / `part`.
    model: Wrapped model; `model.module` must expose `extract_feature` or
      `extract_feat` unless `return_prob` is set, in which case
      `model(images, None)` must return `(logit, embedding)`.
    df: DataFrame of images to process.
    image_dir: Directory the images are read from.
    dataset: Name used in cache file names and the progress bar.
    return_prob: Also return the top-50 class indices and probabilities.

  Returns:
    `(img_ids, embeddings)` or, with `return_prob`,
    `(img_ids, embeddings, cls_idxes, cls_probs)`.

  Raises:
    ValueError: if both `args.scale` and `args.img_size` are None.
    Exception: if the model exposes no feature-extraction method.
  """
  N = len(df)
  if args.kaggle:
    features_dir = f'/kaggle/input/features/{args.out_dir}'
  else:
    features_dir = f'{RESULT_DIR}/features/{args.out_dir}'
    os.makedirs(features_dir, exist_ok=True)

  if args.scale is not None:
    size_tag = args.scale
  elif args.img_size is not None:
    size_tag = args.img_size
  else:
    raise ValueError('either args.scale or args.img_size must be set')
  features_fname = f'{features_dir}/epoch{args.predict_epoch}_i{size_tag}_{dataset}_features_{N}.fvecs'
  img_ids_fname = f'{features_dir}/epoch{args.predict_epoch}_i{size_tag}_{dataset}_img_ids_{N}.npy'

  # Optional sharding: `parts` / `part` may be absent from `args`, in which
  # case the whole frame is processed.
  parts = getattr(args, 'parts', None)
  part = getattr(args, 'part', None)
  if parts is not None and part is not None and parts > 1:
    block = len(df) // parts + 1
    df = df.iloc[part * block:(part + 1) * block].reset_index(drop=True)
    features_fname = features_fname.replace(f'_{N}.fvecs', f'_{N}_{parts}_{part}.fvecs')
    img_ids_fname = img_ids_fname.replace(f'_{N}.npy', f'_{N}_{parts}_{part}.npy')

  if return_prob:
    cls_idxes_fname = features_fname.replace('_features_', '_cls_idxes_top50_').\
      replace('.fvecs', '.npy')
    cls_probs_fname = features_fname.replace('_features_', '_cls_probs_top50_').\
      replace('.fvecs', '.npy')
    print(cls_probs_fname)

  print(features_fname)
  do_cache = not args.kaggle
  if ope(img_ids_fname) and not args.overwrite:
    img_ids = np.load(img_ids_fname, allow_pickle=True)
    embeddings = fvecs_read(features_fname)
    if return_prob:
      cls_idxes = np.load(cls_idxes_fname, allow_pickle=True)
      cls_probs = np.load(cls_probs_fname, allow_pickle=True)
  else:
    dataloader = create_dataset(args, df, image_dir)
    embeddings = []
    cls_idxes = []
    cls_probs = []
    feature_file = open(features_fname, 'wb') if do_cache else None
    try:
      # Inference only: no_grad() replaces the obsolete
      # `Variable(..., volatile=True)`, which no longer disables autograd.
      with torch.no_grad():
        for images in tqdm(dataloader, total=len(dataloader), desc=f'extract {dataset}'):
          image_tensor = images.cuda()

          if return_prob:
            logit, embedding = model(image_tensor, None)
          else:
            if hasattr(model.module, 'extract_feature'):
              embedding = model.module.extract_feature(image_tensor)
            elif hasattr(model.module, 'extract_feat'):
              embedding = model.module.extract_feat(image_tensor)
            else:
              raise Exception('extract_feature')
          embedding = L2N()(embedding)
          embedding = embedding.cpu().detach().numpy()
          embeddings.append(embedding)

          if return_prob:
            prob = F.softmax(logit, dim=1).cpu().numpy()
            # Last 50 columns of the ascending argsort, reversed: best class first.
            top50_idxes = np.argsort(prob, axis=1)[:, :-51:-1].astype('int32')
            row_idxes = np.arange(len(prob)).reshape(-1, 1)
            top50_probs = prob[row_idxes, top50_idxes].astype('float32')
            cls_idxes.append(top50_idxes)
            cls_probs.append(top50_probs)

          if feature_file is not None:
            for ebd in embedding:
              D = len(ebd)
              feature_file.write(struct.pack('<I%df' % D, D, *list(ebd)))
    finally:
      # Close (and thereby flush) the cache file even if extraction fails.
      if feature_file is not None:
        feature_file.close()

    img_ids = dataloader.dataset.img_ids
    if return_prob:
      cls_idxes = np.concatenate(cls_idxes, axis=0)
      cls_probs = np.concatenate(cls_probs, axis=0)
    if do_cache:
      # The id file is written last; its presence is what marks the cache as valid.
      np.save(img_ids_fname, img_ids)
      if return_prob:
        np.save(cls_idxes_fname, cls_idxes)
        np.save(cls_probs_fname, cls_probs)
    embeddings = np.concatenate(embeddings, axis=0)

  if return_prob:
    return img_ids, np.array(embeddings), np.array(cls_idxes), np.array(cls_probs)
  else:
    return img_ids, np.array(embeddings)


In [None]:
def load_image_tensor(image_path):
  """Opens `image_path`, converts it to RGB and returns it as a TensorFlow tensor."""
  rgb_image = PIL.Image.open(image_path).convert('RGB')
  pixels = np.array(rgb_image)
  return tf.convert_to_tensor(pixels)

def extract_local_features(local_model_tf, local_model_tf_constant, image_path):
  """Runs the local-feature model on one image.

  Returns:
    `(keypoints, descriptors)` as numpy arrays. Keypoints are the midpoints of
    columns [0, 1] and [2, 3] of the model's first output; descriptors are the
    model's second output, L2-normalised per row.
  """
  constants = local_model_tf_constant
  features = local_model_tf(
    load_image_tensor(image_path),
    constants['DELG_IMAGE_SCALES_TENSOR'],
    constants['DELG_SCORE_THRESHOLD_TENSOR'],
    constants['LOCAL_FEATURE_NUM_TENSOR'],
  )
  raw_boxes, raw_descriptors = features[0], features[1]

  # Shape: (N, 2) -- midpoint of the two coordinate pairs in each row.
  first_corner = tf.gather(raw_boxes, [0, 1], axis=1)
  second_corner = tf.gather(raw_boxes, [2, 3], axis=1)
  keypoints = tf.divide(tf.add(first_corner, second_corner), 2.0).numpy()

  # Shape: (N, 128)
  normalized = tf.nn.l2_normalize(raw_descriptors, axis=1, name='l2_normalization')
  descriptors = normalized.numpy()

  return keypoints, descriptors

def compute_putative_matching_keypoints(test_keypoints,
                                        test_descriptors,
                                        train_keypoints,
                                        train_descriptors,
                                        max_distance=0.9):
  """Finds matches from `test_descriptors` to KD-tree of `train_descriptors`.

  Each test descriptor is matched to its nearest train descriptor, provided
  the distance is below `max_distance`.

  Args:
    test_keypoints: Array with one row per test descriptor.
    test_descriptors: Array of test descriptors.
    train_keypoints: Array with one row per train descriptor.
    train_descriptors: Array of train descriptors.
    max_distance: Upper bound on the descriptor distance for a match.

  Returns:
    `(test_matching_keypoints, train_matching_keypoints)`: row-aligned arrays
    holding the keypoints of every matched pair. With no matches both arrays
    have zero rows but keep the keypoint column dimension.
  """

  train_descriptor_tree = spatial.cKDTree(train_descriptors)
  _, matches = train_descriptor_tree.query(
      test_descriptors, distance_upper_bound=max_distance)

  # cKDTree.query marks "no neighbour within the bound" with an index equal to
  # the number of points in the tree.
  has_match = matches != train_descriptor_tree.n

  test_matching_keypoints = test_keypoints[has_match]
  train_matching_keypoints = train_keypoints[matches[has_match]]

  return test_matching_keypoints, train_matching_keypoints

def compute_num_inliers(test_keypoints, test_descriptors, train_keypoints,
                        train_descriptors, do_kdtree=True):
  """Returns the number of RANSAC inliers.

  With `do_kdtree` the keypoints are first paired by descriptor matching;
  otherwise the two keypoint arrays are taken as already row-aligned pairs and
  the descriptors are ignored. Returns 0 when there are at most 4 pairs or the
  homography estimation raises `np.linalg.LinAlgError`.
  """
  if not do_kdtree:
    query_pts, gallery_pts = test_keypoints, train_keypoints
  else:
    query_pts, gallery_pts = compute_putative_matching_keypoints(
        test_keypoints, test_descriptors, train_keypoints, train_descriptors)

  # Min keypoints supported by `pydegensac.findHomography()`
  if query_pts.shape[0] <= 4:
    return 0

  try:
    _, inlier_mask = pydegensac.findHomography(
        query_pts, gallery_pts,
        MAX_REPROJECTION_ERROR, HOMOGRAPHY_CONFIDENCE, MAX_RANSAC_ITERATIONS)
  except np.linalg.LinAlgError:  # When det(H)=0, can't invert matrix.
    return 0

  inlier_count = copy.deepcopy(inlier_mask).astype(np.float32).sum()
  return int(inlier_count)

def get_inliers(loc1, desc1, loc2, desc2):
  """Counts affine-RANSAC inliers between two sets of local features.

  Descriptors in `desc2` are matched to their nearest neighbour in `desc1`
  (distance bound 0.8); the matched locations are then fitted with an affine
  model by skimage's RANSAC.

  Note: reseeds NumPy's global random generator with a fixed seed on every
  call.

  Returns:
    The number of inliers, or 0 when RANSAC fails or finds no model.
  """
  n_feat1, n_feat2 = loc1.shape[0], loc2.shape[0]

  KD_THRESH = 0.8
  d1_tree = cKDTree(desc1)
  _, indices = d1_tree.query(desc2, distance_upper_bound=KD_THRESH)

  # An index equal to n_feat1 marks "no neighbour within KD_THRESH".
  loc2_to_use = np.array([loc2[i, ] for i in range(n_feat2) if indices[i] != n_feat1])
  loc1_to_use = np.array([loc1[indices[i], ] for i in range(n_feat2) if indices[i] != n_feat1])

  np.random.seed(114514)

  try:
    model_robust, inliers = _ransac(
      (loc1_to_use, loc2_to_use),
      AffineTransform,
      min_samples=3,
      residual_threshold=20,
      max_trials=1000)
  except Exception:
    # Best effort: any failure inside RANSAC (e.g. too few points) counts as
    # zero inliers, but KeyboardInterrupt / SystemExit still propagate.
    return 0
  if inliers is None:
    return 0
  return sum(inliers)

In [None]:
def get_total_score(num_inliers, global_score, weight=1.0, max_inlier_score=None):
  """Combines a RANSAC inlier count with a global similarity score.

  The inlier count is clipped at `max_inlier_score` (module default
  `MAX_INLIER_SCORE` when None), scaled to [0, 1], multiplied by `weight` and
  added to `global_score`.
  """
  cap = MAX_INLIER_SCORE if max_inlier_score is None else max_inlier_score
  local_score = min(num_inliers, cap) / cap
  return local_score*weight + global_score


def get_cached_num_inliers(ransac_cache_dir, test_image_id, train_image_id):
  """Looks up a cached inlier count for an image pair, in either id order.

  Returns:
    `(ransac_fname, num_inliers)`. On a hit, the file that was found and its
    loaded content; on a miss, the `{test}_{train}` file name (where a new
    result should be stored) and `None`.
  """
  forward_fname = f'{ransac_cache_dir}/{test_image_id}_{train_image_id}.npy'
  reverse_fname = f'{ransac_cache_dir}/{train_image_id}_{test_image_id}.npy'
  for candidate in (forward_fname, reverse_fname):
    if ope(candidate):
      return candidate, np.load(candidate, allow_pickle=True)
  return forward_fname, None

def get_whole_cached_num_inliers(args):
  """Loads the pickled whole-run inlier cache, or an empty dict if it does not exist.

  The file is `whole_ransac_inliers.pkl` in the RANSAC cache directory selected
  by `args`.
  """
  ransac_cache_dir, _ = get_ransac_cache_dir(args)
  whole_ransac_fname = f'{ransac_cache_dir}/whole_ransac_inliers.pkl'
  if not ope(whole_ransac_fname):
    return dict()
  # NOTE: pickle.load executes arbitrary code; only use with trusted cache files.
  with open(whole_ransac_fname, 'rb') as dbfile:
    return pickle.load(dbfile)

def save_whole_cached_num_inliers(args, data):
  """Pickles `data` to `whole_ransac_inliers.pkl` in the RANSAC cache directory for `args`."""
  cache_dirs = get_ransac_cache_dir(args)
  target_fname = f'{cache_dirs[0]}/whole_ransac_inliers.pkl'
  with open(target_fname, 'wb') as out_file:
    pickle.dump(data, out_file)

def load_cached_keypoints(keypoint_cache_dir, img_id):
  """Returns the pickled keypoint record for `img_id`, or None if none is cached."""
  keypoint_fname = f'{keypoint_cache_dir}/keypoint_{img_id}.pkl'
  if not ope(keypoint_fname):
    return None
  # NOTE: pickle.load executes arbitrary code; only use with trusted cache files.
  with open(keypoint_fname, 'rb') as dbfile:
    return pickle.load(dbfile)

def save_cached_keypoints(keypoint_cache_dir, img_id, keypoints, scores, descriptors, scales):
  """Pickles the first element of each keypoint tensor for `img_id`.

  Does nothing if a cache file for `img_id` already exists. Tensors are moved
  to the CPU and stored as numpy arrays; `scales` is stored unchanged.
  """
  keypoint_fname = f'{keypoint_cache_dir}/keypoint_{img_id}.pkl'
  if ope(keypoint_fname):
    return
  record = {}
  record['keypoints'] = keypoints[0].cpu().numpy()
  record['scores'] = scores[0].data.cpu().numpy()
  record['descriptors'] = descriptors[0].data.cpu().numpy()
  record['scales'] = scales
  with open(keypoint_fname, 'wb') as out_file:
    pickle.dump(record, out_file)

def load_cached_matches(keypoint_cache_dir, query_image_id, index_image_id):
  """Returns the cached match record for a (query, index) pair, or None.

  None is returned both when no cache file exists and when the file cannot be
  read or unpickled (e.g. a truncated write), so the caller recomputes.
  """
  match_fname = f'{keypoint_cache_dir}/match_query_{query_image_id}_index_{index_image_id}.pkl'
  if not ope(match_fname):
    return None
  try:
    # NOTE: pickle.load executes arbitrary code; only use with trusted cache files.
    with open(match_fname, 'rb') as dbfile:
      return pickle.load(dbfile)
  except Exception:
    # Unreadable cache entry: treat as a miss. KeyboardInterrupt / SystemExit
    # are deliberately not swallowed.
    return None

def save_cached_matches(keypoint_cache_dir, query_image_id, index_image_id,
                        matches0, matches1, matching_scores0, matching_scores1):
  """Pickles the match tensors for a (query, index) pair as numpy arrays.

  Does nothing if a cache file for the pair already exists.
  """
  match_fname = f'{keypoint_cache_dir}/match_query_{query_image_id}_index_{index_image_id}.pkl'
  if ope(match_fname):
    return
  record = dict(
    matches0=matches0.cpu().numpy(),
    matches1=matches1.cpu().numpy(),
    matching_scores0=matching_scores0.data.cpu().numpy(),
    matching_scores1=matching_scores1.data.cpu().numpy(),
  )
  with open(match_fname, 'wb') as out_file:
    pickle.dump(record, out_file)

def generate_superpoint_superglue(args, test_image_id, test_image_path, train_image_id, train_image_path,
                                  test_image_dict, superpointglue_net, do_cache, keypoint_cache_dir):
  """Runs the SuperPoint/SuperGlue network on a (test, train) image pair.

  `test_image_dict` memoises the loaded test image and, after the first call,
  its keypoints/scores/descriptors; when present these are passed to the
  network as `keypoints0` / `scores0` / `descriptors0`. The dict is updated in
  place on every call.

  When `do_cache` and `args.store_keypoint` are both truthy, keypoints of both
  images and the matches are written to `keypoint_cache_dir`.

  Returns:
    The network's prediction dict, extended with `scales0` and `scales1`.
  """
  read_options = dict(resize=[800], rotation=0, resize_float=False)

  if test_image_id in test_image_dict:
    (test_image, test_inp, test_scales,
     test_keypoints, test_scores, test_descriptors) = test_image_dict[test_image_id]
  else:
    test_image, test_inp, test_scales = spg_read_image(test_image_path, **read_options)
    test_keypoints = test_scores = test_descriptors = None
  train_image, train_inp, train_scales = spg_read_image(train_image_path, **read_options)

  net_input = {'image0': test_inp, 'image1': train_inp}
  if test_keypoints is not None:
    net_input['keypoints0'] = test_keypoints
    net_input['scores0'] = test_scores
    net_input['descriptors0'] = test_descriptors
  pred = superpointglue_net(net_input)

  test_keypoints, test_scores, test_descriptors = (pred[key] for key in ('keypoints0', 'scores0', 'descriptors0'))
  train_keypoints, train_scores, train_descriptors = (pred[key] for key in ('keypoints1', 'scores1', 'descriptors1'))

  if do_cache and args.store_keypoint:
    save_cached_keypoints(keypoint_cache_dir, test_image_id,
                          test_keypoints, test_scores, test_descriptors, test_scales)
    save_cached_keypoints(keypoint_cache_dir, train_image_id,
                          train_keypoints, train_scores, train_descriptors, train_scales)
    save_cached_matches(keypoint_cache_dir, test_image_id, train_image_id,
                        pred['matches0'], pred['matches1'],
                        pred['matching_scores0'], pred['matching_scores1'])

  # Remember the test image together with its freshly returned features.
  test_image_dict[test_image_id] = (test_image, test_inp, test_scales,
                                    test_keypoints, test_scores, test_descriptors)

  pred['scales0'] = test_scales
  pred['scales1'] = train_scales
  return pred

In [None]:
def get_ransac_cache_dir(args):
  """Returns `(ransac_cache_dir, keypoint_cache_dir)` for `args.ransac_type`.

  The cache root is `/kaggle/working` on Kaggle and `{DATA_DIR}/cache`
  otherwise. A missing or empty `ransac_type` is treated as `degensac`; the
  comparison is case-insensitive. `keypoint_cache_dir` is None for types that
  store no keypoints. Both directories are created if necessary.

  Raises:
    ValueError: for an unknown `ransac_type`.
  """
  requested = args.ransac_type
  if requested is None or requested == '':
    kind = 'degensac'
  else:
    kind = requested.lower()

  # ransac type -> (inlier cache sub-directory, keypoint cache sub-directory or None)
  layout = {
    'degensac': ('ransac_1s/', None),
    'skransac': ('ransac_20191st_1s/', None),
    'superpointglue': ('ransac_superpointglue_l800_1s/', 'keypoint_superpoint_l800_1s/'),
    'ssp': ('ransac_ssp_l800_1s/', 'keypoint_superpoint_l800_1s/'),
  }
  if kind not in layout:
    raise ValueError(f'{args.ransac_type} error, only available [degensac, skransac, SuperPointGlue, ssp]')

  cache_root = '/kaggle/working' if args.kaggle else f'{DATA_DIR}/cache'
  ransac_subdir, keypoint_subdir = layout[kind]
  ransac_cache_dir = f'{cache_root}/{ransac_subdir}'
  keypoint_cache_dir = None if keypoint_subdir is None else f'{cache_root}/{keypoint_subdir}'

  os.makedirs(ransac_cache_dir, exist_ok=True)
  if keypoint_cache_dir is not None:
    os.makedirs(keypoint_cache_dir, exist_ok=True)
  return ransac_cache_dir, keypoint_cache_dir

def rescore_and_rerank_by_num_inliers(args, test_image_dir, train_image_dir,
                                      test_image_id, train_ids_labels_and_scores, ignore_global_score=False, do_sort=True,
                                      superpointglue_net=None, return_num_inliers=False, cache_num_inliers_dict=None):
  """Returns rescored and sorted training images by local feature extraction.

  For every `(train_image_id, label, global_score)` candidate the RANSAC inlier
  count against `test_image_id` is looked up (in-memory dict first, then the
  on-disk cache) or computed, and the candidate's score is replaced by
  `get_total_score(...)`.

  Args:
    args: Namespace; reads `kaggle`, `ransac_type`, `ransac_weight` (and the
      fields used by the helpers called here).
    test_image_dir: Directory of the query image.
    train_image_dir: Directory of the candidate images.
    test_image_id: Id of the query image.
    train_ids_labels_and_scores: List of `(train_id, label, global_score)`;
      MODIFIED IN PLACE (scores replaced, optionally sorted).
    ignore_global_score: If true, the total score uses only the inlier count
      with the default weight and default inlier cap.
    do_sort: Sort the list by total score, descending.
    superpointglue_net: Network passed to `generate_superpoint_superglue`.
    return_num_inliers: Return `(train_id, num_inliers)` pairs instead of the
      rescored list.
    cache_num_inliers_dict: Optional dict `(test_id, train_id) -> num_inliers`;
      read and updated in place when supplied.

  Returns:
    The rescored list, or the list of `(train_id, num_inliers)` pairs when
    `return_num_inliers` is set.
  """
  # Results are only persisted to disk when running locally.
  do_cache = not args.kaggle
  ransac_cache_dir, keypoint_cache_dir = get_ransac_cache_dir(args)
  cache_num_inliers_dict = dict() if cache_num_inliers_dict is None else cache_num_inliers_dict

  if args.kaggle:
    test_image_path = f'{test_image_dir}/{test_image_id[0]}/{test_image_id[1]}/{test_image_id[2]}/{test_image_id}.jpg'
  else:
    test_image_path = f'{test_image_dir}/{test_image_id}.jpg'
    if not ope(test_image_path):
      test_image_path = f'{DATA_DIR}/images/test/{test_image_id}.jpg'
  # Per-call memo of the loaded query image and its features, shared by all candidates.
  test_image_dict = {}

  ransac_inliers = []
  for i in range(len(train_ids_labels_and_scores)):
    train_image_id, label, global_score = train_ids_labels_and_scores[i]
    # Lookup order: in-memory dict, then on-disk cache, then compute.
    ransac_fname, num_inliers = None, cache_num_inliers_dict.get((test_image_id, train_image_id), None)
    if num_inliers is None:
      ransac_fname, num_inliers = get_cached_num_inliers(ransac_cache_dir, test_image_id, train_image_id)
    if num_inliers is None:

      if args.kaggle:
        train_image_path = f'{train_image_dir}/{train_image_id[0]}/{train_image_id[1]}/{train_image_id[2]}/{train_image_id}.jpg'
        if not ope(train_image_path):
          train_image_path = f'{train_image_dir}/{train_image_id}.jpg'
      else:
        train_image_path = f'{train_image_dir}/{train_image_id}.jpg'
        if not ope(train_image_path):
          train_image_path = f'{DATA_DIR}/images/test/{train_image_id}.jpg'

      # NOTE(review): only the 'ssp' type computes num_inliers in this function;
      # for any other type a cache miss appears to leave num_inliers as None,
      # which would then reach np.save / get_total_score -- confirm intent.
      if (args.ransac_type is not None) and (args.ransac_type.lower() == 'ssp'):
        match_data = load_cached_matches(keypoint_cache_dir, test_image_id, train_image_id)
        if match_data is None:
          # No cached matches: run the network on the pair.
          pred = generate_superpoint_superglue(args, test_image_id, test_image_path, train_image_id, train_image_path,
                                               test_image_dict, superpointglue_net, do_cache, keypoint_cache_dir)
          test_scales = pred['scales0']
          test_keypoints = copy.deepcopy(pred['keypoints0'])[0].cpu().numpy()

          train_scales = pred['scales1']
          train_keypoints = copy.deepcopy(pred['keypoints1'])[0].cpu().numpy()

          matches0 = pred['matches0'].cpu().numpy()[0]
        else:
          # Cached matches exist: the keypoints of both images are read from the
          # keypoint cache (assumed to be present whenever the match file is).
          test_keypoint_data = load_cached_keypoints(keypoint_cache_dir, test_image_id)
          test_keypoints, test_scales = test_keypoint_data['keypoints'], test_keypoint_data['scales']

          train_keypoint_data = load_cached_keypoints(keypoint_cache_dir, train_image_id)
          train_keypoints, train_scales = train_keypoint_data['keypoints'], train_keypoint_data['scales']

          matches0 = match_data['matches0'][0]

        # Multiply keypoints by the per-image scales, then reverse the column order.
        test_keypoints = test_keypoints * np.array([list(test_scales)])
        test_keypoints = test_keypoints[:, ::-1]
        train_keypoints = train_keypoints * np.array([list(train_scales)])
        train_keypoints = train_keypoints[:, ::-1]

        # matches0[i] is the matched train keypoint index for test keypoint i; values <= -1 mean "unmatched".
        valid0 = matches0 > -1
        test_keypoints = test_keypoints[valid0]
        train_keypoints = train_keypoints[matches0[valid0]]
        # Pairs are already aligned, so descriptor matching is skipped.
        num_inliers = compute_num_inliers(test_keypoints, None, train_keypoints, None, do_kdtree=False)
      if do_cache and ransac_fname is not None:
        np.save(ransac_fname, num_inliers)

    cache_num_inliers_dict[(test_image_id, train_image_id)] = num_inliers
    if ignore_global_score:
      total_score = get_total_score(num_inliers, 0.)
    else:
      # Inlier cap of 90 here instead of the module default.
      total_score = get_total_score(num_inliers, global_score, weight=args.ransac_weight, max_inlier_score=90)
    train_ids_labels_and_scores[i] = (train_image_id, label, total_score)
    ransac_inliers.append((train_image_id, num_inliers))
  if do_sort:
    train_ids_labels_and_scores.sort(key=lambda x: x[2], reverse=True)

  if return_num_inliers:
    return ransac_inliers
  else:
    return train_ids_labels_and_scores

In [None]:
def get_nolandmark_by_dbscan(test_ids, test_embeddings, nolandmark_ids, nolandmark_embeddings):
  """Clusters test and no-landmark embeddings together and reports cluster statistics per test id.

  Returns:
    DataFrame with one row per test id (input order) and the columns
    `ID`, `clusters`, `is_nolandmark`, `clusters_num` (cluster size),
    `nolandmark_num` (no-landmark members of the cluster) and
    `nolandmark_rate` (their ratio).
  """
  n_test = len(test_ids)

  # dbscan over the stacked embeddings; test rows first.
  stacked = np.vstack([test_embeddings, nolandmark_embeddings])
  cluster_ids = dbscan(eps=0.85, n_jobs=-1, min_samples=1).fit_predict(stacked)

  all_ids = np.r_[test_ids, nolandmark_ids]
  clusters_df = pd.DataFrame(data=np.c_[all_ids, cluster_ids], columns=[ID, 'clusters'])
  clusters_df['is_nolandmark'] = [0]*n_test + [1]*len(nolandmark_ids)

  cluster_stats = clusters_df.groupby('clusters')['is_nolandmark'].agg(['count', 'sum']).reset_index()
  cluster_stats.columns = ['clusters', 'clusters_num', 'nolandmark_num']
  cluster_stats['nolandmark_rate'] = cluster_stats['nolandmark_num'] / cluster_stats['clusters_num']

  test_rows = clusters_df[0: n_test]
  return test_rows.merge(cluster_stats, on='clusters', how='left')

def do_retrieval(args, labelmap, train_ids, train_embeddings,
                 test_embeddings, num_to_rerank, do_dba=False, gallery_set='index'):
  """Inner-product nearest-neighbour search of test embeddings against train embeddings.

  With `do_dba`, each train embedding is first replaced by a weighted sum of
  its 10 nearest train neighbours (weights log-spaced from 1 to 10**-1.5,
  normalised to sum to 1). When not on Kaggle the raw faiss results are also
  saved via `save_faiss_results`.

  Returns:
    For each test embedding, a list of `num_to_rerank` tuples
    `(train_id, labelmap[train_id], similarity)` in faiss result order.
  """
  if do_dba:
    dba_index = faiss.IndexFlatIP(train_embeddings.shape[1])
    dba_index.add(train_embeddings)
    dba_lens = 10
    weights = np.logspace(0, -1.5, dba_lens)
    weights /= np.sum(weights)
    _, neighbours = dba_index.search(train_embeddings, dba_lens)
    augmented = 0
    for rank in range(dba_lens):
      augmented = augmented + train_embeddings[neighbours[:, rank]] * weights[rank]
    train_embeddings = augmented

  faiss_index = faiss.IndexFlatIP(train_embeddings.shape[1])
  faiss_index.add(train_embeddings)
  D, I = faiss_index.search(test_embeddings, num_to_rerank)  # actual search
  if not args.kaggle:
    save_faiss_results(args, D, I, gallery_set, num_to_rerank, test_embeddings)

  results = []
  for neighbour_idxes, similarities in zip(I[:test_embeddings.shape[0]], D[:test_embeddings.shape[0]]):
    ranked = []
    for train_index, distance in zip(neighbour_idxes, similarities):
      train_id = train_ids[train_index]
      ranked.append((train_id, labelmap[train_id], distance))
    results.append(ranked)
  return results


def save_faiss_results(args, D, I, gallery_set, topn, df, suffix=''):
  """Saves faiss search results as two compressed .npz files.

  Files go to `{RESULT_DIR}/faiss/{args.out_dir}`; indices are stored under
  the key `i`, similarities under the key `d`. File names encode the epoch,
  the image scale/size, the gallery set, `topn` and `len(df)`.

  Args:
    args: Namespace; reads `out_dir`, `predict_epoch`, `scale`, `img_size`.
    D: Similarity matrix returned by the faiss search.
    I: Index matrix returned by the faiss search.
    gallery_set: Gallery name used in the file name.
    topn: Number of neighbours per query, used in the file name.
    df: Only its length is used (in the file name).
    suffix: Optional file-name suffix placed before `.npz`.

  Raises:
    ValueError: if both `args.scale` and `args.img_size` are None (previously
      this surfaced as an UnboundLocalError).
  """
  dataset = '%s_%s' % ('test', gallery_set)
  faiss_dir = f'{RESULT_DIR}/faiss/{args.out_dir}'
  os.makedirs(faiss_dir, exist_ok=True)

  # `scale` takes precedence over `img_size`, as in extract_global_features.
  if args.scale is not None:
    size_tag = args.scale
  elif args.img_size is not None:
    size_tag = args.img_size
  else:
    raise ValueError('either args.scale or args.img_size must be set')

  stem = f'{faiss_dir}/epoch{args.predict_epoch}_i{size_tag}_{dataset}_knn_top{topn}'
  I_fname = f'{stem}_i_{len(df)}{suffix}.npz'
  D_fname = f'{stem}_d_{len(df)}{suffix}.npz'
  np.savez_compressed(I_fname, i=I)
  np.savez_compressed(D_fname, d=D)

def get_retrieval_type(args, labelmap, train_ids, train_embeddings, test_embeddings, num_to_rerank):
  """Classifies every test embedding by how consistent its nearest train neighbours are.

  Types (first matching rule wins):
    1: at most 2 distinct labels and the lowest similarity > 0.9
    2: at most 2 distinct labels and the highest similarity > 0.85
    4: every neighbour has a different label
    3: everything else

  Returns:
    Dict mapping the test row index to its type.
  """
  faiss_index = faiss.IndexFlatIP(train_embeddings.shape[1])
  faiss_index.add(train_embeddings)
  D, I = faiss_index.search(test_embeddings, num_to_rerank)  # actual search

  n_test = test_embeddings.shape[0]
  test_retrieval_type = {}
  for test_index in tqdm(range(n_test), total=n_test):
    neighbour_labels = [labelmap[train_ids[i]] for i in I[test_index]]
    target_nunique = len(np.unique(neighbour_labels))
    similarities = D[test_index]
    max_score = np.max(similarities)
    min_score = np.min(similarities)

    few_labels = target_nunique <= 2
    if few_labels and min_score > 0.9:
      _type = 1
    elif few_labels and max_score > 0.85:
      _type = 2
    elif target_nunique == num_to_rerank:
      _type = 4
    else:
      _type = 3
    test_retrieval_type[test_index] = _type

  return test_retrieval_type

In [None]:
def do_rerank(args, local_model_tf, local_model_tf_constant, superpointglue_net, test_image_dir, predictions, test_ids, test_embeddings, rerank_topk=2000):
  """Pulls visually verified test images up next to higher-scoring test images.

  The `rerank_topk` highest-scoring predictions are visited in descending
  score order. For each one, the remaining lower-ranked test images are
  retrieved by embedding similarity and verified with RANSAC; images with more
  than `args.rerank_inliers_limit` inliers get their score set just below the
  current image's score (0.001 lower per rank) and are not visited themselves.

  Args:
    args: Namespace; reads `rerank_retrieval_num` and `rerank_inliers_limit`
      in addition to the fields used by the helpers.
    local_model_tf, local_model_tf_constant: Unused; kept so existing callers
      keep working.
    superpointglue_net: Network forwarded to the RANSAC re-scoring.
    test_image_dir: Directory of the test images.
    predictions: Dict `id -> {'score', 'class'}`; MODIFIED IN PLACE.
    test_ids: Array of test ids aligned with `test_embeddings`.
    test_embeddings: Array of test embeddings.
    rerank_topk: Number of top predictions to consider.

  Returns:
    The updated `predictions` dict.
  """
  predictions_df = pd.DataFrame.from_dict(predictions, orient='index', columns=['score', 'class'])
  predictions_df = predictions_df.reset_index().rename(columns={'index': ID})
  predictions_df = predictions_df.sort_values('score', ascending=False).reset_index(drop=True)
  predictions_df = predictions_df.head(rerank_topk)
  # Retrieval needs a label per gallery id; test images have none, so use -1.
  labelmap = {_id: -1 for _id in predictions_df[ID]}

  # Loop-invariant lookups, built once instead of on every iteration.
  test_id_list = test_ids.tolist()
  test_positions = pd.Series(index=test_ids, data=np.arange(len(test_ids)))
  ranked_ids = predictions_df[ID].values

  rerank_ids = []
  reranked = set()  # same content as rerank_ids, for O(1) membership tests
  for _idx, row in tqdm(predictions_df.iterrows(), desc='rerank', total=len(predictions_df)):
    search_id = row[ID]
    if search_id in reranked:
      continue
    search_idx = test_id_list.index(search_id)
    search_ebd = test_embeddings[search_idx]
    # Candidates: everything ranked below the current image that was not moved yet.
    query_ids = list(set(ranked_ids[_idx+1:].tolist()) - reranked)
    if len(query_ids) <= 0:
      continue
    query_idx = test_positions[query_ids].values
    query_ebds = test_embeddings[query_idx]

    train_ids_labels_and_scores = do_retrieval(args, labelmap, query_ids, query_ebds, search_ebd.reshape(1, -1), args.rerank_retrieval_num)

    # Both "test" and "train" images come from the test directory here. The
    # call matches the signature of rescore_and_rerank_by_num_inliers, which
    # takes no local-model arguments.
    ransac_inliers = rescore_and_rerank_by_num_inliers(args, test_image_dir, test_image_dir,
                                                       search_id, train_ids_labels_and_scores[0],
                                                       superpointglue_net=superpointglue_net, ignore_global_score=False,
                                                       do_sort=False, return_num_inliers=True)
    ransac_inliers = pd.DataFrame(ransac_inliers, columns=[ID, 'inliers'])
    ransac_inliers['inliers'] = ransac_inliers['inliers'].astype(int)
    ransac_inliers = ransac_inliers[ransac_inliers['inliers'] > args.rerank_inliers_limit]
    ransac_inliers = ransac_inliers.sort_values('inliers', ascending=False).reset_index(drop=True)
    verified_ids = ransac_inliers[ID].values.tolist()
    rerank_ids.extend(verified_ids)
    reranked.update(verified_ids)
    for _rank, verified_row in ransac_inliers.iterrows():
      _score = predictions[search_id]['score'] - (_rank+1) * 0.001
      predictions[verified_row[ID]]['score'] = _score
  print(f'rerank: {len(rerank_ids)}')
  return predictions

def detect_nolandmark(args, predictions, test_ids, test_image_dir):
  """Penalise test images in which the box detector finds no large enough box.

  For every id in `test_ids` the detector output is read from the on-disk
  box cache when present, otherwise computed (and cached unless
  `args.kaggle` is set). Boxes are filtered with `args.detect_thresh`; when
  the largest remaining box area is `<= args.detect_area` (or no box
  remains) the image's score in `predictions` is lowered by 2.

  Args:
      args: namespace providing `kaggle`, `detect_thresh` and `detect_area`.
      predictions: dict `image_id -> {'score': ..., ...}`; updated in place.
      test_ids: iterable of test image ids (strings).
      test_image_dir: root directory of the test images.

  Returns:
      The same `predictions` mapping, after the in-place updates.
  """
  nl_ids = []
  do_cache = not args.kaggle
  detect_cache_dir = f'{DATA_DIR}/cache/detect/'
  os.makedirs(detect_cache_dir, exist_ok=True)

  # load model
  detector_model_dir = ('/kaggle/input/pretrained/d2r_frcnn_20190411' if args.kaggle
                        else '/data5/data/pretrained/d2r_frcnn_20190411')
  detector_fn = detector.MakeDetector(detector_model_dir)

  for test_image_id in tqdm(test_ids, total=len(test_ids), desc='do detect'):
    if args.kaggle:
      # Kaggle layout shards images by the first three characters of the id.
      test_image_path = f'{test_image_dir}/{test_image_id[0]}/{test_image_id[1]}/{test_image_id[2]}/{test_image_id}.jpg'
    else:
      test_image_path = f'{test_image_dir}/{test_image_id}.jpg'
      if not ope(test_image_path):
        test_image_path = f'{DATA_DIR}/images/test/{test_image_id}.jpg'

    boxes_path = f'{detect_cache_dir}/{test_image_id}.boxes'
    if not ope(boxes_path):
      batch = np.expand_dims(np.array(utils.RgbLoader(test_image_path)), 0)
      (boxes_out, scores_out, class_indices_out) = detector_fn(batch)
      # The detector is fed a batch of one image; keep that image's result.
      boxes_out, scores_out, class_indices_out = boxes_out[0], scores_out[0], class_indices_out[0]
      if do_cache:
        box_io.WriteToFile(boxes_path, boxes_out, scores_out, class_indices_out)
    else:
      (boxes_out, scores_out, class_indices_out) = box_io.ReadFromFile(boxes_path)

    (selected_boxes, selected_scores, selected_class_indices) = \
      _FilterBoxesByScore(boxes_out, scores_out, class_indices_out, args.detect_thresh)

    max_area = 0
    if len(selected_boxes) > 0:
      widths = selected_boxes[:, 3] - selected_boxes[:, 1]
      heights = selected_boxes[:, 2] - selected_boxes[:, 0]
      max_area = (widths * heights).max()

    if max_area <= args.detect_area:
      nl_ids.append(test_image_id)
      entry = predictions[test_image_id]
      entry['score'] = entry['score'] - 2
  print(f'detect_nl: {len(nl_ids)}')
  return predictions

In [None]:
def post_process(args, local_model_tf, local_model_tf_constant, superpointglue_net, test_image_dir, predictions, labelmap, test_ids, test_embeddings, train_ids, train_embeddings, nolandmark_ids, nolandmark_embeddings):
  """Apply the optional, flag-controlled score adjustments to `predictions`.

  Each stage is enabled by an attribute of `args` and rewrites
  `predictions[image_id]['score']` in place. The stages run in this fixed
  order: DBSCAN no-landmark clustering, no-landmark retrieval, no-landmark
  retrieval v2, frequent-class rule, retrieval-type rule (`rule2`), box
  detector, and finally re-ranking.

  Args:
      args: namespace with the stage switches and their thresholds.
      local_model_tf, local_model_tf_constant, superpointglue_net: models that
          are only forwarded to `do_rerank`.
      test_image_dir: test image directory, forwarded to the detector and
          re-ranking stages.
      predictions: dict `image_id -> {'score': ..., 'class': ...}`.
      labelmap: mapping of train ids to labels, used by the retrieval stages.
      test_ids, test_embeddings: test image ids and their embeddings.
      train_ids, train_embeddings: index image ids and their embeddings.
      nolandmark_ids, nolandmark_embeddings: no-landmark gallery ids and
          embeddings.

  Returns:
      The adjusted `predictions` mapping.
  """
  if args.nolandmark_cluster_type != 0:
    # presumably returns one row per test id with 'nolandmark_num' and
    # 'nolandmark_rate' columns (those are the columns read below) - confirm
    nolandmark_preds = get_nolandmark_by_dbscan(test_ids, test_embeddings, nolandmark_ids, nolandmark_embeddings)
    if args.nolandmark_cluster_type == 1:
      # Type 1: hard reset of the score to 0 above the cluster-size limit.
      nolandmark_preds = nolandmark_preds[nolandmark_preds['nolandmark_num'] > args.nolandmark_cluster_num_limit]
      print(f'set {len(nolandmark_preds)} nolandmark')
      for index, row in nolandmark_preds.iterrows():
        predictions[row[ID]]['score'] = 0
    elif args.nolandmark_cluster_type in [2,3]:
      # Types 2/3: soft penalties; rows at or above the limit get the larger one.
      nolandmark_preds1 = nolandmark_preds[nolandmark_preds['nolandmark_num'] >= args.nolandmark_cluster_num_limit]
      print(f'0: set {len(nolandmark_preds1)} nolandmark')
      for index, row in nolandmark_preds1.iterrows():
        if args.nolandmark_cluster_type == 2:
          predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - 2 * row['nolandmark_rate'] - 0.1 * row['nolandmark_num']
        else:
          # NOTE(review): `math` is not imported in the visible import cell;
          # presumably it arrives through one of the star imports - confirm.
          predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - row['nolandmark_rate']*math.exp(row['nolandmark_rate'])

      # Rows below the limit get a penalty capped at 0.2.
      nolandmark_preds2 = nolandmark_preds[nolandmark_preds['nolandmark_num'] < args.nolandmark_cluster_num_limit]
      print(f'1: set {len(nolandmark_preds2)} nolandmark')
      for index, row in nolandmark_preds2.iterrows():
        predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - min(0.2, row['nolandmark_rate'])

  if args.nolandmark_retrieval:
    # Retrieve against train + no-landmark images together; no-landmark
    # gallery entries carry the label -1.
    nl_labelmap = copy.deepcopy(labelmap)
    for nl_id in nolandmark_ids:
      nl_labelmap[nl_id] = -1
    train_nl_ids = np.hstack((train_ids, nolandmark_ids))
    train_nl_embeddings = np.vstack((train_embeddings, nolandmark_embeddings))
    nl_train_ids_labels_and_scores = do_retrieval(args, nl_labelmap, train_nl_ids, train_nl_embeddings,
                                                  test_embeddings, args.num_to_rerank, gallery_set='nolandmark')
    nl_predictions = get_prediction_map(test_ids, nl_train_ids_labels_and_scores, args.num_to_rerank)
    nl_predictions_df = pd.DataFrame.from_dict(nl_predictions, orient='index', columns=['score', 'class']).reset_index().rename(columns={'index': ID})
    # Only test images whose predicted class is the no-landmark label are penalised.
    nl_predictions_df = nl_predictions_df[nl_predictions_df['class'] == -1]
    print(f'nl retrieval: set {len(nl_predictions_df)} nolandmark')
    for index, row in nl_predictions_df.iterrows():
      predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - row['score']

  # Ids that `rule2` (type 1) must skip; only filled by the retrieval2 stage.
  retrieval2_nl_ids = None
  if args.nolandmark_retrieval2:
    # Retrieve against the no-landmark gallery only.
    nl_labelmap = {}
    for nl_id in nolandmark_ids:
      nl_labelmap[nl_id] = -1
    nl_train_ids_labels_and_scores = do_retrieval(args, nl_labelmap, nolandmark_ids, nolandmark_embeddings,
                                                  test_embeddings, args.num_to_rerank, gallery_set='nolandmark')
    nl_train_ids_labels_and_scores = np.array(nl_train_ids_labels_and_scores)
    # Last field of the first three neighbours of every test image (the score,
    # assuming the (id, label, score) tuples used elsewhere in this file).
    nl_predictions_df = pd.DataFrame(nl_train_ids_labels_and_scores[:, :3, -1].astype(float), columns=['top1', 'top2', 'top3'])
    nl_predictions_df.insert(0, ID, test_ids)
    nl_predictions_df['top_mean'] = nl_predictions_df[['top1', 'top2', 'top3']].mean(axis=1)
    retrieval2_nl_ids = nl_predictions_df[nl_predictions_df['top1'] >= 0.55][ID].values
    nl_predictions_df = nl_predictions_df[nl_predictions_df['top3'] >= 0.3]
    print(f'nl retrieval2: set {len(nl_predictions_df)} nolandmark')
    for index, row in nl_predictions_df.iterrows():
      # `nolandmark_retrieval2_type` picks the penalty formula; values other
      # than 0-3 leave the score untouched.
      if args.nolandmark_retrieval2_type == 0:
        predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - row['top3'] * 1.5
      elif args.nolandmark_retrieval2_type == 1:
        predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - row['top3'] * 1.5
        if row['top3'] > 0.5:
          predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - 3
      elif args.nolandmark_retrieval2_type == 2:
        predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - (row['top3']-0.3) * 4
      elif args.nolandmark_retrieval2_type == 3:
        predictions[row[ID]]['score'] = predictions[row[ID]]['score'] - (row['top1']+row['top2']+row['top3'])*0.5

  if args.rule:
    # Classes predicted for more than `rule_limit` test images are penalised
    # by 2, except predictions scoring above `protect_score`.
    predictions_df = pd.DataFrame.from_dict(predictions, orient='index', columns=['score', 'class'])
    predictions_df = predictions_df.reset_index().rename(columns={'index':ID})
    predictions_vc = predictions_df.groupby('class')[ID].count()
    nl_classes = predictions_vc[predictions_vc > args.rule_limit].index.values
    c = 0
    for k in predictions.keys():
      if predictions[k]['class'] in nl_classes:
        if predictions[k]['score'] > args.protect_score:
          continue
        predictions[k]['score'] = predictions[k]['score'] - 2
        c = c + 1
    print(f'rule: set {c} nolandmark')

  if args.rule2:
    retrieval_type = get_retrieval_type(args, labelmap, train_ids, train_embeddings, test_embeddings, 5)
    retrieval_type_df = pd.DataFrame.from_dict(retrieval_type, orient='index', columns=['type']).reset_index().rename(columns={'index':ID})
    print(f'rule2:')
    print(retrieval_type_df['type'].value_counts())
    for index, row in retrieval_type_df.iterrows():
      # presumably `retrieval_type` is keyed by position in `test_ids`, since
      # the key is used as an index here - verify against get_retrieval_type
      _id = test_ids[row[ID]]
      if args.rule2_type == 1:
        if retrieval2_nl_ids is not None:
          if _id in retrieval2_nl_ids:
            continue
      # Bonus by retrieval type: 1 -> +3, 2 -> +1, 3 -> +0.5.
      if row['type'] == 1:
        predictions[_id]['score'] = predictions[_id]['score'] + 3
      elif row['type'] == 2:
        predictions[_id]['score'] = predictions[_id]['score'] + 1
      elif row['type'] == 3:
        predictions[_id]['score'] = predictions[_id]['score'] + 0.5

  if args.detect_nl:
    predictions = detect_nolandmark(args, predictions, test_ids, test_image_dir)

  if args.rerank:
    predictions = do_rerank(args, local_model_tf, local_model_tf_constant, superpointglue_net, test_image_dir, predictions, test_ids, test_embeddings, rerank_topk=args.rerank_num)

  return predictions


In [None]:
def do_filter_index(args, train_ids, train_embeddings, nolandmark_ids, nolandmark_embeddings):
  """Drop index images that sit too close to the no-landmark gallery.

  Every train embedding is used as a query against the no-landmark gallery
  (3 neighbours). A train image is kept when the last field of its third
  neighbour is `<= args.filter_index_thresh`.

  Returns:
      `(train_ids, train_embeddings)` restricted to the kept rows.
  """
  nl_labelmap = {nl_id: -1 for nl_id in nolandmark_ids}
  neighbours = np.array(
    do_retrieval(args, nl_labelmap, nolandmark_ids, nolandmark_embeddings, train_embeddings, 3))

  score_cols = ['top1', 'top2', 'top3']
  top_scores = pd.DataFrame(neighbours[:, :, -1].astype(float), columns=score_cols)
  top_scores.insert(0, ID, train_ids)
  top_scores['top_mean'] = top_scores[score_cols].mean(axis=1)

  kept = top_scores[top_scores['top3'] <= args.filter_index_thresh]
  print(f'nl filter_index: set {len(kept)} not nolandmark')

  # The frame keeps its default RangeIndex, so index labels are row positions.
  keep_rows = kept.index.values
  return train_ids[keep_rows], train_embeddings[keep_rows]


def GAP_vector(pred, conf, true):
  '''
  Global Average Precision (micro AP) over a set of single-label predictions.

  Rows are ranked by descending confidence (missing confidences last). The
  score is the sum, over correct rows, of the precision at that rank,
  divided by the number of non-null ground-truth labels.

  Args:
      pred: vector of integer-coded predictions
      conf: vector of probability or confidence scores for pred
      true: vector of integer-coded ground-truth labels; use None/np.nan
          for "no label"

  Returns:
      GAP score
  '''
  ranked = pd.DataFrame({'pred': pred, 'conf': conf, 'true': true}).sort_values(
    'conf', ascending=False, na_position='last')
  hits = (ranked['true'] == ranked['pred']).astype(int)
  # Precision at rank k = correct predictions so far / k.
  precision_at_k = hits.cumsum() / (np.arange(len(ranked)) + 1)
  return (precision_at_k * hits).sum() / ranked['true'].count()

In [None]:
def generate_Xy(test_df, topk_labels, topk_scores, topk=5):
  """Build the base LightGBM feature frame and the per-row candidate labels.

  For each row the neighbour scores are summed per label and the `topk`
  labels with the largest sums are kept (padded with label -1 / score 0).

  Args:
      test_df: frame with an `ID` column and optionally a `CTARGET` column.
      topk_labels: 2-D array of neighbour labels, one row per test image.
      topk_scores: 2-D array of neighbour scores, same layout.
      topk: number of candidate labels kept per row.

  Returns:
      `(topk_df, topk_labels)`: the feature frame (ID, CTARGET, summed scores,
      `top0_top1`, `y`) and an int32 array with the kept labels per row.
  """
  # generate X
  ranked_rows = []
  for i in range(topk_scores.shape[0]):
    label_totals = Counter()
    for j, score in enumerate(topk_scores[i]):
      label_totals[topk_labels[i, j]] += score
    best = label_totals.most_common(topk)
    # Pad short rows; a non-positive multiplier yields an empty list.
    best += [(-1, 0.)] * (topk - len(best))
    ranked_rows.append(best)
  stacked = np.array(ranked_rows)
  topk_X = stacked[:, :, 1]
  topk_labels = stacked[:, :, 0].astype('int32')

  topk_df = pd.DataFrame(data=topk_X, columns=[f'top{i}_sum_score' for i in range(topk)])
  topk_df['top0_top1'] = topk_df['top0_sum_score'] - topk_df['top1_sum_score']
  topk_df.insert(0, ID, test_df[ID].values)
  has_target = CTARGET in test_df.columns
  topk_df.insert(1, CTARGET, test_df[CTARGET].values if has_target else None)

  # generate y
  # topk + nl + other
  topk_df['y'] = None
  if has_target:
    # NOTE(review): both the 'nan' target and the "not among candidates" case
    # get label topk + 1, so label `topk` is never produced here - confirm
    # this is intended.
    ys = []
    for ctarget, row_labels in zip(test_df[CTARGET], topk_labels):
      if not ('nan' == ctarget) and ctarget in row_labels:
        ys.append(np.where(row_labels == ctarget)[0][0])
      else:
        ys.append(topk + 1)
    topk_df['y'] = ys
  return topk_df, topk_labels

def add_topn_features(feats_df, topn_labels, labels_and_scores, prefix, topk, model_idx):
  """Add, per candidate label, the summed neighbour score from one model.

  Args:
      feats_df: feature frame; new columns are added to it in place.
      topn_labels: per-row candidate labels (rows x candidates).
      labels_and_scores: 3-D array whose last axis holds the label at
          position 1 and the score at position 2.
      prefix: name fragment placed in the new column names.
      topk: number of candidate columns to write.
      model_idx: model number placed in the new column names.

  Returns:
      `feats_df` with columns `m{model_idx}_{prefix}_top{i}_score` added.
  """
  neighbour_labels = labels_and_scores[:, :, 1].astype('int32')
  neighbour_scores = labels_and_scores[:, :, 2].astype('float32')

  per_row = []
  for i in range(neighbour_scores.shape[0]):
    totals = Counter()
    for j, score in enumerate(neighbour_scores[i]):
      totals[neighbour_labels[i, j]] += score
    # A candidate absent from the neighbours reads as 0 from the Counter.
    per_row.append([(candidate, totals[candidate]) for candidate in topn_labels[i]])

  summed = np.array(per_row)[:, :, 1]
  for rank in range(topk):
    feats_df[f'm{model_idx}_{prefix}_top{rank}_score'] = summed[:, rank]
  return feats_df

def add_features(args, feats_df, topn_labels,
                 retrieval_train_ids_labels_and_scores,
                 ransac_train_ids_labels_and_scores,
                 nolandmark_ids_labels_and_scores,
                 model_idx):
  """Add per-candidate retrieval statistics for one model to `feats_df`.

  For each of the `args.top_k` candidate labels this writes the max and the
  mean of the neighbour scores whose label equals the candidate (scores of
  other neighbours count as 0), then the gap between the top-0 and top-1
  maxima, and finally mean/max/std of each max column grouped by the top-0
  candidate label.

  `ransac_train_ids_labels_and_scores` is accepted but not used; the
  no-landmark array is only used for a length check.

  Returns:
      `feats_df`, modified in place.
  """
  neighbour_labels = retrieval_train_ids_labels_and_scores[:, :, 1].astype('int32')
  neighbour_scores = retrieval_train_ids_labels_and_scores[:, :, 2].astype('float32')
  nolandmark_scores = nolandmark_ids_labels_and_scores[:, :, 2].astype('float32')
  assert len(feats_df)==len(topn_labels)
  assert len(neighbour_labels)==len(topn_labels)
  assert len(neighbour_labels)==len(nolandmark_scores)

  for rank in range(args.top_k):
    # rows x neighbours mask: neighbour label equals this rank's candidate.
    hit_mask = np.array([topn_labels[:, rank] == neighbour_labels[:, j]
                         for j in range(args.num_to_rerank)]).T
    masked_scores = neighbour_scores * hit_mask
    feats_df[f'm{model_idx}_retrieval_top{rank}_max'] = np.max(masked_scores, axis=1)
    feats_df[f'm{model_idx}_retrieval_top{rank}_mean'] = np.mean(masked_scores, axis=1)

  top0_max = feats_df[f'm{model_idx}_retrieval_top0_max']
  top1_max = feats_df[f'm{model_idx}_retrieval_top1_max']
  feats_df[f'm{model_idx}_retrieval_top0_top1_max'] = top0_max - top1_max

  # groupby
  for rank in range(args.top_k):
    # Temporary grouping key, removed again at the end of the iteration.
    feats_df[f'top0'] = topn_labels[:, 0]
    source_col = f'm{model_idx}_retrieval_top{rank}_max'
    for stat in ('mean', 'max', 'std'):
      feats_df[f'm{model_idx}_gp_top{rank}_retrieval_{stat}'] = \
        feats_df.groupby([f'top0'])[source_col].transform(stat)
    del feats_df[f'top0']
  return feats_df

def add_multi_models_features(feats_df, train_labels_scores_list, model_num, topk):
  """Add across-model mean and std of three per-model feature families.

  For every rank below `topk` and every family (`nol_top{rank}_score`,
  `retrieval_top{rank}_max`, `retrieval_top{rank}_score`) the columns
  `m{m}_<family>` of all `model_num` models are reduced row-wise into
  `<family>_mean` and `<family>_std`.

  `train_labels_scores_list` is accepted but not used.

  Returns:
      `feats_df`, modified in place.
  """
  for top in range(topk):
    families = (
      f'nol_top{top}_score',
      f'retrieval_top{top}_max',
      f'retrieval_top{top}_score',
    )
    for family in families:
      per_model = feats_df[[f'm{m}_{family}' for m in range(model_num)]]
      feats_df[f'{family}_mean'] = per_model.mean(axis=1)
      feats_df[f'{family}_std'] = per_model.std(axis=1)
  return feats_df

def add_nl_features(feats_df, nolandmark_ids_labels_and_scores, model_idx, suffix=''):
  """Copy the no-landmark neighbour scores of one model into `feats_df`.

  One column `m{model_idx}_nol{suffix}_top{i}_score` is written per
  neighbour position, taken from index 2 of the input's last axis.

  Returns:
      `feats_df`, modified in place.
  """
  nl_scores = nolandmark_ids_labels_and_scores[:, :, 2].astype('float32')
  for rank in range(nl_scores.shape[1]):
    feats_df[f'm{model_idx}_nol{suffix}_top{rank}_score'] = nl_scores[:, rank]
  return feats_df

def get_lgb_params(num_class=5):
  """Return the LightGBM parameter dict used for the multiclass stacker.

  Args:
      num_class: number of target classes, stored under 'num_class'.

  Returns:
      A fresh dict on every call.
  """
  return dict(
    boosting_type='gbdt',
    objective='multiclass',
    metric='multi_logloss',

    learning_rate=0.1,
    num_leaves=7,  # 2^max_depth - 1
    max_depth=3,  # -1 means no limit
    min_child_samples=100,  # minimum number of rows in a leaf (min_data_in_leaf)
    max_bin=255,  # number of histogram bins per feature
    subsample=0.8,  # row subsample ratio
    subsample_freq=1,  # subsample every iteration; <=0 disables it
    colsample_bytree=0.8,  # column subsample ratio per tree
    min_child_weight=0,  # minimum hessian sum in a leaf
    num_boost_round=300,
    early_stopping_rounds=50,
    num_threads=8,
    num_class=num_class,
    verbose=-1,
  )

def get_split_ix(df, n_splits, fold, random_state=100):
  """Return `(train_indices, valid_indices)` of one shuffled K-fold split.

  The split is computed over `df[ID].values`. When `fold` does not match any
  split number the function returns None.
  """
  splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  numbered_splits = enumerate(splitter.split(df[ID].values))
  return next((pair for number, pair in numbered_splits if number == fold), None)

def get_train_val_data(df_trainval, n_splits, fold, random_state=100):
  """Split `df_trainval` into the train and validation frames of one fold.

  Row positions come from `get_split_ix` with the same arguments.

  Returns:
      `(df_train, df_val)`, selected positionally with `.iloc`.
  """
  fit_rows, holdout_rows = get_split_ix(df_trainval, n_splits, fold, random_state=random_state)
  return df_trainval.iloc[fit_rows], df_trainval.iloc[holdout_rows]

def train_model(args, params, lgbm_feats_df, feat_cols, folds_num, fold, random_state=100, topk=5):
  """Train one LightGBM booster on one fold of the feature frame.

  Args:
      args: unused here; kept for a uniform call signature.
      params: LightGBM parameters; must contain 'num_boost_round' and
          'early_stopping_rounds'. Its 'metric' entry is overwritten in place.
      lgbm_feats_df: feature frame with the columns in `feat_cols` and 'y'.
      feat_cols: list of feature column names.
      folds_num: number of folds.
      fold: fold number used for validation.
      random_state: seed of the fold split.
      topk: unused here.

  Returns:
      The trained booster returned by `lgb.train`.
  """
  fit_df, holdout_df = get_train_val_data(lgbm_feats_df, folds_num, fold, random_state=random_state)
  fit_X, fit_y = fit_df[feat_cols], fit_df['y']
  holdout_X, holdout_y = holdout_df[feat_cols], holdout_df['y']
  print(fit_X.shape, holdout_X.shape)

  fit_set = lgb.Dataset(fit_X.values, fit_y.values, feature_name=feat_cols)
  holdout_set = lgb.Dataset(holdout_X.values, holdout_y.values, feature_name=feat_cols)

  params['metric'] = 'multi_logloss'
  # NOTE(review): evals_result / early_stopping_rounds / verbose_eval are
  # keyword arguments of older LightGBM releases - confirm the pinned version.
  return lgb.train(params,
                   fit_set,
                   valid_sets=[fit_set, holdout_set],
                   valid_names=['train', 'valid'],
                   evals_result={},
                   num_boost_round=params['num_boost_round'],
                   early_stopping_rounds=params['early_stopping_rounds'],
                   verbose_eval=10,
                   feval=None)

def get_lgbm_prediction_map(probs, img_ids, topk_labels, topk=5, show=True):
  """Turn class probabilities into a `{image_id: {'score', 'class'}}` map.

  The argmax class picks a candidate column of `topk_labels`. Any argmax
  above 1 is redirected to candidate 0, and so is a choice whose candidate
  label is -1. Rows whose argmax equals `topk` get the score
  `1 - max(probs)` instead of the chosen candidate's probability.

  Args:
      probs: 2-D array of class probabilities (rows x classes).
      img_ids: image ids, one per row.
      topk_labels: 2-D array of candidate labels (rows x candidates).
      topk: class number treated as "no landmark".
      show: print the class histogram before and after redirection.

  Returns:
      dict mapping each image id to its score and predicted label.
  """
  rows = np.arange(len(img_ids))
  choice = np.argmax(probs, axis=1)
  if show:
    print('pred result')
    print(pd.Series(choice).value_counts().sort_index())

  # Remember the "no landmark" rows before the choices are redirected.
  is_nl = choice == topk
  # NOTE(review): the threshold is the literal 1, not `topk` - confirm intent.
  choice[choice > 1] = 0
  choice[topk_labels[rows, choice] == -1] = 0  # none use top1
  if show:
    print('final pred result')
    print(pd.Series(choice).value_counts().sort_index())

  chosen_labels = topk_labels[rows, choice]
  chosen_scores = probs[rows, choice]
  chosen_scores[is_nl] = (1-np.max(probs, axis=1))[is_nl]
  return {
    img_id: {'score': score, 'class': label}
    for img_id, score, label in zip(img_ids, chosen_scores, chosen_labels)
  }

def save_model(clf, model_fpath):
  """Pickle `clf` to the file at `model_fpath`, overwriting any existing file."""
  with open(model_fpath, 'wb') as sink:
    pickle.Pickler(sink).dump(clf)

def load_model(model_fpath):
  """Unpickle and return the object stored at `model_fpath`.

  NOTE(review): a later cell defines another `load_model(args, en_cfg)`, which
  replaces this name once that cell has run. Only unpickle trusted files.
  """
  with open(model_fpath, 'rb') as source:
    return pickle.Unpickler(source).load()

def do_metric(args, pred_probs, pred_img_ids, pred_labels, targets):
  """Score one set of class probabilities with the GAP metric.

  The probabilities are first converted with `get_lgbm_prediction_map`
  (`topk=args.top_k`, no printing); the resulting labels and scores, in the
  order of `pred_img_ids`, are then compared to `targets` by `GAP_vector`.

  Returns:
      The GAP score.
  """
  predictions = get_lgbm_prediction_map(pred_probs, pred_img_ids, pred_labels, topk=args.top_k, show=False)
  ordered = [predictions[image_id] for image_id in pred_img_ids]
  labels = [entry['class'] for entry in ordered]
  confidences = [entry['score'] for entry in ordered]
  return GAP_vector(np.array(labels).astype(float), np.array(confidences), targets.astype(float))

def do_lgbm(args, test_df,
            en_train_ids_labels_and_scores,
            train_labels_scores_list,
            nolandmark_labels_scores_list,
            ransac_labels_scores_list):
  """Build stacking features and train/apply the LightGBM ensemble.

  Ten feature-shuffled models are each trained (or loaded) over five folds.
  In validation mode (`args.do_valid`) out-of-fold probabilities are
  produced, the five best models by GAP are selected and their indices are
  saved; in test mode (`args.do_test`) fold predictions are averaged and the
  saved selection is reused.

  Args:
      args: namespace with `kaggle`, `lgb_model_dir`, `out_dir`, `top_k`,
          `valid_num`, `do_train`, `do_valid`, `do_test` (plus whatever the
          feature helpers read).
      test_df: frame with an `ID` column and optionally `CTARGET`.
      en_train_ids_labels_and_scores: ensemble retrieval result per test
          image; label at position 1 and score at position 2 of each entry.
      train_labels_scores_list: per-model retrieval results.
      nolandmark_labels_scores_list: per-model no-landmark retrieval results.
      ransac_labels_scores_list: per-model RANSAC results; only entry 0 is
          read.

  Returns:
      `(pred_probs, pred_img_ids, pred_labels)`; `pred_probs` is None when
      neither `do_valid` nor `do_test` is set.
  """
  if args.kaggle:
    lgbm_dir = f'{args.lgb_model_dir}/lgbm/{args.out_dir}'
  else:
    lgbm_dir = f'{RESULT_DIR}/models/lgbm/{args.out_dir}'
  os.makedirs(lgbm_dir, exist_ok=True)

  # generate features
  en_train_ids_labels_and_scores = np.array(en_train_ids_labels_and_scores)
  topk_labels = en_train_ids_labels_and_scores[:, :, 1].astype('int32')
  topk_scores = en_train_ids_labels_and_scores[:, :, 2].astype('float32')

  feats_df, topn_labels = generate_Xy(test_df, topk_labels, topk_scores, topk=args.top_k)
  print(feats_df['y'].value_counts().sort_index())

  M = len(train_labels_scores_list)
  for i in range(M):
    retrieval_train_ids_labels_and_scores = np.array(train_labels_scores_list[i])
    nolandmark_ids_labels_and_scores = np.array(nolandmark_labels_scores_list[i])

    # RANSAC features exist only for model 0.
    if i == 0:
      ransac_train_ids_labels_and_scores = np.array(ransac_labels_scores_list[i])
      feats_df = add_topn_features(feats_df, topn_labels, ransac_train_ids_labels_and_scores, 'ransac', args.top_k, i)
    else:
      ransac_train_ids_labels_and_scores = None

    feats_df = add_topn_features(feats_df, topn_labels, retrieval_train_ids_labels_and_scores, 'retrieval', args.top_k, i)
    feats_df = add_nl_features(feats_df, nolandmark_ids_labels_and_scores, i)
    feats_df = add_features(args, feats_df, topn_labels,
                            retrieval_train_ids_labels_and_scores,
                            ransac_train_ids_labels_and_scores,
                            nolandmark_ids_labels_and_scores, i)
  feats_df = add_multi_models_features(feats_df, train_labels_scores_list, model_num=M, topk=args.top_k)

  # prepare data
  folds_num = 5
  models_num = 10
  num_class = args.top_k + 2  # topk + nl + other
  params = get_lgb_params(num_class=num_class)
  base_feat_cols = [col for col in feats_df.columns if col not in [ID, CTARGET, 'y']]

  # Fixed seed so the per-model feature order and fold seeds are reproducible
  # (the saved model file names and selection depend on them).
  np.random.seed(100)
  model_feat_cols = []
  model_random_states = []
  for model_idx in range(models_num):
    feat_cols = np.random.choice(base_feat_cols, size=int(1.0 * len(base_feat_cols)), replace=False).tolist()
    model_feat_cols.append(feat_cols)
    model_random_states.append(np.random.randint(0, 100000))

  # train and predict
  feat_imp_list = []
  base_pred_probs = []
  for model_idx in range(models_num):
    feat_cols = model_feat_cols[model_idx]
    random_state = model_random_states[model_idx]
    print(f'model{model_idx} - random_state{random_state}')
    print(len(feat_cols), feat_cols)
    pred_probs = np.zeros((len(feats_df), num_class))
    feat_imp = pd.Series(index=feat_cols, data=0.)
    for fold_idx in range(folds_num):
      print(fold_idx, '*' * 50)
      model_fpath = f'{lgbm_dir}/m{models_num}.{model_idx}_f{folds_num}.{fold_idx}_top{args.top_k}_feats{len(feat_cols)}_{args.valid_num}.pkl'
      if args.do_train:
        clf = train_model(args, params, feats_df, feat_cols, folds_num, fold_idx, random_state=random_state, topk=args.top_k)
        save_model(clf, model_fpath)
      else:
        # The name `load_model` is redefined by a later cell as the torch
        # network loader taking (args, en_cfg), so calling it with a single
        # path fails once the whole notebook has run. Read the pickled
        # booster directly instead. Only trusted model files must be used.
        with open(model_fpath, 'rb') as model_file:
          clf = pickle.load(model_file)

      fold_feat_imp = pd.Series(data=clf.feature_importance(), index=clf.feature_name())
      feat_imp += fold_feat_imp / float(folds_num)

      if args.do_valid:
        # Out-of-fold prediction: each row is filled by the fold that held it out.
        _, valid_indices = get_split_ix(feats_df, folds_num, fold_idx, random_state=random_state)
        valid_probs = clf.predict(feats_df.iloc[valid_indices][feat_cols])
        pred_probs[valid_indices] = valid_probs
      elif args.do_test:
        # Test prediction: average of the fold models.
        test_probs = clf.predict(feats_df[feat_cols])
        pred_probs += test_probs / folds_num
    feat_imp_list.append(feat_imp)
    base_pred_probs.append(pred_probs)

  pred_img_ids = feats_df[ID].values
  pred_labels = topn_labels
  pred_targets = feats_df[CTARGET].values

  # select models
  selected_model_fpath = f'{lgbm_dir}/selected_indices.npy'
  if args.do_valid:
    scores = []
    for pred_probs in base_pred_probs:
      score = do_metric(args, pred_probs, pred_img_ids, pred_labels, pred_targets)
      scores.append(score)
    scores = np.array(scores)
    print('selected before: [', ', '.join([f'{s:.4f}' for s in scores.tolist()]) + ']')
    selected_idxes = np.argsort(scores)[::-1][:5] # top5
    print('selected after: [', ', '.join([f'{s:.4f}' for s in scores[selected_idxes].tolist()]) + ']')
    np.save(selected_model_fpath, selected_idxes)

    pred_probs = np.mean(np.array(base_pred_probs)[selected_idxes], axis=0)
  elif args.do_test:
    selected_idxes = np.load(selected_model_fpath)
    pred_probs = np.mean(np.array(base_pred_probs)[selected_idxes], axis=0)
  else:
    selected_idxes = np.arange(models_num)
    pred_probs = None

  # feature importance
  feat_imp = pd.Series(index=base_feat_cols, data=0.)
  for selected_idx in selected_idxes:
    feat_imp += feat_imp_list[selected_idx].reindex(index=base_feat_cols).fillna(0) / len(selected_idxes)
  print(feat_imp.sort_values(ascending=False)[:50])

  return pred_probs, pred_img_ids, pred_labels

In [None]:
def get_img_ids(test_ids, sub_test_ids):
  """Return the reference id array, checking that a new one agrees with it.

  The first call (reference still None) adopts `sub_test_ids`; later calls
  assert that `sub_test_ids` equals the reference and return the reference.
  """
  if test_ids is None:
    return sub_test_ids
  assert np.array_equal(test_ids, sub_test_ids)
  return test_ids

def norm(test_embeddings):
  """Scale every row by its L2 norm, with `EPS` added to the divisor.

  Returns a new array; the input is not modified.
  """
  row_lengths = np.linalg.norm(test_embeddings, ord=2, axis=1, keepdims=True)
  return test_embeddings / (row_lengths + EPS)

def merge_retrieval(train_ids_labels_and_scores, sub_train_ids_labels_and_scores, weight=1.0):
  """Accumulate one model's weighted retrieval scores into a running result.

  Both arguments hold, per test image, a list of `(train_id, label, score)`
  entries. The scores of `sub_train_ids_labels_and_scores` are multiplied by
  `weight` (rewriting that container in place). When the running result is
  None the weighted container itself is returned; otherwise the scores are
  summed per train id and each per-image list is re-sorted by descending
  score inside the running result, which is then returned.
  """
  for query_pos, neighbours in enumerate(sub_train_ids_labels_and_scores):
    sub_train_ids_labels_and_scores[query_pos] = [
      (train_id, int(label), float(score) * weight) for train_id, label, score in neighbours
    ]

  if train_ids_labels_and_scores is None:
    return sub_train_ids_labels_and_scores

  for query_pos, additions in enumerate(sub_train_ids_labels_and_scores):
    merged = {
      train_id: (train_id, int(label), float(score))
      for train_id, label, score in train_ids_labels_and_scores[query_pos]
    }
    for train_id, label, score in additions:
      # Ids not seen before start from a score of 0.
      running = merged.get(train_id, (None, None, 0.))[-1]
      merged[train_id] = (train_id, int(label), running + float(score))
    train_ids_labels_and_scores[query_pos] = sorted(merged.values(), key=lambda entry: -entry[-1])
  return train_ids_labels_and_scores

def np_save(fname, v, kaggle=False):
  """Write `v` to `fname` with `np.save`, unless `kaggle` is truthy (then do nothing)."""
  if kaggle:
    return
  np.save(fname, v)

def get_predictions(args, en_cfgs, superpointglue_net,
                    labelmap, train_df, test_df, nolandmark_df,
                    test_image_dir, train_image_dir, nolandmark_image_dir):
  """Run the full prediction pipeline for a set of ensemble members.

  Steps, in order: extract global embeddings for test / train / no-landmark
  images with every model in `en_cfgs`; build a weighted, concatenated,
  re-normalised ensemble embedding and put it at position 0 of each
  embedding list; run retrieval against the index and the no-landmark
  gallery for every embedding set; run local-feature (RANSAC) rescoring for
  set 0; merge the per-model retrieval results into an ensemble ranking and
  rescore it; finally feed everything to `do_lgbm` and convert the
  probabilities with `get_lgbm_prediction_map`.

  Args:
      args: global namespace (reads `kaggle`, `overwrite`, `out_dir`,
          `valid_num`, `num_to_rerank`, `ransac`, `ransac_parts`,
          `ransac_part`, `ransac_type`, `top_k`).
      en_cfgs: list of per-model config dicts; each provides 'net',
          'out_dir', 'predict_epoch', 'batch_size', 'preprocessing',
          'weight' and optionally 'scale' / 'img_size'.
      superpointglue_net: matcher forwarded to the rescoring helper.
      labelmap: mapping of train ids to labels for retrieval.
      train_df, test_df, nolandmark_df: image lists of the three sets.
      test_image_dir, train_image_dir, nolandmark_image_dir: image roots.

  Returns:
      dict `image_id -> {'score': ..., 'class': ...}`.
  """
  test_ids, test_embeddings_list = None, []
  train_ids, train_embeddings_list = None, []
  nolandmark_ids, nolandmark_embeddings_list = None, []

  with torch.no_grad():
    """Gets predictions using embedding similarity and local feature reranking."""

    for en_cfg in en_cfgs:
      net = en_cfg['net']
      # Per-model argument namespace handed to the feature extractor.
      _args = Namespace(**{
        'out_dir': en_cfg['out_dir'],
        'kaggle': args.kaggle,
        'scale': en_cfg.get('scale', None),
        'img_size': en_cfg.get('img_size', None),
        'predict_epoch': en_cfg['predict_epoch'],
        'batch_size': en_cfg['batch_size'],
        'preprocessing': en_cfg['preprocessing'],
        'overwrite': args.overwrite,
      })

      sub_test_ids, sub_test_embeddings = extract_global_features(_args, net, test_df, test_image_dir,
                                                                  dataset='test')
      sub_train_ids, sub_train_embeddings = extract_global_features(_args, net, train_df, train_image_dir,
                                                                    dataset='train')
      sub_nolandmark_ids, sub_nolandmark_embeddings = extract_global_features(_args, net, nolandmark_df,
                                                                      nolandmark_image_dir, dataset='nolandmark')
      # Every model must return the ids in the same order (asserted inside).
      test_ids = get_img_ids(test_ids, sub_test_ids)
      train_ids = get_img_ids(train_ids, sub_train_ids)
      nolandmark_ids = get_img_ids(nolandmark_ids, sub_nolandmark_ids)
      test_embeddings_list.append(sub_test_embeddings)
      train_embeddings_list.append(sub_train_embeddings)
      nolandmark_embeddings_list.append(sub_nolandmark_embeddings)

  # Ensemble embedding: weight each model's embedding, concatenate along the
  # feature axis, then re-normalise each row.
  en_test_embeddings = []
  en_train_embeddings = []
  en_nolandmark_embeddings = []
  for i,en_cfg in enumerate(en_cfgs):
    en_test_embeddings.append(test_embeddings_list[i] * en_cfg['weight'])
    en_train_embeddings.append(train_embeddings_list[i] * en_cfg['weight'])
    en_nolandmark_embeddings.append(nolandmark_embeddings_list[i] * en_cfg['weight'])

  en_test_embeddings = norm(np.concatenate(en_test_embeddings, axis=1))
  print('test_embeddings shape', en_test_embeddings.shape)
  en_train_embeddings = norm(np.concatenate(en_train_embeddings, axis=1))
  print('train_embeddings shape', en_train_embeddings.shape)
  en_nolandmark_embeddings = norm(np.concatenate(en_nolandmark_embeddings, axis=1))
  print('nolandmark_embeddings shape', en_nolandmark_embeddings.shape)

  # Position 0 of each list is the ensemble; position i (> 0) is en_cfgs[i-1].
  test_embeddings_list.insert(0, en_test_embeddings)
  train_embeddings_list.insert(0, en_train_embeddings)
  nolandmark_embeddings_list.insert(0, en_nolandmark_embeddings)

  nolandmark_labelmap = dict([(i, -1) for i in nolandmark_ids])
  train_labels_scores_list = []
  nolandmark_labels_scores_list = []
  ransac_labels_scores_list = []
  en_train_ids_labels_and_scores = None

  cache_dir = f'{RESULT_DIR}/cache/{args.out_dir}/'
  if not args.kaggle:
    os.makedirs(cache_dir, exist_ok=True)
  for i in tqdm(range(len(test_embeddings_list))):
    train_embeddings = train_embeddings_list[i]
    test_embeddings = test_embeddings_list[i]
    nolandmark_embeddings = nolandmark_embeddings_list[i]

    retrieval_fname = f'{cache_dir}/m{i}_retrieval_{args.valid_num}.npy'
    # `and False` disables reading this cache; the result is always recomputed
    # (and still written when not on Kaggle).
    if ope(retrieval_fname) and False:
      print('load', retrieval_fname)
      train_ids_labels_and_scores = np.load(retrieval_fname, allow_pickle=True).tolist()
    else:
      train_ids_labels_and_scores = do_retrieval(args, labelmap, train_ids,
                                               train_embeddings, test_embeddings,
                                               args.num_to_rerank, gallery_set='index')
      np_save(retrieval_fname, train_ids_labels_and_scores, kaggle=args.kaggle)

    nolandmark_ids_labels_and_scores = do_retrieval(args, nolandmark_labelmap, nolandmark_ids,
                                                    nolandmark_embeddings, test_embeddings,
                                                    args.num_to_rerank, gallery_set='nolandmark')

    # Local helper: load the cached RANSAC result or compute it. Returns None
    # when `args.ransac` is off, and also when the work is split into parts
    # (`args.ransac_parts > 1`), in which case nothing is saved either.
    def do_ransac(ransac_fname, test_ids, test_image_dir, train_image_dir, train_ids_labels_and_scores):
      if ope(ransac_fname):
        print('load', ransac_fname)
        ransac_train_ids_labels_and_scores = np.load(ransac_fname, allow_pickle=True)
      else:
        ransac_train_ids_labels_and_scores = None
        if args.ransac:
          ransac_train_ids_labels_and_scores = copy.deepcopy(train_ids_labels_and_scores)
          if args.ransac_parts > 1:
            # Process only the slice of test images belonging to this part.
            block = len(ransac_train_ids_labels_and_scores) // args.ransac_parts + 1
            ransac_train_ids_labels_and_scores = \
              ransac_train_ids_labels_and_scores[args.ransac_part * block:(args.ransac_part + 1) * block]
            test_ids = test_ids[args.ransac_part * block:(args.ransac_part + 1) * block]

          cache_num_inliers_dict = None
          if args.kaggle:
            cache_num_inliers_dict = get_whole_cached_num_inliers(args)
          for test_index, test_id in tqdm(enumerate(test_ids), total=len(test_ids), desc='do ransac'):
            # NOTE(review): another call site earlier in this file passes two
            # extra model arguments after `args` to this helper - confirm
            # which signature is current.
            ransac_train_ids_labels_and_scores[test_index] = rescore_and_rerank_by_num_inliers(
              args, test_image_dir, train_image_dir, test_id,
              ransac_train_ids_labels_and_scores[test_index],
              superpointglue_net=superpointglue_net, ignore_global_score=True, do_sort=False,
                cache_num_inliers_dict=cache_num_inliers_dict)
          if args.kaggle:
            save_whole_cached_num_inliers(args, cache_num_inliers_dict)

          if args.ransac_parts>1:
            return
          np_save(ransac_fname, ransac_train_ids_labels_and_scores, kaggle=args.kaggle)
      return ransac_train_ids_labels_and_scores

    ransac_fname = f'{cache_dir}/m{i}_ransac_{args.ransac_type}_{args.valid_num}.npy'
    # RANSAC rescoring is only run for the ensemble embedding (i == 0).
    if i == 0:
      ransac_train_ids_labels_and_scores = do_ransac(ransac_fname, test_ids, test_image_dir,
                                                       train_image_dir, train_ids_labels_and_scores)
    else:
      ransac_train_ids_labels_and_scores = None

    # `nl_ransac_fname` is built but not used in this function.
    nl_ransac_fname = f'{cache_dir}/m{i}_ransac_nl_{args.ransac_type}_{args.valid_num}.npy'
    if i > 0:
      # Accumulate each single model's retrieval, weighted by its config
      # weight; a deep copy is passed because merge_retrieval rewrites it.
      en_train_ids_labels_and_scores = merge_retrieval(en_train_ids_labels_and_scores,
                                                       copy.deepcopy(train_ids_labels_and_scores),
                                                        weight=en_cfgs[i-1]['weight'])

    train_labels_scores_list.append(train_ids_labels_and_scores)
    nolandmark_labels_scores_list.append(nolandmark_ids_labels_and_scores)
    ransac_labels_scores_list.append(ransac_train_ids_labels_and_scores)

  # Keep only the best `num_to_rerank` merged candidates per test image.
  en_train_ids_labels_and_scores = [i[:args.num_to_rerank] for i in en_train_ids_labels_and_scores]
  en_fname = f'{cache_dir}/en_{args.valid_num}.npy'
  # As above, `and False` disables reading this cache.
  if ope(en_fname) and False:
    print('load', en_fname)
    en_train_ids_labels_and_scores = np.load(en_fname, allow_pickle=True).tolist()
  else:
    cache_num_inliers_dict = None
    if args.kaggle:
      cache_num_inliers_dict = get_whole_cached_num_inliers(args)
    for test_index, test_id in tqdm(enumerate(test_ids), total=len(test_ids), desc='do ransac'):
      en_train_ids_labels_and_scores[test_index] = rescore_and_rerank_by_num_inliers(
        args, test_image_dir, train_image_dir, test_id,
        en_train_ids_labels_and_scores[test_index], superpointglue_net=superpointglue_net,
        cache_num_inliers_dict=cache_num_inliers_dict)
    np_save(en_fname, en_train_ids_labels_and_scores, kaggle=args.kaggle)

  pred_probs, pred_img_ids, pred_labels = do_lgbm(args, test_df,
                                                  en_train_ids_labels_and_scores,
                                                  train_labels_scores_list,
                                                  nolandmark_labels_scores_list,
                                                  ransac_labels_scores_list
                                                  )

  predictions = get_lgbm_prediction_map(pred_probs, pred_img_ids, pred_labels, topk=args.top_k)

  # Drop the embedding lists before returning to release their memory.
  del test_embeddings_list
  del train_embeddings_list
  del nolandmark_embeddings_list
  gc.collect()
  return predictions


In [None]:
def load_model(args, en_cfg):
  """Build one ensemble member, restore its checkpoint and prepare it for inference.

  Args:
    args: run options; only ``args.kaggle`` is read here, to choose the
      directory the checkpoint is loaded from.
    en_cfg: dict describing the ensemble member. Keys read: ``module``,
      ``model_name``, ``num_classes``, ``in_channels``, ``out_dir`` and
      ``predict_epoch``.

  Returns:
    The network wrapped in ``DataParallel``, moved to CUDA and set to eval mode.
  """
  # The model class is looked up by name inside net_torch.<module>.
  _module = import_module(f'net_torch.{en_cfg["module"]}')
  net_args = Namespace(**{
    'num_classes': en_cfg['num_classes'],
    'in_channels': en_cfg['in_channels'],
    'can_print': True,
  })
  net = getattr(_module, en_cfg['model_name'])(args=net_args)

  if args.kaggle:
    model_file = f'/kaggle/input/models2/{en_cfg["out_dir"]}/{en_cfg["predict_epoch"]}.pth'
  else:
    model_file = f'{RESULT_DIR}/models/{en_cfg["out_dir"]}/{en_cfg["predict_epoch"]}.pth'
  print('load model file: %s' % model_file)
  # Deserialize onto the CPU: without map_location, torch.load tries to restore
  # every tensor on the device it was saved from, which fails when that device
  # is not visible in this process. load_state_dict copies the values into the
  # model's own parameters and net.cuda() below performs the actual placement.
  checkpoint = torch.load(model_file, map_location='cpu')
  net.load_state_dict(checkpoint['state_dict'])

  net = DataParallel(net)
  net.cuda()
  net.eval()
  return net


def load_superpointglue_model():
  """Create the SuperPoint + SuperGlue ``Matching`` network on the GPU in eval mode.

  Takes no parameters: the weights directory is chosen from the module-level
  ``args.kaggle`` flag (Kaggle input dataset vs. a folder under ``DATA_DIR``).

  Returns:
    The ``Matching`` instance after ``.eval().cuda()``.
  """
  from net_torch.superpointglue.matching import Matching

  model_dir = (
    '/kaggle/input/superpointglue-models/superpoint_superglue_models'
    if args.kaggle
    else f'{DATA_DIR}/input/superpoint_superglue_models'
  )

  # Keypoint detector settings.
  superpoint_cfg = {
    'nms_radius': 4,
    'keypoint_threshold': 0.005,
    'max_keypoints': 1024,
    'model_dir': model_dir,
  }
  # Matcher settings; 'weights' is either 'indoor' or 'outdoor'.
  superglue_cfg = {
    'weights': 'outdoor',
    'sinkhorn_iterations': 20,
    'match_threshold': 0.2,
    'model_dir': model_dir,
  }

  matcher = Matching({'superpoint': superpoint_cfg, 'superglue': superglue_cfg})
  matcher = matcher.eval()
  return matcher.cuda()

def main():
  """Run the full ensemble prediction pipeline and write a submission file.

  Reads the module-level ``args`` (it is not passed in). Steps:
    1. Load every ensemble member listed in ``args.en_cfgs`` and, when
       ``args.ransac`` is set, the SuperPoint/SuperGlue matcher.
    2. Build ``test_df`` / ``train_df``: read from the Kaggle dataset when
       ``args.kaggle`` is set, otherwise assemble a local validation split and
       write it to disk.
    3. Sample ``args.nolandmark_num`` rows from the non-landmark list.
    4. Call ``get_predictions`` and ``save_submission_csv``.
    5. Local runs only: score the written submission with ``GAP_vector``, copy
       the file with the score in its name, and print the run time.
  """
  start_time = timer()
  # Kaggle debug runs use a small non-landmark sample.
  if args.kaggle and args.debug:
    args.nolandmark_num = 20
  print(f'nolandmark_num: {args.nolandmark_num}')
  args.can_print = True

  if args.gpus is not None:
    print('using gpu ' + args.gpus)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

  # NOTE(review): eval() executes the CLI string as a Python expression.
  # Presumably it resolves to a list of config dicts star-imported from
  # config.en_config -- confirm, and avoid passing untrusted values here.
  en_cfgs = eval(args.en_cfgs)
  for en_cfg in en_cfgs:
    print(f'weight: {en_cfg["weight"]}')
    # The loaded network is stored on the config dict itself under 'net'.
    if args.gpus is None:
      en_cfg['net'] = None
    else:
      en_cfg['net'] = load_model(args, en_cfg)

  superpointglue_net = None
  if (args.gpus is not None) and (args.ransac):
    superpointglue_net = load_superpointglue_model()

  if args.kaggle:
    # Kaggle: test ids come from the sample submission, index images from train.csv.
    INPUT_DIR = os.path.join('..', 'input')
    DATASET_DIR = os.path.join(INPUT_DIR, 'landmark-recognition-2020')
    TEST_IMAGE_DIR = os.path.join(DATASET_DIR, 'test')
    TRAIN_IMAGE_DIR = os.path.join(DATASET_DIR, 'train')
    TRAIN_LABELMAP_PATH = os.path.join(DATASET_DIR, 'train.csv')
    test_df = pd.read_csv(os.path.join(DATASET_DIR, 'sample_submission.csv'))
    train_df = pd.read_csv(TRAIN_LABELMAP_PATH)

    # When the train set has exactly NUM_PUBLIC_TRAIN_IMAGES rows (and this is
    # not a debug run), skip inference: save_submission_csv is called without
    # predictions and main() returns early.
    if not args.debug and len(train_df) == NUM_PUBLIC_TRAIN_IMAGES:
      print(
        f'Found {NUM_PUBLIC_TRAIN_IMAGES} training images. Copying sample submission.'
      )
      save_submission_csv(args, DATASET_DIR)
      return
  else:
    # Local run: assemble a validation test set and an index (train) set.
    # Test and train images are both read from the same images/train folder.
    DATASET_DIR = DATA_DIR
    TEST_IMAGE_DIR = f'{DATA_DIR}/images/train'
    TRAIN_IMAGE_DIR = f'{DATA_DIR}/images/train'
    TRAIN_LABELMAP_PATH = f'{DATA_DIR}/input/train_labelmap_lgb.csv'

    c_test_df = pd.read_csv(f'{DATA_DIR}/raw/recognition_solution_v2.1.csv')
    v2c_df = pd.read_csv(f'{DATA_DIR}/split/train2020.csv')
    # One row per TARGET, giving the TARGET -> CTARGET correspondence.
    mapping_df = v2c_df[[TARGET, CTARGET]].drop_duplicates(TARGET, keep='first')
    # NOTE(review): this filter runs before c_test_df[CTARGET] is cast to str on
    # the next line, so it compares the raw column values against strings.
    landmark_test2019_df = c_test_df[c_test_df[CTARGET].isin(mapping_df[CTARGET].astype(str))]
    c_test_df[CTARGET] = c_test_df[CTARGET].astype(str)

    if args.debug:
      test_num = 10000
    else:
      test_num = args.valid_num
    # The index set is 10x the test set; 20% of the test set is landmark images.
    index_num = test_num*10
    test_landmark_num = int(test_num * 0.2)
    # NOTE(review): landmark_test2019_df is a boolean-mask selection of
    # c_test_df; assigning a column to it may trigger pandas'
    # SettingWithCopyWarning -- confirm.
    landmark_test2019_df[CTARGET] = landmark_test2019_df[CTARGET].astype(int)
    print('test 2019 landmark num', len(landmark_test2019_df))
    print('test 2019 landmark nunique', landmark_test2019_df[CTARGET].nunique())

    # Number of additional landmark rows needed to reach test_landmark_num.
    num = test_landmark_num - len(landmark_test2019_df)
    v2xc_df = pd.read_csv(f'{DATA_DIR}/split/v2xc/random_train_cv0.csv')
    # Drop ids already present in v2c_df, then attach CTARGET through the mapping.
    v2xc_df = v2xc_df[~v2xc_df[ID].isin(v2c_df[ID])]
    v2xc_df = v2xc_df.merge(mapping_df[[TARGET, CTARGET]], how='left', on=TARGET)
    # Pick num//2 distinct TARGET values, keep at most 20 rows per TARGET,
    # then draw exactly num rows from that pool (all with random_state=1).
    v2xc_landmark_df = v2xc_df.drop_duplicates(TARGET, keep='first')
    v2xc_landmark = v2xc_landmark_df.sample(num//2, random_state=1, replace=False)[TARGET]
    v2xc_landmark_df = v2xc_df[v2xc_df[TARGET].isin(v2xc_landmark)]
    v2xc_landmark_df = v2xc_landmark_df.groupby(TARGET).head(20)
    v2xc_landmark_df = v2xc_landmark_df.sample(num, random_state=1, replace=False)
    # print(v2xc_landmark_df[CTARGET].value_counts()[:20])
    print('v2xc landmark num', len(v2xc_landmark_df))
    print('v2xc landmark nunique', v2xc_landmark_df[CTARGET].nunique())

    landmark_test_df = pd.concat((landmark_test2019_df, v2xc_landmark_df[[ID, CTARGET]]))
    print('landmark num', len(landmark_test_df))
    # print(landmark_test_df[CTARGET].value_counts()[:20])

    # Rows whose CTARGET became the string 'nan' after the str cast above
    # (presumably images with no landmark label -- confirm) fill the rest of
    # the test set.
    nolandmark_test_df = c_test_df[c_test_df[CTARGET]=='nan']
    nolandmark_test_df = nolandmark_test_df.sample(test_num - len(landmark_test_df), random_state=1, replace=False)
    print('nolandmark num', len(nolandmark_test_df))
    test_df = pd.concat((landmark_test_df, nolandmark_test_df[[ID, CTARGET]]))
    test_df.to_csv(f'{DATA_DIR}/input/valid_v2_{test_num}.csv', index=False)

    # Index set: every v2c train row whose CTARGET appears among the test
    # landmarks, padded with randomly sampled other rows up to index_num.
    v2c_train_df = pd.read_csv(f'{DATA_DIR}/split/v2c/random_train_cv0.csv')
    v2c_train_df = v2c_train_df.merge(v2c_df[[ID, CTARGET]], how='left', on=ID)
    v2c_in_test = v2c_train_df[CTARGET].isin(landmark_test_df[CTARGET])
    v2c_index = v2c_train_df[v2c_in_test]
    v2c_other_index = v2c_train_df[~v2c_in_test].sample(index_num - len(v2c_index), random_state=1, replace=False)
    print('v2c index num', len(v2c_index))
    print('v2c other index num', len(v2c_other_index))
    train_df = pd.concat((v2c_index, v2c_other_index))
    # The TARGET column is overwritten with the CTARGET values before the
    # label map is written to TRAIN_LABELMAP_PATH.
    train_df[TARGET] = train_df[CTARGET].values
    train_df[[ID, TARGET]].to_csv(TRAIN_LABELMAP_PATH, index=False)

  if args.kaggle:
    nolandmark_df = pd.read_csv(os.path.join(INPUT_DIR, '2019test-5k', 'nolandmark_v1.csv'))
    NOLANDMARK_IMAGE_DIR = os.path.join(INPUT_DIR, '2019test', 'test')
  else:
    nolandmark_df = pd.read_csv(f'{DATA_DIR}/split/nolandmark_v1.csv')
    # Locally, exclude ids that were already placed in the validation test set.
    nolandmark_df = nolandmark_df[~nolandmark_df[ID].isin(test_df[ID])]
    NOLANDMARK_IMAGE_DIR = f'{DATA_DIR}/images/test'

  nolandmark_df = nolandmark_df.sample(args.nolandmark_num, random_state=1, replace=False)
  print('sample nolandmark num', len(nolandmark_df))

  # Kaggle debug runs keep only the first 10 test and train rows.
  if args.kaggle and args.debug:
      test_df = test_df[:10]
      train_df = train_df[:10]

  print('test num', len(test_df))
  print('train num', len(train_df))

  labelmap = load_labelmap(TRAIN_LABELMAP_PATH)

  args.out_dir = f'{args.en_cfgs}_lgb'
  predictions = get_predictions(args, en_cfgs, superpointglue_net,
                                labelmap, train_df, test_df, nolandmark_df,
                                TEST_IMAGE_DIR, TRAIN_IMAGE_DIR, NOLANDMARK_IMAGE_DIR)
  submit_fname = save_submission_csv(args, DATASET_DIR, predictions)

  if not args.kaggle:
    # Local evaluation: read the submission back and score it against test_df.
    pred_df = pd.read_csv(submit_fname).fillna('')
    assert np.array_equal(np.sort(test_df[ID].values), np.sort(pred_df[ID].values))
    # Left-merge on test_df's ids so predictions line up with test_df row order.
    pred_df = pd.merge(test_df[[ID]], pred_df, on=ID, how='left')
    # Each non-empty prediction string is split on a space: the first token is
    # used as the label and the second as the confidence; empty rows become NaN.
    pred = [i.split(' ')[0] if i != '' else np.nan for i in pred_df[CTARGET]]
    conf = [i.split(' ')[1] if i != '' else np.nan for i in pred_df[CTARGET]]
    gap = GAP_vector(np.array(pred).astype(float), np.array(conf).astype(float), test_df[CTARGET].astype(float))
    print('gap: %.4f' % gap)
    # Keep a copy of the submission with the score embedded in the file name.
    shutil.copy(submit_fname, submit_fname.replace('.csv', f'_{gap:.4f}.csv'))
    # Run time is reported in minutes, and only for local runs.
    time = (timer() - start_time) / 60
    print('run time: %.2fmin' % time)

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == '__main__':
  main()
