In [3]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from collections import Counter

import gc
import cloudpickle

import os, sys
import random

import warnings

import xgboost as xgb

# Report the XGBoost version in use. \033[34m and \033[0m are ANSI escape
# codes: the first switches the text colour to blue, the second resets it.
print('Using \033[34mXGBoost', xgb.__version__, '\033[0m')

Using [34mXGBoost 1.5.0 [0m


## Inference

In [8]:
# Remove the column-width limit so long text cells are displayed in full
# instead of being truncated.
pd.set_option('display.max_colwidth', None)

In [9]:
# Build an empty XGBRanker and populate it from the saved model file.
ranker = xgb.sklearn.XGBRanker()
ranker.load_model('../input/semi-playground-ranking-model-c-00-10-maxdepth-4/ranker-00.model')

In [10]:
# Image metadata table. `image_id` is taken from the row label of the freshly
# read CSV, so it follows the row order of the file.
test_images = pd.read_csv('../input/final-image-traindataset-part0-4-count-5/final_images.csv')
test_images['image_id'] = test_images.index
# Candidate table; pre_competition_inference reads its `target_id` and
# `target` columns.
matchings = pd.read_csv('../input/combiner-target-dataset-0-of-5/final_matchings.csv')
# Display the image table as the cell output.
test_images

Unnamed: 0,image_url,count,filename,pured_filename,spaced_filename,undigit_filename,filename_lang,filename_en,section,spaced_filename_translit,ext,filename_contains_digit,id,undigit_filename_translation,final_filename,PREfinal_filename,spaced_undigit_filename,image_id
0,http://upload.wikimedia.org/wikipedia/commons/0/00/Apoxyomenos_Pio-Clementino_Inv1185_n2.jpg,4,Apoxyomenos Pio-Clementino Inv1185 n2,Apoxyomenos Pio Clementino Inv1185 n2,Apoxyomenos Pio Clementino Inv 1185 n 2,Apoxyomenos Pio Clementino Inv n,en,True,commons,,jpg,True,0,,Apoxyomenos Pio Clementino Inv n,Apoxyomenos Pio Clementino Inv n,Apoxyomenos Pio Clementino Inv n,0
1,http://upload.wikimedia.org/wikipedia/commons/0/00/Babybox_-_venkovn%C3%AD_strana.jpg,4,Babybox - venkovní strana,Babybox venkovní strana,Babybox venkovní strana,Babybox venkovní strana,cs,False,commons,Babybox venkovni strana,unk,False,1,Babybox outdoor party,Babybox outdoor party,Babybox outdoor party,Babybox venkovní strana,1
2,http://upload.wikimedia.org/wikipedia/commons/0/00/Betania_royal_portrait.jpg,4,Betania royal portrait,Betania royal portrait,Betania royal portrait,Betania royal portrait,en,True,commons,,jpg,False,2,,Betania royal portrait,Betania royal portrait,Betania royal portrait,2
3,http://upload.wikimedia.org/wikipedia/commons/0/00/BlochGlassHarmonica.jpg,5,BlochGlassHarmonica,BlochGlassHarmonica,Bloch Glass Harmonica,Bloch Glass Harmonica,en,True,commons,,jpg,False,3,,Bloch Glass Harmonica,Bloch Glass Harmonica,Bloch Glass Harmonica,3
4,http://upload.wikimedia.org/wikipedia/commons/0/00/BolivianChilePowder2.JPG,4,BolivianChilePowder2,BolivianChilePowder2,Bolivian Chile Powder 2,Bolivian Chile Powder,en,True,commons,,jpg,True,4,,Bolivian Chile Powder,Bolivian Chile Powder,Bolivian Chile Powder,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69475,https://upload.wikimedia.org/wikipedia/ja/c/c3/The_five_professors_of_Tokyo_Bible_Semiary.jpg,4,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,en,True,ja,,jpg,False,69475,,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,69475
69476,https://upload.wikimedia.org/wikipedia/ru/7/7b/%D0%9A%D1%83%D0%BD%D1%86%D0%B5%D0%B2%D0%BE1940.jpg,4,Кунцево1940,Кунцево1940,Кунцево 1940,Кунцево,ru,False,ru,Kuntsevo 1940,jpg,True,69476,Kuntsevo,Kuntsevo,Kuntsevo,Кунцево,69476
69477,https://upload.wikimedia.org/wikipedia/ru/9/95/%D0%9A%D0%BE%D0%BC%D0%B0%D0%BD%D0%B4%D0%B8%D1%80%D1%8B%D0%9F%D0%B5%D1%80%D0%B2%D0%BE%D0%B9%D0%9A%D0%BE%D0%BD%D0%BD%D0%BE%D0%B9%D0%B0%D1%80%D0%BC%D0%B8%D0%B8.jpg,4,КомандирыПервойКоннойармии,КомандирыПервойКоннойармии,Командиры Первой Коннойармии,Командиры Первой Коннойармии,ru,False,ru,Komandiry Pervoj Konnojarmii,jpg,False,69477,First Cavalry Commanders,First Cavalry Commanders,First Cavalry Commanders,Командиры Первой Коннойармии,69477
69478,https://upload.wikimedia.org/wikipedia/ru/c/cb/Politburo1934.jpg,4,Politburo1934,Politburo1934,Politburo 1934,Politburo,fr,False,ru,,jpg,True,69478,Politburo,Politburo,Politburo,Politburo,69478


In [11]:
def pre_competition_inference(ranker, X, image_ids, target_ids, top_k=50,
                              matchings_df=None, images_df=None):
    """Score every (image, target) candidate row and keep the best ones per image.

    Parameters
    ----------
    ranker : object
        Anything exposing ``predict(X)`` that returns one score per row of ``X``.
    X : DataFrame
        Feature rows, one per candidate pair.
    image_ids, target_ids : array-like
        Identifiers aligned row-for-row with ``X``.
    top_k : int, default 50
        Number of highest-scored candidates kept per image in ``result``.
    matchings_df : DataFrame, optional
        Table providing the ``target_id`` and ``target`` columns. Defaults to
        the notebook-level ``matchings`` frame.
    images_df : DataFrame, optional
        Table providing ``image_id``, ``count``, ``undigit_filename`` and
        ``image_url``. Defaults to the notebook-level ``test_images`` frame.

    Returns
    -------
    result : DataFrame
        The ``top_k`` best rows per image joined with the target and image
        columns listed above, plus a ``hyperref`` column copied from
        ``image_url``. Both joins are inner joins (the ``pd.merge`` default),
        so rows whose ``target_id`` / ``image_id`` has no match are dropped.
    answer : DataFrame
        All rows with columns ``image_id``, ``target_id`` and ``rank`` (the raw
        model score), sorted by ``image_id`` and then ``rank``, both descending.
    """
    # Fall back to the notebook globals only when no table is passed in, so
    # existing four-argument calls keep working unchanged.
    if matchings_df is None:
        matchings_df = matchings
    if images_df is None:
        images_df = test_images

    # The same ordering is applied before the cut and again after the merges.
    ordering = dict(by=['image_id', 'rank'], kind='mergesort', ascending=False)

    answer = pd.DataFrame({
        'image_id': image_ids,
        'target_id': target_ids,
        'rank': ranker.predict(X),
    })
    answer = answer.sort_values(**ordering)

    # Rows are already sorted best-first within each image, so head() keeps
    # the top_k highest scores of every group.
    result = answer.groupby('image_id').head(top_k)

    result = pd.merge(result, matchings_df[['target_id', 'target']], on='target_id')
    result = pd.merge(
        result,
        images_df[['image_id', 'count', 'undigit_filename', 'image_url']],
        on='image_id',
    )
    # Merging does not guarantee the ranking order, so restore it.
    result = result.sort_values(**ordering)

    result['hyperref'] = result['image_url']

    return result, answer


def make_clickable(val):
    """Render ``val`` as an HTML link labelled "click" that opens in a new tab.

    The value is HTML-escaped before it is placed inside the ``href``
    attribute, so a quote, ``&`` or angle bracket in the URL cannot terminate
    the attribute or inject markup. URLs without such characters produce
    exactly the same string as before.
    """
    import html  # local import keeps this cell independent of the import cell

    return '<a target="_blank" href="{}">click</a>'.format(html.escape(str(val), quote=True))

In [12]:
# Feature columns that compress() casts to float16, one name per line.
columns2float16 = [
    'CAPITAL_RANK',
    'CAPTION_SENSE_CAPITAL_RANK_3',
    'TITLE_SENSE_CAPITAL_RANK_3',
    'CAPTION_SENSE_CAPITAL_RANK_3_mean-std',
    'TITLE_SENSE_CAPITAL_RANK_3_mean-std',
    'WEAK_ENTITY_RANK',
    'TITLE_VISUAL_WEAK_ENTITY_RANK_3',
    'TITLE_SENSE_WEAK_ENTITY_RANK_3',
    'TITLE_VISUAL_WEAK_ENTITY_RANK_3_mean-std',
    'CAPTION_VISUAL_WEAK_ENTITY_RANK_3',
    'TITLE_SENSE_WEAK_ENTITY_RANK_3_mean-std',
    'XTITLE_WEAK_ENTITY_RANK',
    'CAPTION_VISUAL_WEAK_ENTITY_RANK_3_mean-std',
    'CAPTION_SENSE_WEAK_ENTITY_RANK_3',
    'CAPITAL_ACRONYM_RANK',
    'CRUDE_ACRONYM_RANK',
    'FUZZY_TITLE_RANK',
    'XCAPTION_WEAK_ENTITY_RANK',
    'CAPTION_SENSE_WEAK_ENTITY_RANK_3_mean-std',
    'VISUAL_UNDIGIT_CAPTION',
    'VISUAL_UNDIGIT_TITLE',
    'VISUAL_FINAL_CAPTION',
    'VISUAL_FINAL_TITLE',
    'CAPTION_VISUAL_AGGR',
    'TITLE_VISUAL_AGGR',
    'SENSE_UNDIGIT_FILENAME_UNDIGIT_TITLE',
    'SENSE_FINAL_FILENAME_UNDIGIT_TITLE',
    'TITLE_SENSE_AGGR',
    'SENSE_UNDIGIT_FILENAME_FINAL_TITLE',
    'SENSE_FINAL_FILENAME_FINAL_TITLE',
    'FUZZY_CAPTION_RANK',
    'TITLE_VISUAL_FUZZY_TITLE_RANK_3',
    'SENSE_UNDIGIT_FILENAME_UNDIGIT_CAPTION',
    'SENSE_FINAL_FILENAME_UNDIGIT_CAPTION',
    'TITLE_VISUAL_TITLE_CROSS_FUZZY_RANK_3',
    'TITLE_VISUAL_FUZZY_CAPTION_RANK_3',
    'TITLE_CROSS_FUZZY_RANK',
    'CAPTION_SENSE_AGGR',
    'SENSE_UNDIGIT_FILENAME_FINAL_CAPTION',
    'TITLE_VISUAL_FUZZY_TITLE_RANK_3_mean-std',
    'SENSE_FINAL_FILENAME_FINAL_CAPTION',
    'TITLE_VISUAL_CRUDE_ACRONYM_RANK_3',
    'TITLE_VISUAL_FUZZY_CAPTION_RANK_3_mean-std',
    'TITLE_VISUAL_CAPTION_CROSS_FUZZY_RANK_3',
    'CAPTION_VISUAL_FUZZY_TITLE_RANK_3',
    'TITLE_SENSE_FUZZY_TITLE_RANK_3',
    'TITLE_VISUAL_CRUDE_ACRONYM_RANK_3_mean-std',
    'TITLE_VISUAL_CAPITAL_ACRONYM_RANK_3_mean-std',
    'TITLE_VISUAL_CAPITAL_ACRONYM_RANK_3',
    'TITLE_SENSE_FUZZY_CAPTION_RANK_3',
    'TITLE_VISUAL_TITLE_CROSS_FUZZY_RANK_3_mean-std',
    'CAPTION_VISUAL_TITLE_CROSS_FUZZY_RANK_3',
    'TITLE_SENSE_CAPTION_CROSS_FUZZY_RANK_3',
    'TITLE_VISUAL_NUMERIC_RANK_3',
    'CAPTION_CROSS_FUZZY_RANK',
    'TITLE_SENSE_TITLE_CROSS_FUZZY_RANK_3',
]


def compress(X, float16_columns=None):
    """Downcast the feature frame to smaller float dtypes and fill missing values.

    Steps, in order:
      1. every float64 column is cast to float32;
      2. every column whose name starts with ``DCAPTION``, ``DTITLE``,
         ``DFILENAME`` or ``DIMG`` is cast to float16;
      3. every column named in ``float16_columns`` is cast to float16;
      4. remaining NaNs are replaced with ``-1000.``.

    Parameters
    ----------
    X : DataFrame
        Feature frame. NOTE: it is modified in place by the steps above.
    float16_columns : iterable of str, optional
        Columns to force to float16. Defaults to the notebook-level
        ``columns2float16`` list. A name missing from ``X`` raises ``KeyError``.

    Returns
    -------
    DataFrame
        A fresh copy of the (already modified) frame.
    """
    if float16_columns is None:
        float16_columns = columns2float16

    float16_prefixes = ('DCAPTION', 'DTITLE', 'DFILENAME', 'DIMG')

    # Series.iteritems() was removed in pandas 2.0; items() is available on
    # both old and new pandas versions.
    for c, dtype in X.dtypes.items():
        if dtype == np.float64:
            X[c] = X[c].astype(np.float32)

        # Kept as a second cast after float32 so rounding matches the
        # original float64 -> float32 -> float16 sequence.
        if c.startswith(float16_prefixes):
            X[c] = X[c].astype(np.float16)

    for c in float16_columns:
        X[c] = X[c].astype(np.float16)

    X.fillna(-1000., inplace=True)

    # Hand back a copy rather than the mutated input (presumably to
    # consolidate the frame after the column-by-column reassignments).
    X = X.copy()
    gc.collect()

    return X

In [17]:
%%time


import os

# Short (top-candidates) result frame of every scored part, in `parts` order.
results = []

# Directories holding the precomputed feature shards to score.
parts = [
    '../input/mining-for-ranking-model-c-00-10/valid-70',
    '../input/mining-for-ranking-model-c-00-10/valid-71',
    '../input/mining-for-ranking-model-c-50-60/train-09',
    '../input/mining-for-ranking-model-c-60-70/train-09',
]

for k, part in enumerate(parts):
    # Only the features and the two id columns are needed for scoring.
    # targets.parquet and group.pickle in the same directory are not used
    # anywhere in this loop, so they are not read.
    X = pd.read_parquet(os.path.join(part, 'features.parquet'))
    image_ids = pd.read_parquet(os.path.join(part, 'image_ids.parquet')).iloc[:, 0].values
    target_ids = pd.read_parquet(os.path.join(part, 'target_ids.parquet')).iloc[:, 0].values

    X = compress(X)

    # `r` and `answer` keep these names on purpose: the cells below display
    # the values left over from the last part.
    r, answer = pre_competition_inference(ranker, X, image_ids, target_ids)

    # Full ranking and the short per-image ranking, one CSV pair per part.
    answer.to_csv('ranks-{}.csv'.format(k), index=False)
    r.to_csv('short-ranks-{}.csv'.format(k), index=False)

    results.append(r)

    # Release the large per-part inputs before loading the next part.
    del X
    del image_ids
    del target_ids

    gc.collect()

CPU times: user 22min 30s, sys: 1min 11s, total: 23min 41s
Wall time: 8min 27s


In [18]:
# Full sorted ranking left over from the last loop iteration, i.e. the final
# entry of `parts`.
answer

Unnamed: 0,image_id,target_id,rank
3120718,69458,51661,6.409403
3120886,69458,73940,5.359495
3120888,69458,73942,3.097748
3120716,69458,51659,2.413559
3120514,69458,25794,1.812747
...,...,...,...
34,52,3373,-14.759327
732,52,68001,-14.775700
515,52,46911,-15.036949
813,52,75030,-15.396125


In [19]:
# Preview the first ten rows of the last scored part, rendering `hyperref`
# as a clickable link and hiding the raw `image_url` column.
preview = r.iloc[0:10].style.format({'hyperref': make_clickable})
# Styler.hide_columns was deprecated in pandas 1.4 and removed in 2.0 in
# favour of Styler.hide(axis='columns'); use the new method when it exists
# and fall back to the old one on older pandas.
preview.hide(subset=['image_url'], axis='columns') if hasattr(preview, 'hide') else preview.hide_columns(['image_url'])

Unnamed: 0,image_id,target_id,rank,target,count,undigit_filename,hyperref
0,69458,51661,6.409403,Hellboy – Call of Darkness [SEP] Die Kathedrale von Wells in der Grafschaft Somerset im Südwesten Englands,4,Wells Cathedral West Front Exterior UK Diliff,click
1,69458,73940,5.359495,Bisbat antic de Bath i Wells [SEP] La catedral de Wells.,4,Wells Cathedral West Front Exterior UK Diliff,click
2,69458,73942,3.097748,Diözese Bath und Wells [SEP] Kathedrale in Wells,4,Wells Cathedral West Front Exterior UK Diliff,click
3,69458,51659,2.413559,Keuskupan Bath dan Wells (Katolik Roma) [SEP] Katedral Wells,4,Wells Cathedral West Front Exterior UK Diliff,click
4,69458,25794,1.812747,Kathedrale St. Peter (Exeter) [SEP] Exeter Cathedral,4,Wells Cathedral West Front Exterior UK Diliff,click
5,69458,102282,1.716949,춤 [SEP] 잉글랜드 웰스 지방의 웰스 대성당(Wells Cathedral) 정원에서 추는 모리스 춤(Morris dancing),4,Wells Cathedral West Front Exterior UK Diliff,click
6,69458,25793,1.515627,Exeter [SEP] Exeterin katedraali,4,Wells Cathedral West Front Exterior UK Diliff,click
7,69458,25796,1.244264,Exeter Cathedral [SEP] The side of the cathedral,4,Wells Cathedral West Front Exterior UK Diliff,click
8,69458,73943,1.038435,"Готична архітектура [SEP] Сомерсет, Кафедральний собор, Британія.",4,Wells Cathedral West Front Exterior UK Diliff,click
9,69458,171609,1.025891,"Katedrála Panny Marie (Girona) [SEP] Severní fasáda: vlevo patrný nižší podkovovitý závěr s fiálami opěrných pilířů, více vpravo přiléhá těsně k lodi jen o málo vyšší románská věž s dvojicemi podvojných oken v každém podlaží, patrné jsou rovněž mohutné opěrné pilíře lodi",4,Wells Cathedral West Front Exterior UK Diliff,click
