In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from collections import Counter

import gc
import cloudpickle

import os, sys
import random

import warnings

import xgboost as xgb

## Inference

In [None]:
# Render full cell contents (no column-width truncation) when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [None]:
# Create an empty ranker and load the saved model file into it.
ranker = xgb.XGBRanker()
ranker.load_model('../input/train-final-model/final.model')

In [None]:
# Test image table; the positional row index doubles as the `image_id` key.
test_images = pd.read_csv('../input/test-final-image-dataset/final_images.csv')
test_images = test_images.assign(image_id=test_images.index)

# Candidate table; supplies the `target_id` -> `target` mapping used at inference time.
matchings = pd.read_csv('../input/test-combiner-target-dataset/final_matchings.csv')

test_images

In [None]:
def pre_competition_inference(ranker, X, image_ids, target_ids,
                              matchings_df=None, images_df=None, top_k=5):
    """Score candidate (image, target) pairs and keep the best ones per image.

    Parameters
    ----------
    ranker : object
        Anything exposing ``predict(X)`` that returns one score per row of ``X``.
    X : DataFrame
        Feature rows, one per candidate pair, in the same order as the ids.
    image_ids, target_ids : sequence or Series
        Ids of each candidate pair. They are matched to the scores by
        position; any index they carry is ignored.
    matchings_df : DataFrame, optional
        Table with ``target_id`` and ``target`` columns. Defaults to the
        notebook-level ``matchings`` frame.
    images_df : DataFrame, optional
        Table with ``image_id``, ``count``, ``undigit_filename`` and
        ``image_url`` columns. Defaults to the notebook-level ``test_images``.
    top_k : int, default 5
        Number of highest-scored candidates kept per image.

    Returns
    -------
    DataFrame
        Columns ``image_id``, ``target_id``, ``rank``, ``target``, ``count``,
        ``undigit_filename``, ``image_url`` and ``hyperref`` (a copy of
        ``image_url``), ordered by ``image_id`` then ``rank``, both descending.
        Both joins are inner joins, so candidates without a match in either
        lookup table are dropped.
    """
    # Fall back to the notebook-level tables when none are supplied.
    if matchings_df is None:
        matchings_df = matchings
    if images_df is None:
        images_df = test_images

    rank_pred = ranker.predict(X)

    # The predictions are a positional array, so the ids must be positional as
    # well: drop whatever index the id Series carry, otherwise pandas aligns
    # them by label and can silently mis-pair ids with scores.
    answer = pd.DataFrame({
        'image_id': pd.Series(image_ids).reset_index(drop=True),
        'target_id': pd.Series(target_ids).reset_index(drop=True),
        'rank': rank_pred,
    })

    # Highest score first within every image, then keep the leading rows.
    answer = answer.sort_values(by=['image_id', 'rank'], kind='mergesort', ascending=False)
    result = answer.groupby('image_id').head(top_k)

    result = pd.merge(result, matchings_df[['target_id', 'target']], on='target_id')
    result = pd.merge(
        result,
        images_df[['image_id', 'count', 'undigit_filename', 'image_url']],
        on='image_id',
    )

    # Re-establish the ordering after the joins.
    result = result.sort_values(by=['image_id', 'rank'], kind='mergesort', ascending=False)

    result['hyperref'] = result['image_url']

    return result


def make_clickable(val):
    """Return an HTML anchor labelled "click" that opens ``val`` in a new tab.

    The value is HTML-escaped before being placed in the ``href`` attribute so
    that quotes, ampersands or angle brackets in a URL cannot break out of the
    attribute and corrupt the rendered table.
    """
    import html  # local import keeps this helper self-contained

    return '<a target="_blank" href="{}">click</a>'.format(html.escape(str(val), quote=True))

In [None]:
# Load the seven per-model rank tables (directories ...-model-00 to ...-model-06).
ranks = []

for k in range(7):
    # Use a real name instead of `_`, which IPython reserves for the last cell output.
    model_ranks = pd.read_parquet('../input/test-ranks-for-model-0{}/ranks.parquet'.format(k))

    # Suffix the score column with the model number so the tables can sit side by side.
    ranks.append(model_ranks.rename(columns={'rank': 'rank' + str(k)}))

In [None]:
topk = 100  # NOTE(review): not referenced in this cell; kept in case another cell reads it
result = ranks[0]

# Outer-join every remaining model's table on the candidate pair, so a pair
# scored by any model is kept.
for model_ranks in ranks[1:]:
    result = result.merge(model_ranks, how='outer', on=['image_id', 'target_id'])

rank_fields = ['rank' + str(k) for k in range(len(ranks))]

# A pair missing from a model's table gets a score just below that model's
# lowest observed score (column minimum minus 0.5).
for rank_field in rank_fields:
    result[rank_field] = result[rank_field].fillna(np.float32(result[rank_field].min() - 0.5))

In [None]:
# Derive the field list from the loaded tables rather than a hard-coded count,
# matching how the merge cell builds it.
rank_fields = ['rank' + str(k) for k in range(len(ranks))]

# Per-image mean and standard deviation of each model's scores.
stat_df = result.groupby('image_id', as_index=False).agg({c: ['mean', 'std'] for c in rank_fields})
# Flatten the two-level header: ('rank0', 'mean') -> 'rank0_mean'.
# The grouping key flattens to 'image_id_' and is renamed back below.
stat_df.columns = ['_'.join(x) for x in stat_df.columns.to_flat_index()]
stat_df = stat_df.rename(columns={'image_id_': 'image_id'})

result = pd.merge(result, stat_df, on='image_id')

# Column creation order is kept exactly as before (var, then normed, per model).
for rank_field in rank_fields:
    result[rank_field + '_std'] = np.float32(result[rank_field + '_std'])
    # Despite the name, `_var` is mean / (std + 0.01), not a variance.
    result[rank_field + '_var'] = np.float32(result[rank_field + '_mean'] / (result[rank_field + '_std'] + 1e-2))
    # Per-image standardised score; the 0.01 guards against division by zero.
    result[rank_field + '_normed'] = np.float32((result[rank_field] - result[rank_field + '_mean']) / (result[rank_field + '_std'] + 1e-2))

In [None]:
# Alias only: `result_ranks` is the same DataFrame object as `result`, so the
# in-place operations applied to it afterwards also affect `result`.
result_ranks = result
result_ranks

In [None]:
# Are the rows already ordered by image_id?
# `Series.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing`
# is the same check and exists in older versions too.
result_ranks['image_id'].is_monotonic_increasing

In [None]:
# Order the rows by image in place, then count the rows per image
# (groupby returns the sizes in ascending image_id order).
result_ranks.sort_values('image_id', inplace=True)
groups = result_ranks.groupby('image_id').size().tolist()

In [None]:
# Detach the two key columns so only feature columns remain in `result_ranks`.
# `pop` mutates the frame, so this cell cannot be re-run without rebuilding it.
image_ids, target_ids = result_ranks.pop('image_id'), result_ranks.pop('target_id')

In [None]:
# Score every candidate pair and keep the best-ranked targets per image.
r = pre_competition_inference(
    ranker=ranker,
    X=result_ranks,
    image_ids=image_ids,
    target_ids=target_ids,
)

In [None]:
# Preview a ten-row slice with the image URL rendered as a clickable link.
styled_preview = r.iloc[1190:1200].style.format({'hyperref': make_clickable})

# `Styler.hide_columns` was removed in pandas 2.0 in favour of
# `Styler.hide(axis='columns')`; fall back to the old method on older pandas.
if hasattr(styled_preview, 'hide'):
    styled_preview = styled_preview.hide(['image_url'], axis='columns')
else:
    styled_preview = styled_preview.hide_columns(['image_url'])

styled_preview

In [None]:
# Alias the inference output under the name the submission cells use.
results = r

In [None]:
# Competition test set (tab-separated) and the sample submission file.
original_images = pd.read_table('../input/wikipedia-image-caption/test.tsv')
sample = pd.read_csv('../input/wikipedia-image-caption/sample_submission.csv')

In [None]:
# Attach the competition `id` to each prediction by joining on the image URL.
almost_answer = original_images.merge(results, on='image_url')

# Keep the submission-relevant columns, best score first within each id,
# and give the predicted text the column name used in the submission file.
answer = (
    almost_answer[['id', 'target', 'rank', 'count']]
    .sort_values(by=['id', 'rank'], kind='mergesort', ascending=False)
    .rename(columns={'target': 'caption_title_and_reference_description'})
)

answer.iloc[100:120]

In [None]:
# Write the two submission columns, without the DataFrame index.
answer.to_csv(
    'submission.csv',
    columns=['id', 'caption_title_and_reference_description'],
    index=False,
)