In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from collections import Counter

import gc
import cloudpickle

import os, sys
import random

import warnings

import xgboost as xgb

# Report the imported XGBoost version; the ANSI escapes colour it blue (34) and
# reset the terminal colour (0) afterwards.
print(' '.join(['Using \033[34mXGBoost', xgb.__version__, '\033[0m']))

Using [34mXGBoost 1.5.0 [0m


## Main Part

In [5]:
# Load the matchings table and the image table from the Kaggle input datasets.
matchings = pd.read_csv('../input/combiner-target-dataset-0-of-5/final_matchings.csv')

# The image table gets an explicit `image_id` column copied from its row index.
images = pd.read_csv('../input/final-image-traindataset-part0-4-count-5/final_images.csv')
images = images.assign(image_id=images.index)

In [9]:
# Build the per-part training frames.
#
# For every entry of `parts`:
#   1. read one 'short-ranks-{N}.csv' per rank dataset (k = 0..6) and rename its
#      'rank' column to 'rank{k}';
#   2. outer-join all of them on (image_id, target_id);
#   3. fill pairs missing from a dataset, add per-image mean/std statistics and
#      the derived '_var' / '_normed' columns;
#   4. inner-join the part's targets on (image_id, target_id).
# The resulting frame of each part is appended to `all_results`.

all_results = []

parts = ['../input/data-for-model-00/valid-70', '../input/data-for-model-00/valid-71',
    '../input/data-for-model-05/train-09']

n_rank_models = 7  # rank datasets '../input/train-ranks-for-model-00' .. '-06'
rank_fields = ['rank' + str(k) for k in range(n_rank_models)]
pair_key = ['image_id', 'target_id']
unused_columns = ['image_url', 'hyperref', 'target', 'count', 'undigit_filename']

# Only relevant to a per-image head(topk) truncation that is currently disabled;
# the assignment is kept in case other cells read it.
topk = 100

# The position of a part in `parts` selects its 'short-ranks-{N}.csv' file.
for N, part in enumerate(parts):

    # Load, trim and sort the rank file of every dataset for this part.
    ranks = []
    for k, rank_field in enumerate(rank_fields):
        model_ranks = pd.read_csv('../input/train-ranks-for-model-0{0}/short-ranks-{1}.csv'.format(k, N))
        model_ranks = (
            model_ranks
            .drop(columns=unused_columns)
            .rename(columns={'rank': rank_field})
            # Stable sort, image_id and rank value both descending.
            .sort_values(by=['image_id', rank_field], kind='mergesort', ascending=False)
        )
        ranks.append(model_ranks)

    # Outer join: a pair present in at least one rank file is kept; columns of
    # the files it is missing from are NaN until filled below.
    result = ranks[0]
    for model_ranks in ranks[1:]:
        result = result.merge(model_ranks, how='outer', on=pair_key)

    # Missing values get 0.5 below the column minimum taken over the whole part.
    # The fill value is rounded to float32 exactly as before.
    for rank_field in rank_fields:
        fill_value = np.float32(result[rank_field].min() - 0.5)
        result[rank_field] = result[rank_field].fillna(fill_value)

    # Per-image mean and std of every rank column, flattened to
    # 'rank{k}_mean' / 'rank{k}_std' and joined back onto every row of the image.
    stat_df = result.groupby('image_id')[rank_fields].agg(['mean', 'std'])
    stat_df.columns = ['_'.join(col) for col in stat_df.columns.to_flat_index()]
    stat_df = stat_df.reset_index()

    result = pd.merge(result, stat_df, on='image_id')

    for rank_field in rank_fields:
        mean_col = rank_field + '_mean'
        std_col = rank_field + '_std'

        result[std_col] = result[std_col].astype(np.float32)
        # 1e-2 keeps the divisions finite when the per-image std is 0.
        damped_std = result[std_col] + 1e-2

        # Despite its name, '_var' is mean / (std + 0.01), not a variance.
        result[rank_field + '_var'] = (result[mean_col] / damped_std).astype(np.float32)
        # '_normed' is the rank value standardised within its image.
        result[rank_field + '_normed'] = (
            (result[rank_field] - result[mean_col]) / damped_std
        ).astype(np.float32)

    y = pd.read_parquet(os.path.join(part, 'targets.parquet'))
    image_ids = pd.read_parquet(os.path.join(part, 'image_ids.parquet'))
    target_ids = pd.read_parquet(os.path.join(part, 'target_ids.parquet'))

    # concat(axis=1) aligns on the index.
    # NOTE(review): assumes the three parquet files share the same index/row order - confirm.
    X = pd.concat([y, image_ids, target_ids], axis=1)

    # Inner join: only pairs that have a target row survive.
    result = pd.merge(result, X, on=pair_key)

    all_results.append(result)



In [10]:
# Stack the per-part frames and order them by image_id. The sort is stable
# (mergesort), so rows of one image keep the relative order they had per part.
total_result = (
    pd.concat(all_results)
    .sort_values(by=['image_id'], kind='mergesort', ascending=True)
)

# Persist the training table, then display it as the cell output.
total_result.to_parquet('train.parquet')

total_result

Unnamed: 0,image_id,target_id,rank0,rank1,rank2,rank3,rank4,rank5,rank6,rank0_mean,...,rank2_normed,rank3_var,rank3_normed,rank4_var,rank4_normed,rank5_var,rank5_normed,rank6_var,rank6_normed,MATCH
86262,68,137717,6.767020,7.589909,6.981566,8.064589,7.789616,7.245657,7.790066,-1.826752,...,2.304940,-0.217325,2.160819,-0.231773,2.001936,-0.216139,1.879396,-0.288423,2.098608,1
86263,68,137719,5.563097,6.411661,5.403645,6.262428,7.076739,5.920494,6.193605,-1.826752,...,1.893191,-0.217325,1.726514,-0.231773,1.839937,-0.216139,1.575202,-0.288423,1.727637,1
86264,68,137718,5.254476,6.209639,5.109680,6.214959,7.070467,5.503346,6.032679,-1.826752,...,1.816483,-0.217325,1.715074,-0.231773,1.838512,-0.216139,1.479445,-0.288423,1.690243,1
86265,68,212117,5.243744,6.207840,5.616974,6.043822,6.644480,6.292782,6.555391,-1.826752,...,1.948858,-0.217325,1.673832,-0.231773,1.741708,-0.216139,1.660662,-0.288423,1.811706,0
86266,68,137716,4.299229,5.391079,4.410022,5.383885,6.233411,5.200714,5.502605,-1.826752,...,1.633912,-0.217325,1.514793,-0.231773,1.648294,-0.216139,1.409975,-0.288423,1.567069,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,69464,244066,-6.730565,-6.215149,-6.592605,-5.714254,-5.980622,-5.797653,-2.003985,-3.807902,...,-0.999150,-1.216273,-1.016111,-1.062039,-1.012258,-1.028829,-1.026801,-1.019766,0.341815,1
85,69464,292335,-6.730565,-6.215149,-6.592605,-5.714254,-5.980622,-5.797653,-2.078722,-3.807902,...,-0.999150,-1.216273,-1.016111,-1.062039,-1.012258,-1.028829,-1.026801,-1.019766,0.316532,0
86,69464,5265,-6.730565,-6.215149,-6.592605,-5.714254,-5.980622,-5.797653,-2.084232,-3.807902,...,-0.999150,-1.216273,-1.016111,-1.062039,-1.012258,-1.028829,-1.026801,-1.019766,0.314668,0
87,69464,186697,-6.730565,-6.215149,-6.592605,-5.714254,-5.980622,-5.797653,-2.191618,-3.807902,...,-0.999150,-1.216273,-1.016111,-1.062039,-1.012258,-1.028829,-1.026801,-1.019766,0.278339,0


In [11]:
# Build the per-part validation frames.
#
# For every entry of `parts`:
#   1. read one 'short-ranks-{N}.csv' per rank dataset (k = 0..6) and rename its
#      'rank' column to 'rank{k}';
#   2. outer-join all of them on (image_id, target_id);
#   3. fill pairs missing from a dataset, add per-image mean/std statistics and
#      the derived '_var' / '_normed' columns;
#   4. inner-join the part's targets on (image_id, target_id).
# The resulting frame of each part is appended to `all_results`.

all_results = []

parts = ['../input/data-for-model-06/train-09']

# Index of the 'short-ranks-{N}.csv' file belonging to each entry of `parts`.
# Previously N was overwritten with 3 inside the loop, so every part would have
# read the same rank file had `parts` ever held more than one entry.
rank_file_ids = [3]
assert len(rank_file_ids) == len(parts), 'one rank-file index is required per part'

n_rank_models = 7  # rank datasets '../input/train-ranks-for-model-00' .. '-06'
rank_fields = ['rank' + str(k) for k in range(n_rank_models)]
pair_key = ['image_id', 'target_id']
unused_columns = ['image_url', 'hyperref', 'target', 'count', 'undigit_filename']

# Only relevant to a per-image head(topk) truncation that is currently disabled;
# the assignment is kept in case other cells read it.
topk = 100

for N, part in zip(rank_file_ids, parts):

    # Load, trim and sort the rank file of every dataset for this part.
    ranks = []
    for k, rank_field in enumerate(rank_fields):
        model_ranks = pd.read_csv('../input/train-ranks-for-model-0{0}/short-ranks-{1}.csv'.format(k, N))
        model_ranks = (
            model_ranks
            .drop(columns=unused_columns)
            .rename(columns={'rank': rank_field})
            # Stable sort, image_id and rank value both descending.
            .sort_values(by=['image_id', rank_field], kind='mergesort', ascending=False)
        )
        ranks.append(model_ranks)

    # Outer join: a pair present in at least one rank file is kept; columns of
    # the files it is missing from are NaN until filled below.
    result = ranks[0]
    for model_ranks in ranks[1:]:
        result = result.merge(model_ranks, how='outer', on=pair_key)

    # Missing values get 0.5 below the column minimum taken over the whole part.
    # The fill value is rounded to float32 exactly as before.
    for rank_field in rank_fields:
        fill_value = np.float32(result[rank_field].min() - 0.5)
        result[rank_field] = result[rank_field].fillna(fill_value)

    # Per-image mean and std of every rank column, flattened to
    # 'rank{k}_mean' / 'rank{k}_std' and joined back onto every row of the image.
    stat_df = result.groupby('image_id')[rank_fields].agg(['mean', 'std'])
    stat_df.columns = ['_'.join(col) for col in stat_df.columns.to_flat_index()]
    stat_df = stat_df.reset_index()

    result = pd.merge(result, stat_df, on='image_id')

    for rank_field in rank_fields:
        mean_col = rank_field + '_mean'
        std_col = rank_field + '_std'

        result[std_col] = result[std_col].astype(np.float32)
        # 1e-2 keeps the divisions finite when the per-image std is 0.
        damped_std = result[std_col] + 1e-2

        # Despite its name, '_var' is mean / (std + 0.01), not a variance.
        result[rank_field + '_var'] = (result[mean_col] / damped_std).astype(np.float32)
        # '_normed' is the rank value standardised within its image.
        result[rank_field + '_normed'] = (
            (result[rank_field] - result[mean_col]) / damped_std
        ).astype(np.float32)

    y = pd.read_parquet(os.path.join(part, 'targets.parquet'))
    image_ids = pd.read_parquet(os.path.join(part, 'image_ids.parquet'))
    target_ids = pd.read_parquet(os.path.join(part, 'target_ids.parquet'))

    # concat(axis=1) aligns on the index.
    # NOTE(review): assumes the three parquet files share the same index/row order - confirm.
    X = pd.concat([y, image_ids, target_ids], axis=1)

    # Inner join: only pairs that have a target row survive.
    result = pd.merge(result, X, on=pair_key)

    all_results.append(result)



In [12]:
# Stack the per-part frames and order them by image_id. The sort is stable
# (mergesort), so rows of one image keep the relative order they had per part.
total_result = (
    pd.concat(all_results)
    .sort_values(by=['image_id'], kind='mergesort', ascending=True)
)

# Persist the validation table, then display it as the cell output.
total_result.to_parquet('valid.parquet')

total_result

Unnamed: 0,image_id,target_id,rank0,rank1,rank2,rank3,rank4,rank5,rank6,rank0_mean,...,rank2_normed,rank3_var,rank3_normed,rank4_var,rank4_normed,rank5_var,rank5_normed,rank6_var,rank6_normed,MATCH
86469,52,233063,7.178525,6.538880,6.433822,6.433228,7.546990,7.375566,6.843082,-2.993165,...,2.904336,-0.535776,2.401313,-0.563619,2.646683,-0.583808,2.562964,-0.594538,2.352816,1
86470,52,233064,5.607776,5.350560,4.994696,4.913579,6.568535,5.669404,5.264292,-2.993165,...,2.459852,-0.535776,1.960638,-0.563619,2.376617,-0.583808,2.105134,-0.594538,1.947158,1
86471,52,233065,4.395752,5.107599,4.264780,5.157209,5.308811,5.173628,5.063816,-2.993165,...,2.234413,-0.535776,2.031287,-0.563619,2.028918,-0.583808,1.972097,-0.594538,1.895647,1
86472,52,127163,2.353567,3.259148,2.255012,3.231598,3.326448,3.246206,3.551573,-2.993165,...,1.613683,-0.535776,1.472890,-0.563619,1.481761,-0.583808,1.454894,-0.594538,1.507087,0
86473,52,152990,1.777707,2.624292,0.927192,2.583427,3.171491,2.743405,2.506178,-2.993165,...,1.203577,-0.535776,1.284930,-0.563619,1.438991,-0.583808,1.319972,-0.594538,1.238481,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,69458,269097,-6.605800,-6.353398,-6.282269,-5.450472,-5.780550,-6.079898,0.146490,-3.142374,...,-0.996508,-0.702036,-0.983249,-0.675824,-1.010808,-0.659865,-1.029812,-0.697645,0.736987,0
86,69458,269098,-6.605800,-6.353398,-6.282269,-5.450472,-5.780550,-6.079898,0.146490,-3.142374,...,-0.996508,-0.702036,-0.983249,-0.675824,-1.010808,-0.659865,-1.029812,-0.697645,0.736987,0
87,69458,269100,-6.605800,-6.353398,-6.282269,-5.450472,-5.780550,-6.079898,-0.787915,-3.142374,...,-0.996508,-0.702036,-0.983249,-0.675824,-1.010808,-0.659865,-1.029812,-0.697645,0.486037,0
88,69458,247179,-6.605800,-6.353398,-6.282269,-5.450472,-5.780550,-6.079898,-0.823042,-3.142374,...,-0.996508,-0.702036,-0.983249,-0.675824,-1.010808,-0.659865,-1.029812,-0.697645,0.476603,0


In [13]:
# Show every column name of the saved table as a plain Python list.
list(total_result.columns)

['image_id',
 'target_id',
 'rank0',
 'rank1',
 'rank2',
 'rank3',
 'rank4',
 'rank5',
 'rank6',
 'rank0_mean',
 'rank0_std',
 'rank1_mean',
 'rank1_std',
 'rank2_mean',
 'rank2_std',
 'rank3_mean',
 'rank3_std',
 'rank4_mean',
 'rank4_std',
 'rank5_mean',
 'rank5_std',
 'rank6_mean',
 'rank6_std',
 'rank0_var',
 'rank0_normed',
 'rank1_var',
 'rank1_normed',
 'rank2_var',
 'rank2_normed',
 'rank3_var',
 'rank3_normed',
 'rank4_var',
 'rank4_normed',
 'rank5_var',
 'rank5_normed',
 'rank6_var',
 'rank6_normed',
 'MATCH']