Артём Панов, TG - @arsepan

## Описание проекта

Необходимо разработать алгоритм, который для всех товаров из validation.csv предложит несколько вариантов наиболее похожих товаров из base, оценить качество алгоритма по метрике accuracy@5. Речь идёт о задаче метчинга - для каждого объекта из множества A найти один или несколько объектов из B, которые близки к нему по некоторой заданной метрике.

Данные:
- **base.csv** - анонимизированный набор товаров. Каждый товар представлен как уникальный id (0-base, 1-base, 2-base) и вектор признаков размерностью 72.
- **train.csv -** обучающий датасет. Каждая строчка - один товар, для которого известен уникальный id (0-query, 1-query, …) , вектор признаков И id товара из *base.csv*, который максимально похож на него (по мнению экспертов).
- **validation.csv** - датасет с товарами (уникальный id и вектор признаков), для которых надо найти наиболее близкие товары из *base.csv*
- **validation_answer.csv** - правильные ответы к предыдущему файлу.

## Создание зависимостей и функций

In [1]:
import pandas as pd
import numpy as np
import faiss

from sklearn.preprocessing import StandardScaler

In [2]:
def show_df(df):
    display(df.head(3))
    print(f'Количество наблюдений: {df.shape[0]}')
    print(f'Количество признаков: {df.shape[1]}')
    print(f'Количество NaN значений: {df.isna().sum().sum()}')
    print(f'Количество дубликатов: {df.duplicated().sum()}')
    try:
        print(f'Значение размаха: {min(list(df.min()[:-1])), max(list(df.max()[:-1]))}')
    except:
        pass

## Распаковка данных

**Base** - можно сказать, что это база данных готовых векторов после определённого преобразования. Именно из этой БД мы будем искать самые близкие товары.

In [3]:
base = pd.read_csv('base.csv', index_col='Id')
show_df(base)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-base,-115.08389,11.152912,-64.42676,-118.88089,216.48244,-104.69806,-469.070588,44.348083,120.915344,181.4497,...,-42.808693,38.800827,-151.76218,-74.38909,63.66634,-4.703861,92.93361,115.26919,-112.75664,-60.830353
1-base,-34.562202,13.332763,-69.78761,-166.53348,57.680607,-86.09837,-85.076666,-35.637436,119.718636,195.23419,...,-117.767525,41.1,-157.8294,-94.446806,68.20211,24.346846,179.93793,116.834,-84.888941,-59.52461
2-base,-54.233746,6.379371,-29.210136,-133.41383,150.89583,-99.435326,52.554795,62.381706,128.95145,164.38147,...,-76.3978,46.011803,-207.14442,127.32557,65.56618,66.32568,81.07349,116.594154,-1074.464888,-32.527206


Количество наблюдений: 2918139
Количество признаков: 72
Количество NaN значений: 0
Количество дубликатов: 0
Значение размаха: (-1297.931468499947, 1557.433379634992)


In [4]:
train = pd.read_csv('train.csv', index_col='Id')
show_df(train)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,Target
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-query,-53.882748,17.971436,-42.117104,-183.93668,187.51749,-87.14493,-347.360606,38.307602,109.08556,30.413513,...,70.10736,-155.80257,-101.965943,65.90379,34.4575,62.642094,134.7636,-415.750254,-25.958572,675816-base
1-query,-87.77637,6.806268,-32.054546,-177.26039,120.80333,-83.81059,-94.572749,-78.43309,124.9159,140.33107,...,4.669178,-151.69771,-1.638704,68.170876,25.096191,89.974976,130.58963,-1035.092211,-51.276833,366656-base
2-query,-49.979565,3.841486,-116.11859,-180.40198,190.12843,-50.83762,26.943937,-30.447489,125.771164,211.60782,...,78.039764,-169.1462,82.144186,66.00822,18.400496,212.40973,121.93147,-1074.464888,-22.547178,1447819-base


Количество наблюдений: 100000
Количество признаков: 73
Количество NaN значений: 0
Количество дубликатов: 0
Значение размаха: (-1297.8719841036757, 1557.383334499213)


In [5]:
validation = pd.read_csv('validation.csv', index_col='Id')
show_df(validation)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000-query,-57.372734,3.597752,-13.213642,-125.92679,110.74594,-81.279594,-461.003172,139.81572,112.88098,75.21575,...,-75.51302,52.830902,-143.43945,59.051935,69.28224,61.927513,111.59253,115.140656,-1099.130485,-117.07936
100001-query,-53.758705,12.7903,-43.268543,-134.41762,114.44991,-90.52013,-759.626065,63.995087,127.117905,53.128998,...,-79.44183,29.185436,-168.6059,-82.872443,70.7656,-65.97595,97.07716,123.39164,-744.442332,-25.00932
100002-query,-64.175095,-3.980927,-7.679249,-170.16093,96.44616,-62.37774,-759.626065,87.477554,131.27011,168.92032,...,-134.79541,37.36873,-159.66231,-119.232725,67.71044,86.00206,137.63641,141.08163,-294.052271,-70.969604


Количество наблюдений: 100000
Количество признаков: 72
Количество NaN значений: 0
Количество дубликатов: 0
Значение размаха: (-1297.9239987642586, 1557.293478372255)


In [6]:
validation_answer = pd.read_csv('validation_answer.csv', index_col='Id')
show_df(validation_answer)

Unnamed: 0_level_0,Expected
Id,Unnamed: 1_level_1
100000-query,2676668-base
100001-query,91606-base
100002-query,472256-base


Количество наблюдений: 100000
Количество признаков: 1
Количество NaN значений: 0
Количество дубликатов: 8498


## Подготовка данных

In [7]:
features = train.drop('Target', axis=1)
target = train['Target']
print(f'Features shape: {features.shape}')
print(f'Target shape: {target.shape}')

Features shape: (100000, 72)
Target shape: (100000,)


In [8]:
scaler = StandardScaler()

features_idx = features.index.tolist()
features_sc = scaler.fit_transform(features)
features = pd.DataFrame(index=features_idx, data=features_sc)

base_idx = base.index.tolist()
base_sc = scaler.transform(base)
base = pd.DataFrame(index=base_idx, data=base_sc)

In [257]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0-query,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-0.791579,1.309634,-0.087262,-1.170455,-1.03143,0.206023,-0.173143,0.906949,0.724239,0.547062
1-query,-0.094858,-0.173152,0.296912,-1.523962,0.189363,-0.370205,1.240223,-1.510417,0.284657,0.218924,...,1.314829,-1.270984,0.012808,-0.157858,0.202021,0.037262,0.266272,0.713549,-0.801388,-0.067236
2-query,1.369923,-0.771418,-1.852466,-1.677699,1.642697,0.777931,1.675742,-0.783196,0.416398,1.316968,...,0.949086,1.622456,-0.412557,0.687758,-0.974613,-0.083446,2.234588,0.312377,-0.898375,0.629833
3-query,1.453981,0.286998,-1.834136,1.228656,-0.358857,-1.256206,-1.198981,0.878306,0.144055,1.795195,...,0.638241,0.300385,0.165927,0.807328,-1.995759,1.49688,0.157014,0.143082,-0.898375,0.693994
4-query,0.220765,1.367848,-0.389568,-0.045702,-1.145549,1.982654,0.427564,0.375133,0.442602,-0.350126,...,0.403412,0.320641,-1.069452,-1.328515,-0.475157,0.35022,0.067677,-1.05225,-0.898375,0.663749


## Создание модели

In [10]:
dims = base.shape[1]
n_cells = 25
quantizer = faiss.IndexFlatL2(dims)
idx_l2 = faiss.IndexIVFFlat(quantizer, dims, n_cells)

In [11]:
idx_l2.train(np.ascontiguousarray(base.values[:500000, :]).astype('float32'))

idx_l2.add(np.ascontiguousarray(base.values).astype('float32'))

In [12]:
base_index = {k: i for k, i in enumerate(base.index.tolist())}
list(base_index.items())[:3]

[(0, '0-base'), (1, '1-base'), (2, '2-base')]

In [13]:
vecs, idx = idx_l2.search(np.ascontiguousarray(features.values).astype('float32'), 100)

In [14]:
acc = 0
for trgt, el in zip(target.values.tolist(), idx.tolist()):
    acc += int(trgt in [base_index[i] for i in el])

print('Точность модели:', '{:.2%}'.format(acc / target.shape[0]))

Точность модели: 54.14%


In [126]:
df_cat = []
for i in range(idx.shape[0]): # Проходимся по всем ответам
    if i % 5000 == 0:
        print('{:.2%}'.format(i / idx.shape[0]))
    str_idx = [base_index[i] for i in idx[i]] # Создаём список индексов каждого ответа
    if target[i] in str_idx:
        df_cat.append(np.concatenate((features.iloc[i].values, base.loc[base.index == str_idx[str_idx.index(target[i])]].values[0], np.array([1]))))
        if target[i] == str_idx[0]:
            df_cat.append(np.concatenate((features.iloc[i].values, base.loc[base.index == str_idx[1]].values[0], np.array([0]))))
        else:
            df_cat.append(np.concatenate((features.iloc[i].values, base.loc[base.index == str_idx[0]].values[0], np.array([0]))))

0.00%
5.00%
10.00%
15.00%
20.00%
25.00%
30.00%
35.00%
40.00%
45.00%
50.00%
55.00%
60.00%
65.00%
70.00%
75.00%
80.00%
85.00%
90.00%
95.00%


In [127]:
df_cat = pd.DataFrame(data=df_cat)
df_cat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,135,136,137,138,139,140,141,142,143,144
0,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,1.712308,0.010568,0.428418,-0.496977,0.403141,-0.535591,0.083045,0.724239,0.735407,1.0
1,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,1.698041,-0.311033,-0.402364,-1.143644,0.421038,-0.345688,0.731960,0.296695,0.291394,0.0
2,0.220765,1.367848,-0.389568,-0.045702,-1.145549,1.982654,0.427564,0.375133,0.442602,-0.350126,...,0.320632,-1.069614,-1.328515,-0.474964,0.350176,0.067644,-1.052127,-0.898375,0.664229,1.0
3,0.220765,1.367848,-0.389568,-0.045702,-1.145549,1.982654,0.427564,0.375133,0.442602,-0.350126,...,0.524454,-1.047661,-1.707922,-0.643422,0.366380,0.069110,-1.166881,-0.898375,0.787702,0.0
4,-0.291257,-0.946164,-0.462619,-1.250656,0.692396,0.504676,1.952317,0.827659,-0.728839,-0.679979,...,-0.950038,0.347621,0.729522,-1.698053,0.748115,-0.140631,0.543268,-0.223341,0.444216,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108283,-0.561866,0.072637,-0.774536,-1.890278,-0.939264,-0.340158,-1.143342,-1.056054,-0.934419,1.957286,...,0.317893,-1.174631,-0.909449,1.919309,-0.693679,0.761175,-0.042787,-0.898375,-0.373225,0.0
108284,1.216121,-0.592752,0.367880,-0.052078,3.193245,-0.973471,0.046847,0.709647,-0.767644,1.021824,...,-1.971809,-0.745338,1.137116,-0.140495,1.460118,-0.552218,-0.105679,-0.012652,1.226066,1.0
108285,1.216121,-0.592752,0.367880,-0.052078,3.193245,-0.973471,0.046847,0.709647,-0.767644,1.021824,...,-2.083689,-0.918866,-1.244925,-0.712622,1.648413,-0.687554,-0.155792,-0.012652,1.036396,0.0
108286,-0.144157,0.660713,-0.243901,0.094449,0.664031,-0.470265,1.292502,0.848588,-0.788420,-0.813754,...,-1.250578,0.425201,-0.937286,-0.497796,0.407031,0.909611,-0.673563,1.641060,-0.767811,1.0


In [131]:
df_cat[144]

0         1.0
1         0.0
2         1.0
3         0.0
4         1.0
         ... 
108283    0.0
108284    1.0
108285    0.0
108286    1.0
108287    0.0
Name: 144, Length: 108288, dtype: float64

In [135]:
cat_features = df_cat.drop(144, axis=1)
cat_target = df_cat[144]

In [150]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

model = CatBoostClassifier(logging_level='Silent', loss_function='Logloss')
parameters_cbc = {'max_depth': np.arange(6, 11),
                  'learning_rate':[0.001, 0.01, 0.1, 1],
                  'iterations': [1000,1500,2000]}
catboost_grid = GridSearchCV(model,
                             parameters_cbc,
                             cv=3,
                             scoring='accuracy',
                             verbose=2).fit(cat_features, cat_target)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=6; total time=  32.1s
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=6; total time=  32.0s
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=6; total time=  31.9s
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=7; total time=  45.0s
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=7; total time=  45.5s
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=7; total time=  46.2s
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=8; total time= 1.2min
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=8; total time= 1.2min
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=8; total time= 1.2min
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=9; total time= 2.0min
[CV] END ..iterations=1000, learning_rate=0.001, max_depth=9; total time= 1.9min
[CV] END ..iterations=1000, learning_rate=0.001

[CV] END ....iterations=1500, learning_rate=0.1, max_depth=9; total time= 2.8min
[CV] END ...iterations=1500, learning_rate=0.1, max_depth=10; total time= 8.0min
[CV] END ...iterations=1500, learning_rate=0.1, max_depth=10; total time= 8.1min
[CV] END ...iterations=1500, learning_rate=0.1, max_depth=10; total time= 8.0min
[CV] END ......iterations=1500, learning_rate=1, max_depth=6; total time=  49.1s
[CV] END ......iterations=1500, learning_rate=1, max_depth=6; total time=  48.4s
[CV] END ......iterations=1500, learning_rate=1, max_depth=6; total time=  48.1s
[CV] END ......iterations=1500, learning_rate=1, max_depth=7; total time= 1.1min
[CV] END ......iterations=1500, learning_rate=1, max_depth=7; total time= 1.0min
[CV] END ......iterations=1500, learning_rate=1, max_depth=7; total time= 1.0min
[CV] END ......iterations=1500, learning_rate=1, max_depth=8; total time= 1.6min
[CV] END ......iterations=1500, learning_rate=1, max_depth=8; total time= 1.6min
[CV] END ......iterations=15

In [157]:
catboost_grid.best_score_

0.9125664893617021

In [158]:
model_cat = CatBoostClassifier(iterations = 2000,
                               learning_rate = 0.1,
                               max_depth = 9,
                               logging_level ='Silent',
                               loss_function ='Logloss').fit(cat_features, cat_target)

In [159]:
quantizer_ip = faiss.IndexFlatIP(dims)
idx_ip = faiss.IndexIVFFlat(quantizer_ip, dims, n_cells)

idx_ip.train(np.ascontiguousarray(base.values[:500000, :]).astype('float32'))

idx_ip.add(np.ascontiguousarray(base.values).astype('float32'))

In [160]:
vecs_ip, idx_ip = idx_ip.search(np.ascontiguousarray(features.values).astype('float32'), 100)

In [161]:
acc_ip = 0
for trgt_ip, el_ip in zip(target.values.tolist(), idx_ip.tolist()):
    acc_ip += int(trgt_ip in [base_index[i] for i in el_ip])

print('Точность модели:', '{:.2%}'.format(acc_ip / target.shape[0]))

Точность модели: 61.98%


In [163]:
idx_ip[0]

array([ 755584,  598613,  336969, 1934845,   13374, 1136231,  480296,
        988777, 2360257,  583287,  450667, 1653095, 1818641, 1631947,
        503716,  629775,   89840,  986050, 2346335, 1747988,  143978,
        352715,  223859,  455829, 2295048, 1674977,  443428, 1375244,
        854264,  751217, 1746258,  728097, 1113711,  199196, 2212973,
        792610,  684958, 1546520, 1679537,  108138,  725525, 1299775,
        458465,  164935,  669932, 2398299, 1307670, 2331892,   57113,
       1285249,  679629, 2356601, 1037661,  547719,  232405, 1003399,
        277931,  703484, 1472986, 1374819, 1228782, 2674563,   49353,
        642286, 2700063,  851029,  113294, 1541141,  960901,  105660,
        803577, 1012249,  454434, 2654893, 2711635,  233844,  264577,
       1379669, 2860069, 2042379,   58509, 1379579, 1886302, 1407122,
       1364891,  682218,  363988, 2815333, 2356997, 1131378, 2052752,
        914354,  390350, 1713683, 2909593, 1091351, 2202395, 1628501,
       1598432, 2882

In [165]:
second_mode = []
str_idx_sec = [base_index[i] for i in idx_ip[0]]
for i in range(len(str_idx_sec)):
    second_mode.append(np.concatenate((features.iloc[0].values, base.loc[base.index == str_idx_sec[i]].values[0])))

In [167]:
second_mode = pd.DataFrame(data=second_mode)
second_mode

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
0,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-0.602001,1.698041,-0.311033,-0.402364,-1.143644,0.421038,-0.345688,0.731960,0.296695,0.291394
1,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-0.851277,1.712308,0.010568,0.428418,-0.496977,0.403141,-0.535591,0.083045,0.724239,0.735407
2,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-0.260577,1.821196,-0.434070,0.988730,-1.095766,0.289562,-0.460381,0.539531,0.295408,0.108781
3,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-0.441395,2.220264,-0.425010,-0.333752,-1.191454,0.165092,-1.226741,0.024671,1.644646,0.862929
4,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-0.209855,1.823598,-0.648673,-0.970468,-0.608199,0.329959,-0.644047,0.179109,1.945150,0.528980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,0.490877,2.263516,-0.383584,-1.493979,-0.658389,-0.928480,-0.118126,0.158648,-0.898375,0.133643
96,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,0.271724,1.569022,-0.715068,-0.112024,-0.516561,-0.567459,-0.357619,0.011569,-0.319248,0.582949
97,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-1.000115,1.358776,-0.504707,0.632585,-1.219792,0.121131,-0.221742,-0.138919,-0.898375,0.314993
98,1.218659,2.079877,0.039629,-1.850675,1.587961,-0.486308,0.334226,0.258787,-2.153763,-1.474397,...,-0.542110,1.468388,-1.149459,-0.548859,-0.940821,0.273576,-0.487927,0.749021,-0.898375,-0.532712


In [224]:
def predict_5(idx_ip, iteration):
    second_mode = []
    str_idx_sec = [base_index[i] for i in idx_ip[iteration]]
    for i in range(len(str_idx_sec)):
        second_mode.append(np.concatenate((test_features.iloc[iteration].values, base.loc[base.index == str_idx_sec[i]].values[0])))
    
    proba = model_cat.predict_proba(second_mode)

    proba = proba.tolist()
    for i in range(len(str_idx_sec)):
        proba[i].append(str_idx_sec[i])

    sorted_proba = sorted(proba, key=lambda x: x[1], reverse=True)

    true_idx = []
    for line in sorted_proba[:5]:
        true_idx.append(line[2])
    return true_idx

In [212]:
test = validation.join(validation_answer)
test_features = test.drop('Expected', axis=1)
test_target = test['Expected']

In [244]:
test_features_idx = test_features.index.tolist()
test_features_sc = scaler.transform(test_features)
test_features = pd.DataFrame(index=test_features_idx, data=test_features_sc)
test_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
100000-query,1.083408,-0.820601,0.778643,0.988113,-0.021481,-0.282074,-0.073071,1.797141,-1.569137,-0.784202,...,0.183615,0.628320,0.214130,0.454690,0.806680,0.701242,0.613806,-0.002273,-0.959134,-1.663807
100001-query,1.223466,1.034371,0.010189,0.572604,0.056169,-0.603834,-1.143342,0.648080,0.623842,-1.124457,...,0.055540,-0.304162,-0.399385,-0.977745,1.613730,-1.604551,0.380450,0.380033,-0.085429,0.570094
100002-query,0.819788,-2.349909,0.920148,-1.176541,-0.321262,0.376098,-1.143342,1.003957,1.263425,0.659352,...,-1.748936,0.018554,-0.181356,-1.344728,-0.048488,1.135248,1.032499,1.199692,1.024018,-0.545043
100003-query,-0.540937,1.707068,1.368014,-0.095025,-0.588534,-2.108414,1.789113,-0.610416,-1.236203,-0.784341,...,0.127444,0.284028,0.480081,-1.214390,1.678126,0.009890,0.960919,1.300299,-0.220717,-0.992043
100004-query,0.224609,-1.620082,0.706699,-1.211971,1.125420,1.562992,1.700787,0.199737,0.898960,-0.370242,...,-1.389526,0.344580,0.437936,-0.002688,1.533224,-0.138761,1.304700,-0.625011,-1.138368,-1.863456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995-query,1.498075,-0.205857,1.305098,1.467708,-0.665375,0.868079,1.515748,1.317251,-0.052086,-0.346256,...,0.505431,1.022404,0.081796,-0.811519,-0.438645,0.964385,-0.110709,-0.135112,1.311427,-0.832199
199996-query,-0.366522,0.643370,1.466786,1.003157,-1.891473,-0.572684,1.865467,-1.026671,0.078687,-0.802422,...,0.525258,0.044228,0.940895,-0.327397,-1.978959,-2.524976,0.331406,-1.339543,-1.383739,-0.363592
199997-query,1.011646,1.194903,1.695747,0.547724,0.700935,0.344747,1.185630,-0.075080,0.782827,0.658044,...,1.053567,0.109693,0.070371,1.479709,2.045514,0.039805,-1.758723,-0.799964,1.928378,0.495561
199998-query,2.014247,0.808568,0.309102,0.911422,-0.758683,1.276888,-1.143342,-1.537520,0.099201,-1.435434,...,0.477036,-0.832314,0.223021,1.253450,0.231925,1.667144,-1.784805,1.594785,1.322033,-0.301284


In [246]:
_, index = idx_l2.search(np.ascontiguousarray(test_features.values).astype('float32'), 100)

In [251]:
acc5 = 0
n = test_features.shape[0]
for i in range(n):
    ans = predict_5(index, i)
    acc5 += int(test_target[i] in ans)
print(acc5/n*100)

KeyboardInterrupt: 

In [254]:
acc5/i

0.5820934144132488