# Читаем датафрейм с хешами

In [1]:
import pandas as pd
import imagehash

In [2]:
df = pd.read_csv('hash_values/hash_database_full.csv', index_col=0)
df = df.reset_index().drop('index', axis=1)
# df['hash'] = df['hash'].apply(imagehash.hex_to_hash)
df.head()

Unnamed: 0,base,ID,Pic,Pic_url,hash
0,1,0,1,https://forum.vgd.ru/file.php?fid=5476&key=0,806969715521b059
1,1,2,1,https://forum.vgd.ru/file.php?fid=5492&key=0,f4d8cc0ca543d6b6
2,1,3,1,https://forum.vgd.ru/file.php?fid=20890&key=19...,f1732b4d73511723
3,1,4,1,https://forum.vgd.ru/file.php?fid=20894&key=16...,b292ced8b3f2f2f2
4,1,5,1,https://forum.vgd.ru/file.php?fid=20895&key=90...,b8186ca4ae8a989a


In [3]:
df.shape

(44884, 5)

In [4]:
df['base'].value_counts()

3    20882
1    14074
2     9928
Name: base, dtype: int64

# BKTree
## Пример

In [5]:
from pybktree_imagehash import BKTree, hamming_distance

In [6]:
hamming_distance(imagehash.hex_to_hash('806969715521b059'), imagehash.hex_to_hash('806969715521b050'))

TypeError: int() can't convert non-string with explicit base

In [None]:
tree = BKTree(hamming_distance, ['806969715521b059', 'f4d8cc0ca543d6b6', 
                                                   'f1732b4d73511723', 'b292ced8b3f2f2f2'])
# for item in tree:
#     print(type(item))
# tree = BKTree(hamming_distance, df.head()['hash'].to_list())
tree.add('b8186ca4ae8a989a')       
sorted(tree)             

In [None]:
sorted(tree.find('b8186ca4ae8a989b', 1))  # find elements at most 1 bit away from element 13

In [None]:
for item in tree:
    print(item)
    print(imagehash.hex_to_hash(item) - imagehash.hex_to_hash('b8186ca4ae8a989d'))
#     print(type(item))

In [None]:
import collections
Item = collections.namedtuple('Item', 'hash_value hash_db_index')

In [None]:
def item_distance(x, y):
     return hamming_distance(x.hash_value, y.hash_value)

In [None]:
tree = BKTree(item_distance, [Item('806969715521b059', '1'), Item('f4d8cc0ca543d6b6', '2'),
                                           Item('f1732b4d73511723', '3'), Item('b8186ca4ae8a989a', '4')])
tree.add(Item('b292ced8b3f2f2f2', '5'))
sorted(tree)

In [None]:
sorted(tree.find(Item('b8186ca4ae8a989b', '1'), 1))

## Создаем дерево для сканов открыток

In [None]:
tree = BKTree(item_distance, [Item(value, ind) for ind, value in enumerate(df['hash'].to_list())])
sorted(tree)[:10]

## Поиск в дереве

Ищутся хеши на расстоянии **меньше либо равном 1** до данного

In [None]:
sorted(tree.find(Item('0000000000000000', 18964), 1))

Составляем таблицу возможных дубликатов (расстояние <= 3)

In [None]:
from tqdm import tqdm

In [None]:
possible_doubles = []

for ind, row in tqdm(df.iterrows(), total=df.shape[0]):
    this_id = row.name
    this_hash = row['hash']
    search_results = sorted(tree.find(Item(this_hash, this_id), 3))
    possible_doubles += [(t[0], this_id, t[1].hash_db_index) for t in search_results 
                         if t[1].hash_db_index != this_id]  # not the picture itself

In [None]:
possible_doubles = pd.DataFrame(possible_doubles, columns=['hash_dist', 'hash_db_index_1', 'hash_db_index_2'])
possible_doubles.to_csv('possible_doubles.csv')

In [None]:
possible_doubles = pd.read_csv('possible_doubles.csv', index_col=0)
possible_doubles.head()

## Чистим и дополняем таблицу

Удаляем дублирующиеся пары

In [None]:
possible_doubles['temp'] = possible_doubles.apply(
    lambda row: (min(row['hash_db_index_1'], row['hash_db_index_2']), max(row['hash_db_index_1'], row['hash_db_index_2'])),
    axis=1
)
possible_doubles['hash_db_index_1'] = possible_doubles['temp'].apply(lambda x: x[0])
possible_doubles['hash_db_index_2'] = possible_doubles['temp'].apply(lambda x: x[1])
possible_doubles = possible_doubles.drop('temp', axis=1)

possible_doubles = possible_doubles.drop_duplicates().reset_index().drop('index', axis=1)

In [None]:
possible_doubles.head()

In [None]:
possible_doubles.shape

In [None]:
possible_doubles['hash_dist'].value_counts()

In [None]:
possible_doubles['hash_db_index_1'].nunique(), possible_doubles['hash_db_index_2'].nunique()

Добавляем в таблицу данные об изображениях

In [None]:
possible_doubles['base_1'] = possible_doubles['hash_db_index_1'].apply(lambda x: df.iloc[x]['base'])
possible_doubles['ID_1'] = possible_doubles['hash_db_index_1'].apply(lambda x: df.iloc[x]['ID'])
possible_doubles['Pic_1'] = possible_doubles['hash_db_index_1'].apply(lambda x: df.iloc[x]['Pic'])
possible_doubles['Pic_url_1'] = possible_doubles['hash_db_index_1'].apply(lambda x: df.iloc[x]['Pic_url'])
possible_doubles['hash_value_1'] = possible_doubles['hash_db_index_1'].apply(lambda x: df.iloc[x]['hash'])

possible_doubles['base_2'] = possible_doubles['hash_db_index_2'].apply(lambda x: df.iloc[x]['base'])
possible_doubles['ID_2'] = possible_doubles['hash_db_index_2'].apply(lambda x: df.iloc[x]['ID'])
possible_doubles['Pic_2'] = possible_doubles['hash_db_index_2'].apply(lambda x: df.iloc[x]['Pic'])
possible_doubles['Pic_url_2'] = possible_doubles['hash_db_index_2'].apply(lambda x: df.iloc[x]['Pic_url'])
possible_doubles['hash_value_2'] = possible_doubles['hash_db_index_2'].apply(lambda x: df.iloc[x]['hash'])

possible_doubles = possible_doubles[['hash_dist', 
                                     'hash_db_index_1', 'base_1', 'ID_1', 'Pic_1', 'Pic_url_1', 'hash_value_1',
                                     'hash_db_index_2', 'base_2', 'ID_2', 'Pic_2', 'Pic_url_2', 'hash_value_2']]
possible_doubles.head()

Статистика по базам

In [None]:
possible_doubles['base_1'].value_counts()

In [None]:
possible_doubles['base_2'].value_counts()

Случаи, когда возможные дубликаты в паре из разных баз

In [None]:
possible_doubles[possible_doubles['base_1'] != possible_doubles['base_2']]

Пример, когда хеши полностью совпадают

In [None]:
with pd.option_context('display.max_colwidth', None):
    display(possible_doubles[possible_doubles['hash_dist'] == 0].head())

In [None]:
possible_doubles.to_csv('possible_doubles_with_lookup_data.csv')

Случаи, когда предположительно лицевая сторона одной открытки дублирует другую. Оставляем в таблице, потому что их немного и это может быть ошибка при загрузке (в базе случайно поменяли местами ссылки на лицевую и оборотную сторону)

In [5]:
possible_doubles[possible_doubles['Pic_1'] != possible_doubles['Pic_2']].shape

(26, 13)

In [6]:
possible_doubles[possible_doubles['Pic_1'] != possible_doubles['Pic_2']].head()

Unnamed: 0,hash_dist,hash_db_index_1,base_1,ID_1,Pic_1,Pic_url_1,hash_value_1,hash_db_index_2,base_2,ID_2,Pic_2,Pic_url_2,hash_value_2
4,0,104,1,105,1,https://forum.vgd.ru/file.php?fid=448884&key=1...,c9e9eed11919c541,20311,1,3515,2,https://storage.yandexcloud.net/postcards-muse...,c9e9eed11919c541
19,2,775,1,791,1,https://forum.vgd.ru/file.php?fid=191244&key=1...,7d3d75d7675f537f,17700,1,773,2,https://forum.vgd.ru/file.php?fid=191246&key=5...,7d3d75d76557537f
29,1,1082,1,1106,1,https://forum.vgd.ru/file.php?fid=234804&key=1...,9c1f4f671f5973b4,32842,3,8869,2,https://storage.yandexcloud.net/postcards-open...,9c1f4f671f5953b4
30,2,1085,1,1109,1,https://forum.vgd.ru/file.php?fid=234811&key=1...,466bb5b3c8ccca60,32843,3,8870,2,https://storage.yandexcloud.net/postcards-open...,466bb4b1c8ccca60
73,2,2319,1,2376,1,https://storage.yandexcloud.net/postcards-new/...,f8f0f0f8f9f8f8f8,40696,3,11222,2,https://storage.yandexcloud.net/postcards-open...,f8f0f8f8f8f8f8f8
