### Адаптация поиска по хэш-функциям для рукописных текстов

In [1]:
import cv2
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import time
import os
from PIL import Image
import imagehash
# Show floats in pandas output with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
def load_image(path, scale=100):
    """Read an image file as RGB and rescale it by a percentage.

    Args:
        path: Path of the image file to read.
        scale: Size of the result as a percentage of the original
            dimensions (100 keeps the original width and height).

    Returns:
        The image with channels in RGB order, resized to ``scale`` percent.

    Raises:
        OSError: If OpenCV could not read an image from ``path``.
    """
    image = cv2.imread(path)
    # cv2.imread reports failure by returning None rather than raising;
    # without this check the problem would only show up as an opaque error
    # inside cvtColor below.
    if image is None:
        raise OSError(f"cannot read image file: {path!r}")

    # OpenCV loads channels as BGR; the rest of the code works on RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    width = int(image.shape[1] * scale / 100)
    height = int(image.shape[0] * scale / 100)
    return cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)

In [3]:
def preprocess_image(image):
    """Prepare an RGB image for hashing.

    The image is converted to grayscale, contrast-equalised with CLAHE
    (clip limit 2.0, 8x8 tile grid) and then smoothed with a 7x7 Gaussian
    blur.

    Args:
        image: Image with channels in RGB order.

    Returns:
        The single-channel, equalised and blurred image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    equalized = equalizer.apply(gray)
    return cv2.GaussianBlur(equalized, (7, 7), 0)

In [4]:
def detect_similar_hash(src, templates, metric="EUCL", hash_size=16, dist=90):
    """Score how close ``src`` is to a set of templates using average hashes.

    Every image is preprocessed, hashed with ``imagehash.average_hash`` and
    compared with the source hash; the per-template hash differences are
    then combined into a single score according to ``metric``.

    Args:
        src: Source image (RGB array) to identify.
        templates: Iterable of template images (RGB arrays).
        metric: How the per-template differences are combined:
            ``"EUCL"`` - square root of the sum of squared differences,
            ``"GEOM"`` - geometric mean of the differences,
            ``"MIN"``  - smallest difference.
        hash_size: Hash size passed to ``imagehash.average_hash``.
        dist: Largest score still treated as a match.

    Returns:
        The combined score as a float, or ``float("inf")`` when the score
        exceeds ``dist`` or when there are no templates to compare against.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Fail fast: an unknown metric previously left the accumulator undefined
    # and crashed later with an unrelated UnboundLocalError.
    if metric not in ("EUCL", "GEOM", "MIN"):
        raise ValueError(
            f"unknown metric {metric!r}; expected 'EUCL', 'GEOM' or 'MIN'"
        )

    templates = list(templates)
    # With nothing to compare against there is no evidence of similarity, so
    # report "no match" rather than a neutral 0.0 / 1.0 score.
    if not templates:
        return float("inf")

    hash_src = _average_hash(src, hash_size)
    diffs = [hash_src - _average_hash(template, hash_size)
             for template in templates]

    if metric == "EUCL":
        score = sum(diff ** 2 for diff in diffs) ** 0.5
    elif metric == "GEOM":
        product = 1.0
        for diff in diffs:
            product *= diff
        # n-th root for n templates (the root used to be hard-coded to 5).
        score = product ** (1.0 / len(diffs))
    else:  # "MIN"
        score = min(diffs)

    if score > dist:
        return float("inf")
    return float(score)


def _average_hash(image, hash_size):
    """Preprocess an RGB image and return its average hash."""
    # imagehash also provides phash, dhash, dhash_vertical and whash, which
    # could be swapped in here.
    return imagehash.average_hash(
        Image.fromarray(preprocess_image(image)), hash_size
    )

In [5]:
def find_person(src_path, templates_paths, metric, hash_size, dist):
    """Score a source image against every person's template folder.

    Args:
        src_path: Path of the image to identify.
        templates_paths: Mapping of person name to the directory holding
            that person's template images; the directory is walked
            recursively and every file in it is loaded as a template.
        metric: Metric name forwarded to ``detect_similar_hash``.
        hash_size: Hash size forwarded to ``detect_similar_hash``.
        dist: Match threshold forwarded to ``detect_similar_hash``.

    Returns:
        A DataFrame with one row per person and the columns ``name`` and
        ``metric``.
    """
    src = load_image(src_path)
    rows = []
    for name, folder in templates_paths.items():
        templates = []
        for root, _, files in os.walk(folder):
            for filename in files:
                # os.path.join keeps the path valid on every platform,
                # unlike a hard-coded backslash separator.
                templates.append(load_image(os.path.join(root, filename)))
        score = detect_similar_hash(src, templates, metric, hash_size, dist)
        rows.append({"name": name, "metric": score})
    # DataFrame.append no longer exists in pandas 2.x; building the frame
    # once from the collected rows also avoids copying it on every iteration.
    return pd.DataFrame(rows, columns=["name", "metric"])

In [6]:
# Map each person's name to the folder that holds their template images.
# The folders are built with os.path.join so the paths work on any platform.
templates_paths = {
    name: os.path.join("templates", name)
    for name in ("Ioannov", "Andreeva", "Georgieva", "Ivanova", "Nikitina")
}
# Candidate query images:
# "Andreeva.png", "Georgieva.png", "Ioannov.png", "Ivanova.png", "Nikitina.png"

hash_size = 24
dist = 500

# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()

df = find_person("Georgieva.png", templates_paths, "MIN", hash_size, dist)
print(df[["name", "metric"]])

all_time = time.perf_counter() - start_time
print("\n%s seconds" % all_time)

# Select the best-scoring row(s) with a boolean mask; this does not depend
# on the index labels happening to equal row positions.
df.loc[df["metric"] == df["metric"].min()]

        name  metric
0    Ioannov  149.00
1   Andreeva  173.00
2  Georgieva  140.00
3    Ivanova  190.00
4   Nikitina  162.00

10.878721475601196 seconds


Unnamed: 0,metric,name
2,140.0,Georgieva
