In [12]:
from pathlib import Path
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm as tn
from pymongo import MongoClient, UpdateOne
from pandarallel import pandarallel
from creds import MONGO_SERVER_URL

# Enable tqdm's pandas integration on the `tn` alias imported above.
tn.pandas()
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to one worker so nb_workers is always an integer.
pandarallel.initialize(progress_bar=True, nb_workers=os.cpu_count() or 1)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [9]:
# Dataset root (relative path) and the two image folders read by the cells below.
DATASET_PATH = Path("../dataset")
IMAGES_TRAIN = DATASET_PATH.joinpath("images_train_unpadded")
IMAGES_TEST = DATASET_PATH.joinpath("images_test_unpadded")

In [3]:
# Version label for this run; not referenced by the cells visible here.
VERSION = "v2.9_sift"

# Open the MongoDB connection and take the `featurestore` collection of the
# `csc` database (subscript access is equivalent to attribute access).
mc = MongoClient(MONGO_SERVER_URL)
featurestore = mc["csc"]["featurestore"]

# Fields requested for every document in addition to the feature columns.
base_fields = ["_id", "image_url1", "image_url2", "ID", "is_same"]

# Feature columns requested from the store. Entries that are commented out
# are excluded from the projection.
features_v2 = [
    # "left_grayscale",
    # "right_grayscale",
    # "unpadded.ahash_16",
    "unpadded.ahash_4",
    "unpadded.ahash_8",
    "unpadded.colorhash_21",
    # "unpadded.colorhash_63",
    # "unpadded.dhash_16",
    "unpadded.dhash_4",
    "unpadded.dhash_8",
    # "unpadded.dhash_verical_16",
    "unpadded.height_diff",
    "unpadded.height_ratio",
    "unpadded.left_height",
    "unpadded.left_width",
    # "unpadded.phash_16_8",
    "unpadded.phash_4",
    "unpadded.phash_8",
    "unpadded.right_height",
    "unpadded.right_width",
    "unpadded.sift_similarity",
    # "unpadded.whash_16_db4",
    # "unpadded.whash_16_haar",
    "unpadded.whash_4_haar",
    "unpadded.whash_8_haar",
    "unpadded.width_diff",
    "unpadded.width_ratio",
]

# Both queries use the same projection; only the `is_test` filter differs.
projection = base_fields + features_v2

# Documents with is_test == False go to `df`, those with is_test == True to `df_test`.
df = pd.json_normalize(featurestore.find({"is_test": False}, projection))
df_test = pd.json_normalize(featurestore.find({"is_test": True}, projection))


In [10]:
def orb_similarity(row, folder=IMAGES_TRAIN):
    """Score the two images referenced by ``row`` using ORB keypoint matches.

    Both images are read in grayscale from ``folder``, ORB descriptors are
    matched with a FLANN matcher (k=2), and a match pair counts as "good"
    when the first match's distance is below 0.7 times the second's.

    Args:
        row: Mapping-like record with ``"image_url1"`` and ``"image_url2"``
            entries, each joined onto ``folder`` to form the image path.
        folder: Directory containing the images; defaults to ``IMAGES_TRAIN``.

    Returns:
        ``2 * good_matches / (keypoints_1 + keypoints_2)``, or ``0`` if any
        exception is raised while reading, detecting, matching or dividing.
    """
    # Kept as a function-local import, exactly as in the original cell.
    import cv2

    detector = cv2.ORB_create()
    lsh_algorithm = 6  # value the original named FLANN_INDEX_LSH
    matcher = cv2.FlannBasedMatcher(
        {
            "algorithm": lsh_algorithm,
            "table_number": 6,
            "key_size": 12,
            "multi_probe_level": 1,
        },
        {"checks": 50},
    )

    try:
        left_img = cv2.imread(str(folder / row["image_url1"]), cv2.IMREAD_GRAYSCALE)
        right_img = cv2.imread(str(folder / row["image_url2"]), cv2.IMREAD_GRAYSCALE)
        left_kp, left_desc = detector.detectAndCompute(left_img, None)
        right_kp, right_desc = detector.detectAndCompute(right_img, None)

        candidates = matcher.knnMatch(left_desc, right_desc, k=2)

        # Entries that do not hold exactly two matches are skipped (the
        # original ignored them via a failed tuple unpack).
        good_total = sum(
            1
            for pair in candidates
            if len(pair) == 2 and pair[0].distance < 0.7 * pair[1].distance
        )

        return 2 * good_total / (len(left_kp) + len(right_kp))
    except Exception:
        # Best-effort: any failure above yields a score of 0.
        return 0

In [13]:
# Train rows use orb_similarity's default folder; test rows read from IMAGES_TEST.
train_scores = df.parallel_apply(orb_similarity, axis=1)
df.loc[df.index, "unpadded.orb_similarity"] = train_scores

test_scores = df_test.parallel_apply(orb_similarity, axis=1, folder=IMAGES_TEST)
df_test.loc[df_test.index, "unpadded.orb_similarity"] = test_scores

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7554), Label(value='0 / 7554'))), …

In [None]:
def write(coll, df, operator="$set"):
    """Upsert every row of ``df`` into ``coll``, keyed by ``_id``.

    Each row becomes one ``UpdateOne`` with ``upsert=True`` whose filter is
    ``{"_id": row["_id"]}`` and whose update document is
    ``{operator: <row fields>}``. Fields holding a scalar missing value
    (``None``/``NaN``/``NaT``) are left out of the update; list-like values
    are always kept.

    Args:
        coll: Collection-like object exposing ``bulk_write(requests)``.
        df: DataFrame with an ``_id`` column; column names become field names.
        operator: Update operator applied to each row's fields.

    Returns:
        Whatever ``coll.bulk_write`` returns, or ``None`` when ``df`` has no
        rows (no request is sent in that case).

    Raises:
        KeyError: If a row has no non-missing ``_id``.
    """

    def _is_missing(value):
        # pd.isna returns an array for list-like input, which cannot be used
        # as a boolean; only scalar cells can count as missing.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    upds = []
    for record in df.to_dict("records"):
        fields = {k: v for k, v in record.items() if not _is_missing(v)}
        upds.append(UpdateOne({"_id": fields["_id"]}, {operator: fields}, upsert=True))

    if not upds:
        # pymongo's bulk_write raises on an empty request list, so skip the call.
        return None
    return coll.bulk_write(upds)

In [None]:
# Tag each frame with the split it was loaded from (`df` came from the
# is_test == False query, `df_test` from is_test == True) before merging.
# `is_same` is deliberately left untouched: it was loaded from the feature
# store, and df_res is written back to that same collection, so assigning
# constants to it would overwrite the stored values.
df["is_test"] = False
df_test["is_test"] = True

df_res = pd.concat([df, df_test])

In [None]:
# Only `unpadded.orb_similarity` is computed in this notebook; every other
# column was read from this same collection. Write back just the key and the
# new column so the stored fields are not rewritten.
mc = MongoClient(MONGO_SERVER_URL)
write(mc.csc.featurestore, df_res[["_id", "unpadded.orb_similarity"]])