In [None]:
import importlib
import pandas as pd
import numpy as np
import os, re, shutil

import imagecluster as ic
import phashlib as ph

import common as co
import imagecluster as ic

from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib
% matplotlib inline

In [None]:
# Re-import the project-local helper modules so that edits made to them on
# disk are picked up without restarting the kernel.
for _module in (ic, ph, co):
    importlib.reload(_module)

In [None]:
# Notebook-wide configuration; later cells read these names as globals.
# Sub-directory (below `imagedir`) that holds the fingerprint database and
# the cluster output.
ic_base_dir = 'imagecluster'
# Name of the Keras model that is handed to `ph.get_model` / `ph.fingerprints`.
modelname = 'ResNet50'
# Edge length in pixels used for the thumbnails / model input size.
input_size = 224
# NOTE(review): `ic` was already reloaded in the previous cell; this second
# reload looks redundant.
importlib.reload(ic)

# Root directory of the image collection (relative to the notebook).
imagedir = './data/'

# Feature extraction

In [None]:
def process_dataset(imagedir, modelname = 'ResNet50', input_size = 224):
    """
    Run the feature-extraction pipeline over all files found in `imagedir`.

    Steps performed, in order:

    1 - collects the files (`co.get_files`) and renames them to their
        sha256 hash (`co.rename_files`)
    2 - calculates a perceptual hash per file (`ph.return_phashes`)
    3 - crops the images (`co.crop_images`)
    4 - loads the DNN (`ph.get_model`) and calculates fingerprints and
        labels for the original files
    5 - calculates fingerprints and labels for the cropped files

    Clustering and plotting are NOT done here; they happen in later cells.

    Parameters
    ----------
    imagedir : str
        Directory containing the images to process.
    modelname : str
        Model name forwarded to `ph.get_model` and `ph.fingerprints`.
    input_size : int
        Edge length forwarded to `co.crop_images` and used as the
        `(input_size, input_size)` size for fingerprinting.

    Returns
    -------
    pandas.DataFrame
        Indexed by file hash, with the columns `filename`, `hash`, `phash`,
        `fingerprints`, `labels`, `cropped_fingerprints`, `cropped_labels`
        plus whatever `co.crop_images` adds.
    """
    files = co.get_files(imagedir)
    print("> Renaming {} files (to sha256 hash)".format(len(files)))
    files, hashes = co.rename_files(files, imagedir)
    print("done.")

    # create pandas dataframe containing all data
    df = pd.DataFrame(index=hashes)
    df['filename'] = files
    df['hash'] = hashes

    print("> Phashing {} files".format(len(files)))
    df['phash'] = ph.return_phashes(files)
    print("done.")

    print("> Cropping and copying all images")
    df = co.crop_images(df, imagedir, input_size)
    # reload file list in case some files were corrupt
    files = df['filename']
    print("done.")

    print("> Loading Keras model {}".format(modelname))
    model, getFingerprint = ph.get_model(modelname=modelname)
    print("done.")

    fingerprint_size = (input_size, input_size)

    print("> Running images through DNN {}".format(modelname))
    fps, _, labels = ph.fingerprints(files, model, getFingerprint,
                                     size=fingerprint_size, modelname=modelname)
    df['fingerprints'] = fps
    df['labels'] = labels

    print("> Running CROPPED images through DNN {}".format(modelname))
    # Fingerprint the cropped copies, not the originals again. The column name
    # follows the notebook's own `crop_images`; `co.crop_images` is presumably
    # the same implementation -- TODO confirm.
    if 'cropped_filename' in df:
        cropped_files = df['cropped_filename']
    else:
        print("> WARNING: no 'cropped_filename' column found, "
              "falling back to the uncropped files")
        cropped_files = files
    cfps, _, clabels = ph.fingerprints(cropped_files, model, getFingerprint,
                                       size=fingerprint_size, modelname=modelname)
    df['cropped_fingerprints'] = cfps
    df['cropped_labels'] = clabels

    print("done.")

    return df

In [None]:
# Load the cached fingerprint dataframe if it exists on disk; otherwise build
# it (or reuse a `df` that is still alive in the kernel) and write the cache.
dbfn = os.path.join(imagedir, ic_base_dir, 'db.pk')
if os.path.exists(dbfn):
    print("loading fingerprints database {} ...".format(dbfn))
    # NOTE(review): judging by the name this presumably unpickles the file;
    # only load databases you created yourself.
    df = co.read_pk(dbfn)
    print("done.")
else:
    os.makedirs(os.path.dirname(dbfn), exist_ok=True)
    print("no fingerprints database found in {}".format(dbfn))
    # A dataframe from an earlier run of this cell may still be in memory.
    df_exists = ('df' in locals()) or ('df' in globals())
    if df_exists:
        print("df exists already.")
    else:
        print("Running processing pipeline ...")
        df = process_dataset(imagedir)
    print("writing {}".format(dbfn))
    co.write_pk(df, dbfn)

# Clustering

In [None]:
# Dataframe column whose fingerprints are used for all clustering below.
fingerprint_column = 'cropped_fingerprints'

# Mapping {filename: fingerprint} built from the selected column.
fingerprintdict = df.set_index('filename')[fingerprint_column].to_dict()

## DBSCAN clustering

In [None]:
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

# Split the {filename: fingerprint} mapping into two index-aligned lists:
# files[i] is the image whose fingerprint is dfps[i].
files = list(fingerprintdict)
dfps = [fingerprintdict[name] for name in files]

# Optional dimensionality reduction (currently disabled):
#pca = PCA(n_components=3)
#principalComponents = pca.fit_transform(dfps)

In [None]:
# Precompute the full square matrix of pairwise euclidean distances between
# all fingerprints; it is reused by every DBSCAN fit below.
D = squareform(pdist(dfps, 'euclidean'))

In [None]:
# Cluster on the precomputed distance matrix D. eps=19 is a hand-picked
# neighbourhood radius (see the eps sweep further down); min_samples=2 lets
# two images already form a cluster.
db = DBSCAN(eps=19, metric='precomputed', min_samples=2, n_jobs=-1).fit(D)

In [None]:
# dbscan without precomputed distance matrix
# db = DBSCAN(eps=40, min_samples=2, algorithm='brute', metric='euclidean', n_jobs=-1).fit(dfps)

In [None]:
# One label per fingerprint; DBSCAN marks noise points with the label -1,
# which is not counted as a cluster.
labels = db.labels_
n_clusters_ = len(set(labels) - {-1})
print("{} clusters, {} outliers".format(n_clusters_, int(np.count_nonzero(labels == -1))))

In [None]:
# Process the DBSCAN results so we can make the directory links: one list of
# filenames per distinct label, in ascending label order.
# NOTE(review): the noise label -1 (if present) also becomes a "cluster" here.
clusters_list = [
    [files[i] for i in np.flatnonzero(labels == label)]
    for label in np.unique(labels)
]

# make links
pj = os.path.join
ic.make_links(clusters_list, pj(imagedir, ic_base_dir, 'clusters'))

In [None]:
# Save the DBSCAN label of every image in the dataframe.
# NOTE(review): this assumes the row order of `df` matches the order of
# `files`/`dfps` that `labels` was computed from -- confirm.
df['cluster'] = labels

### Test: find best eps value

In [None]:
# Sweep the DBSCAN radius and record how many clusters each value produces.
# The fit results are kept in sweep-local names on purpose, so that this
# experiment does not overwrite `db`, `labels` and `n_clusters_` from the
# eps=19 run that the rest of the notebook relies on.
epss = range(1, 40)
nclusts = []
for eps in tqdm(epss, total=len(epss)):

    sweep_labels = DBSCAN(eps=eps, metric='precomputed', min_samples=2, n_jobs=-1).fit(D).labels_
    sweep_nclusts = len(set(sweep_labels)) - (1 if -1 in sweep_labels else 0)
    nclusts.append(sweep_nclusts)
    print("eps: {} nclusts: {}".format(eps, sweep_nclusts))

    # Histogram of cluster sizes, ignoring noise (-1) and clusters with
    # 50 or more members so that the small clusters stay visible.
    cluster_ids, cluster_sizes = np.unique(sweep_labels, return_counts=True)
    label_counts = [
        size for cluster_id, size in zip(cluster_ids, cluster_sizes)
        if (cluster_id > -1) and (size < 50)
    ]
    plt.hist(label_counts)
    plt.show()

In [None]:
# Number of clusters found as a function of the DBSCAN radius eps.
plt.plot(epss, nclusts)

## Old clustering algorithm

In [None]:
# Cluster with `ic.cluster` (second argument 0.6) and create the cluster
# links in the same 'clusters' directory that the DBSCAN cell writes to.
#clusters = ic.cluster(fingerprints, 0.6)
ic.make_links(ic.cluster(fingerprintdict, 0.6), os.path.join(imagedir, ic_base_dir, 'clusters'))

### Save results in dataframe

In [None]:
from scipy.cluster import hierarchy
from scipy.spatial import distance

# Similarity setting: the dendrogram is cut at (1 - sim) times the largest
# pairwise distance.
sim = 0.6

fps = df[fingerprint_column]

# Condensed vector of pairwise euclidean distances.
# NOTE: this rebinds `dfps`, which held the plain fingerprint list above.
dfps = distance.pdist(np.array(fps.tolist()), metric='euclidean')

# Average-linkage hierarchical clustering, cut at the distance threshold.
Z = hierarchy.linkage(dfps, method='average', metric='euclidean')
cut = hierarchy.fcluster(Z, criterion='distance', t=(1.0 - sim) * dfps.max())

In [None]:
# Overwrite the 'cluster' column (DBSCAN labels) with the flat cluster ids
# from the hierarchical clustering above.
df['cluster'] = cut
# Persisting the updated dataframe is currently disabled:
#co.write_pk(df, dbfn)

# Visualize clusters

In [None]:
def plotfiles(files, plot = True, filename = '', labels=None):
    """
    Draw the given image files as a grid of thumbnails.

    The grid is as close to square as possible and always has at least
    `len(files)` cells, so no image is dropped. Files that cannot be read
    are reported and their cell is left empty.

    Parameters
    ----------
    files : sequence of str
        Paths of the images to show, in display order (row by row).
    plot : bool
        If True the figure is shown, otherwise it is closed after saving.
    filename : str
        If non-empty, the figure is saved to this path.
    labels : sequence, optional
        `labels[n]` is drawn in the lower-left corner of image `n`.
        May be shorter than `files`; the remaining images get no label.
    """
    if labels is None:
        labels = []

    nfiles = len(files)
    # Near-square grid that is guaranteed to hold all files.
    ncols = max(1, int(np.ceil(np.sqrt(nfiles))))
    nrows = max(1, int(np.ceil(nfiles / ncols)))

    # squeeze=False always yields a 2-D axes array, so a single row or
    # column needs no special handling.
    f, axs = plt.subplots(nrows, ncols, figsize=(ncols*3, nrows*3), dpi=300, squeeze=False)
    for n, ax in enumerate(axs.flat):
        ax.axis('off')
        if n >= nfiles:
            continue
        try:
            img = mpimg.imread(files[n])
        except Exception as err:
            # Best effort: an unreadable file must not abort the whole grid.
            print("could not read {}: {}".format(files[n], err))
            continue
        ax.imshow(img)
        if n < len(labels):
            ax.text(0.05, 0.05, labels[n], transform=ax.transAxes,
                    bbox={'facecolor':'white', 'alpha':0.8, 'pad':2}, fontsize=6)

    plt.tight_layout()
    plt.subplots_adjust(wspace=0, hspace=0)
    if len(filename) > 0:
        plt.savefig(filename)
    if plot:
        plt.show()
    else:
        plt.close(f)
    

In [None]:
# save results on disk as jpgs
# Render one overview image per cluster and save it as a jpg.
# The output directory is wiped first so stale images from earlier runs
# do not survive.
clusterdir = os.path.join(imagedir, ic_base_dir, 'clusters', 'visualization/')
if os.path.exists(clusterdir):
    shutil.rmtree(clusterdir)
os.makedirs(os.path.dirname(clusterdir), exist_ok=True)

clusterlist = list(df['cluster'])
unique_clusters = np.unique(df['cluster'])
cut = df['cluster']

plot = False

for nclust in unique_clusters:
    # -1 is the label for noise in dbscan
    if nclust <= -1:
        continue
    clustersize = clusterlist.count(nclust)
    # skip singletons and very large clusters
    if not (1 < clustersize < 500):
        continue

    print("Cluster {} with {} memebers".format(nclust, clustersize))
    clusterdf = df[df['cluster'] == nclust]

    # NOTE(review): the labels are collected but not passed on to plotfiles.
    labels = list(clusterdf['labels'])

    # file name pattern: <cluster size>_<cluster id>.jpg
    clusterfile = os.path.join(clusterdir, str(clustersize) + '_' + str(nclust) + '.jpg')

    plotfiles(list(clusterdf['filename']), plot=False, filename=clusterfile)
    print("Saving to {}".format(clusterfile))

### Testing: visualization


In [None]:
#files = ['/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/00a38bfafff15902662a0d03c6427bca6770f1ba4936674f2865bf8d87143123.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/1cd7722f5d4ca8de4e9321ae542e1351d062cc5cdc0ca02952e7ca59551406b2.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/26aaf025766038a76e799b40b923ec379228e5aa861080221c160ba702128cd1.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/44579f6bfff124e4de2ba4eb89368bd4d65de31252ffa2683cbf009b5cbe6b40.png', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/7c21827cb33454d280fb99999eff069658ce14ac00a12a7faeb12f32ad988790.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/8e6d237951f8af40836787f4098dfd436bd01f6f54f70a984f9ec12e7167060a.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/bdca8d59b55bfa5d0387d906469ad667e1c3b2e89c7ecdebd8ee197f7dbcf532.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/bdd7263028bf7e7d1daed68f8a06ecafc4e5be5f70b146874886dfb1fd10e5e7.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/d0bdbac22427d292acb6d0f0aaef67f894f345ec98171e778f5a9c212e9cfdbf.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/f1e160315801a95a8d5d6fdf089f7b933ede021efbd7bdb90d7ab81fd31d4c04.jpg']
# Smoke test: plot the first two images with labels and save the figure.
files = df['filename'].tolist()[:2]
plotfiles(files, labels=['hi', 'u'], filename='1.jpg')

In [None]:
#files = ['/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/00a38bfafff15902662a0d03c6427bca6770f1ba4936674f2865bf8d87143123.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/1cd7722f5d4ca8de4e9321ae542e1351d062cc5cdc0ca02952e7ca59551406b2.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/26aaf025766038a76e799b40b923ec379228e5aa861080221c160ba702128cd1.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/44579f6bfff124e4de2ba4eb89368bd4d65de31252ffa2683cbf009b5cbe6b40.png', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/7c21827cb33454d280fb99999eff069658ce14ac00a12a7faeb12f32ad988790.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/8e6d237951f8af40836787f4098dfd436bd01f6f54f70a984f9ec12e7167060a.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/bdca8d59b55bfa5d0387d906469ad667e1c3b2e89c7ecdebd8ee197f7dbcf532.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/bdd7263028bf7e7d1daed68f8a06ecafc4e5be5f70b146874886dfb1fd10e5e7.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/d0bdbac22427d292acb6d0f0aaef67f894f345ec98171e778f5a9c212e9cfdbf.jpg', '/Users/caglar/Downloads/Web/rips/small/imagecluster/clusters/cluster_with_10/cluster_0/f1e160315801a95a8d5d6fdf089f7b933ede021efbd7bdb90d7ab81fd31d4c04.jpg']
# Smoke test: plot every 10th image out of the first 100 as a grid.
files = df['filename'].tolist()[:100:10]
plotfiles(files)

# Test: image preprocessing

In [None]:
def preprocess_image_for_cropping(img):
    """
    Turn a colour image into a 2-D map of per-pixel channel spread.

    The image is standardised (global mean removed, divided by the global
    standard deviation) and then the standard deviation across the channel
    axis (axis 2) is taken for every pixel.

    A completely uniform image has a global standard deviation of zero; the
    division is skipped in that case, so the result is all zeros instead of
    NaN.

    Parameters
    ----------
    img : numpy.ndarray
        Array with at least three dimensions; axis 2 is reduced. The input
        array is not modified.

    Returns
    -------
    numpy.ndarray
        `img` with axis 2 reduced away.
    """
    centered = img - np.mean(img)
    spread = np.std(centered)
    # Guard against 0/0 for uniform images.
    if spread > 0:
        centered = centered / spread
    return np.std(centered, axis=2)

In [None]:
def _median_croplines(img, axis, boxwidth=10):
    """
    Median [first, last] threshold crossing over all cross sections of `img`.

    Every cross section taken along `axis` is smoothed with a box filter of
    width `boxwidth`; the first and last position where it exceeds one tenth
    of the image mean are recorded. Returns the median pair as an int array
    of length 2, or an empty list if fewer than two cross sections crossed
    the threshold.
    """
    box = np.ones(boxwidth) / boxwidth
    # Both are independent of the cross section, so compute them once.
    filter_threshold = np.mean(img) / 10

    croplines = []
    for pos in range(img.shape[axis]):
        # extract the cross section and smooth it
        filterline = np.convolve(np.abs(np.take(img, pos, axis=axis)), box, mode='same')
        # positions where the threshold is exceeded
        threshold_crossings = np.where(filterline > filter_threshold)[0]
        if len(threshold_crossings) > 0:
            croplines.append([threshold_crossings[0], threshold_crossings[-1]])

    if len(croplines) > 1:
        return np.median(np.array(croplines), axis=0).astype(int)
    return []


def get_crop_bbox(img):
    """
    Estimate the bounding box of the content of `img`.

    The image is first reduced to a 2-D map with
    `preprocess_image_for_cropping`; the box borders are the median
    threshold crossings over all rows and over all columns.

    Parameters
    ----------
    img : numpy.ndarray
        Image array as accepted by `preprocess_image_for_cropping`.

    Returns
    -------
    tuple
        `(croplines_x, croplines_y)`: each is an int array `[start, end]`,
        or an empty list when no border could be determined for that axis.
        Callers detect the failure with `len(...) != 2`.
    """
    img = preprocess_image_for_cropping(img)

    # Columns (fixed x) yield crossings along y, rows (fixed y) along x.
    mean_x = _median_croplines(img, axis=1)
    mean_y = _median_croplines(img, axis=0)

    # threshold crossings on y axis are x values to crop and vice versa
    return mean_y, mean_x

In [None]:
def plot_croplines(croplines_x, croplines_y, img):
    """
    Show `img` with the crop rectangle drawn on top of it.

    Nothing is drawn unless both `croplines_x` and `croplines_y` contain
    exactly two entries (start and end of the crop along that axis).
    The four edges use different colours: green and yellow for the two
    horizontal edges, red and blue for the two vertical ones.
    """
    if len(croplines_x) != 2 or len(croplines_y) != 2:
        return

    plt.figure()
    plt.imshow(img)

    # horizontal edges
    for y_pos, colour in zip(croplines_y, ('g', 'y')):
        plt.hlines(y_pos, croplines_x[0], croplines_x[1], color=colour, lw=5)

    # vertical edges
    for x_pos, colour in zip(croplines_x, ('r', 'b')):
        plt.vlines(x_pos, croplines_y[0], croplines_y[1], color=colour, lw=5)

    plt.show()

## Crop images and save

In [None]:
def crop_images(df):
    """
    Crop every image listed in `df` and save the result as a jpg.

    For each row the image in `filename` is opened, shrunk to fit into
    `input_size` x `input_size`, cropped to the box found by
    `get_crop_bbox` and written to `<imagedir>/cropped/<hash>.jpg`. If no
    crop border is found for an axis, the full extent of that axis is kept.

    Relies on the notebook globals `imagedir` and `input_size`, and on
    `Image` being in scope.
    NOTE(review): `Image` is presumably `PIL.Image`; it is not imported in
    the cells visible here -- confirm it is imported before running.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `filename` and `hash`. It is modified in
        place: the column `cropped_filename` is added/filled.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    cropped_folder = os.path.join(imagedir, 'cropped/')
    os.makedirs(cropped_folder, exist_ok=True)
    if 'cropped_filename' not in df:
        df['cropped_filename'] = None
    for file in tqdm(df.index, total=len(df.index)):
        pil_img = Image.open(df.loc[file, 'filename'])
        fhash = df.loc[file, 'hash']
        cropped_fname = os.path.join(cropped_folder, fhash + '.jpg')

        # LANCZOS is the filter formerly exposed as ANTIALIAS (removed in Pillow 10).
        pil_img.thumbnail((input_size, input_size), Image.LANCZOS)
        img = np.array(pil_img)
        croplines_x, croplines_y = get_crop_bbox(img)

        w, h = pil_img.size
        if len(croplines_x) != 2:
            croplines_x = [0, w]
            print("couldn't crop {} in x-axis".format(file))
        if len(croplines_y) != 2:
            croplines_y = [0, h]
            print("couldn't crop {} in y-axis".format(file))

        #plot_croplines(croplines_x, croplines_y, img)
        # (left, upper, right, lower) as plain ints
        crop_box = tuple(int(v) for v in (croplines_x[0], croplines_y[0],
                                          croplines_x[1], croplines_y[1]))
        pil_img = pil_img.crop(crop_box)
        # jpg cannot store alpha/palette modes
        pil_img = pil_img.convert('RGB')

        pil_img.save(cropped_fname)

        # Single .loc call: chained `df.loc[file][col] = ...` would only
        # write to a temporary copy of the row.
        df.loc[file, 'cropped_filename'] = cropped_fname

    return df