In [1]:
import faiss
import os
import numpy as np
import pandas as pd

import torch
from torch import Tensor
from torchvision import models

from torchvision.transforms import Compose, transforms
from PIL import Image
import cv2
import sqlite3

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [2]:
df = pd.read_pickle('hnsw_all_image_results.pkl')

In [16]:
img_features = df['image_features']

img_features = np.array(img_features.to_list())
img_features = img_features.reshape(img_features.shape[0], img_features.shape[2])

In [17]:
img_features.shape

(10126, 2048)

In [18]:
# all image features will be in X
X = img_features

## Trying DBSCAN

In [113]:
from sklearn.cluster import DBSCAN

dbscan_cluster = DBSCAN(eps=3, min_samples=10).fit(X)

In [114]:
dbscan_cluster.labels_

array([ -1,  -1,   0, ..., 105, 105, 105])

In [115]:
max(dbscan_cluster.labels_)

105

In [116]:
df['cluster_id'] = dbscan_cluster.labels_

#### Accuracy

In [117]:
centroid_dataframe = pd.DataFrame(columns=["label", "num_correct", "num_total", "all_labels_list"])
for cluster_id in range(-1, max(dbscan_cluster.labels_)+1):
    # maximum image_dir
    predictions_for_centroid = df[df['cluster_id'] == cluster_id]
    value_counts = predictions_for_centroid['image_dir'].value_counts()
    
    labels = list()
    for row_id in value_counts.index:
        labels.append((os.path.split(row_id)[1], value_counts[row_id]))
    
    centroid_dataframe.loc[len(centroid_dataframe.index)] = [labels[0][0], labels[0][1], sum(value_counts), labels]

In [118]:
centroid_dataframe

Unnamed: 0,label,num_correct,num_total,all_labels_list
0,_Robert_Mugabe_falling_from_his_podium,80,8140,"[(_Robert_Mugabe_falling_from_his_podium, 80),..."
1,_This_cat_plotting_to_kill_someone,14,14,"[(_This_cat_plotting_to_kill_someone, 14)]"
2,_This_happy_family,21,21,"[(_This_happy_family, 21)]"
3,_Trump_trying_to_close_his_pen,36,36,"[(_Trump_trying_to_close_his_pen, 36)]"
4,_Dog_with_Cat_mask,10,10,"[(_Dog_with_Cat_mask, 10)]"
...,...,...,...,...
102,_Japanese_astronaut_looking_through_a_water_bu...,13,13,[(_Japanese_astronaut_looking_through_a_water_...
103,_A_New_Zealand_Lamb,10,10,"[(_A_New_Zealand_Lamb, 10)]"
104,_Ridiculously_photogenic_volleyball_player,14,14,"[(_Ridiculously_photogenic_volleyball_player, ..."
105,_A_guy_throwing_a_frisbee_really_hard,12,12,"[(_A_guy_throwing_a_frisbee_really_hard, 12)]"


## OPTICS

In [124]:
from sklearn.cluster import OPTICS

optic_cluster = OPTICS(n_jobs=50).fit(X)

In [125]:
optic_cluster.labels_

array([ -1,  -1, 178, ...,  66,  66,  66])

In [126]:
df['cluster_id'] = optic_cluster.labels_

In [127]:
max(optic_cluster.labels_)

226

In [128]:
centroid_dataframe = pd.DataFrame(columns=["label", "num_correct", "num_total", "all_labels_list"])
for cluster_id in range(-1, max(dbscan_cluster.labels_)+1):
    # maximum image_dir
    predictions_for_centroid = df[df['cluster_id'] == cluster_id]
    value_counts = predictions_for_centroid['image_dir'].value_counts()
    
    labels = list()
    for row_id in value_counts.index:
        labels.append((os.path.split(row_id)[1], value_counts[row_id]))
    
    centroid_dataframe.loc[len(centroid_dataframe.index)] = [labels[0][0], labels[0][1], sum(value_counts), labels]

In [129]:
centroid_dataframe

Unnamed: 0,label,num_correct,num_total,all_labels_list
0,_Peter_Dinklage_riding_a_scooter,73,7257,"[(_Peter_Dinklage_riding_a_scooter, 73), (_Rob..."
1,_Weightlifter_jumping_up_after_a_successful_lift,13,14,[(_Weightlifter_jumping_up_after_a_successful_...
2,_Headless_hockey_player_and_terrified_referee,8,8,[(_Headless_hockey_player_and_terrified_refere...
3,_This_teacher_from_my_Facebook_feed,5,5,"[(_This_teacher_from_my_Facebook_feed, 5)]"
4,_Obama_wielding_a_lightsaber_in_front_of_the_W...,13,14,[(_Obama_wielding_a_lightsaber_in_front_of_the...
...,...,...,...,...
102,_Albino_squirrel_leaning_on_a_screw,6,6,"[(_Albino_squirrel_leaning_on_a_screw, 6)]"
103,_A_guy_throwing_a_frisbee_really_hard,13,13,"[(_A_guy_throwing_a_frisbee_really_hard, 13)]"
104,_A_guy_throwing_a_frisbee_really_hard,8,8,"[(_A_guy_throwing_a_frisbee_really_hard, 8)]"
105,_2016_Olympic_Ping_Pong,13,13,"[(_2016_Olympic_Ping_Pong, 13)]"


## Trying Model training after PCA

In [130]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100, random_state=22)
pca.fit(X)
pca_x = pca.transform(X)

In [133]:
pca_x.shape

(10126, 100)

In [132]:
# cluster training
ncentroids = 256
niter = 50
verbose = True
d = pca_x.shape[1]
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, gpu=True)
kmeans.train(pca_x)

Clustering 10126 points in 100D to 256 clusters, redo 1 times, 50 iterations
  Preprocessing in 0.00 s
  Iteration 49 (0.70 s, search 0.66 s): objective=257574 imbalance=1.342 nsplit=0       

257574.46875

In [135]:
D, I = kmeans.index.search(pca_x, 1)
I.reshape(I.shape[0])
df['cluster_id'] = I

In [136]:
centroid_dataframe = pd.DataFrame(columns=["label", "num_correct", "num_total", "all_labels_list"])
for cluster_id in range(ncentroids):
    # maximum image_dir
    predictions_for_centroid = df[df['cluster_id'] == cluster_id]
    value_counts = predictions_for_centroid['image_dir'].value_counts()
    
    labels = list()
    for row_id in value_counts.index:
        labels.append((os.path.split(row_id)[1], value_counts[row_id]))
    
    centroid_dataframe.loc[len(centroid_dataframe.index)] = [labels[0][0], labels[0][1], sum(value_counts), labels]
    

In [138]:
sum(centroid_dataframe['num_correct'])/sum(centroid_dataframe['num_total'])

0.3296464546711436

## DBSCAN PCA

In [356]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10, random_state=22)
pca.fit(X)
pca_x = pca.transform(X)

In [414]:
for i in np.arange(1,3,1):
    dbscan_cluster = DBSCAN(eps=1.5, min_samples=i).fit(pca_x)
    num_clusters = max(dbscan_cluster.labels_)
    num_outliers = np.unique(dbscan_cluster.labels_, return_counts=True)[1][0]
    print(f"for min_samples={i}: cluster={num_clusters} and outliers={num_outliers}")
    
    df['cluster_id'] = dbscan_cluster.labels_
    
    centroid_dataframe = pd.DataFrame(columns=["label", "num_correct", "num_total", "all_labels_list"])
    for cluster_id in range(max(dbscan_cluster.labels_)):
        # maximum image_dir
        predictions_for_centroid = df[df['cluster_id'] == cluster_id]
        value_counts = predictions_for_centroid['image_dir'].value_counts()

        labels = list()
        for row_id in value_counts.index:
            labels.append((os.path.split(row_id)[1], value_counts[row_id]))

        centroid_dataframe.loc[len(centroid_dataframe.index)] = [labels[0][0], labels[0][1], sum(value_counts), labels]
        
    print("accuracy = ",sum(centroid_dataframe['num_correct'])/sum(centroid_dataframe['num_total']))

for min_samples=1: cluster=6793 and outliers=1
accuracy =  0.9515061728395062
for min_samples=2: cluster=729 and outliers=6064
accuracy =  0.879064039408867


In [415]:
dbscan_cluster = DBSCAN(eps=1.5, min_samples=1).fit(pca_x)

In [416]:
np.unique(dbscan_cluster.labels_, return_counts=True)[1][0]

1

In [417]:
max(dbscan_cluster.labels_)

6793

In [418]:
dbscan_cluster.labels_

array([   0,    1,    2, ..., 4513, 4513, 4513])

In [419]:
df['cluster_id'] = dbscan_cluster.labels_

In [420]:
centroid_dataframe = pd.DataFrame(columns=["label", "num_correct", "num_total", "all_labels_list"])
for cluster_id in range(max(dbscan_cluster.labels_)):
    # maximum image_dir
    predictions_for_centroid = df[df['cluster_id'] == cluster_id]
    value_counts = predictions_for_centroid['image_dir'].value_counts()
    
    labels = list()
    for row_id in value_counts.index:
        labels.append((os.path.split(row_id)[1], value_counts[row_id]))
    
    centroid_dataframe.loc[len(centroid_dataframe.index)] = [labels[0][0], labels[0][1], sum(value_counts), labels]
    

In [421]:
centroid_dataframe['accuracy'] = centroid_dataframe['num_correct']/centroid_dataframe['num_total']

In [422]:
centroid_dataframe.sort_values(by=['accuracy'], ascending=True).head(20)

Unnamed: 0,label,num_correct,num_total,all_labels_list,accuracy
4247,_Meerkats_under_a_heating_lamp,1,6,"[(_Meerkats_under_a_heating_lamp, 1), (_Kid_in...",0.166667
595,_My_tortoise_trying_to_eat_a_bolt,2,11,"[(_My_tortoise_trying_to_eat_a_bolt, 2), (_Thi...",0.181818
66,_This_happy_family,1,5,"[(_This_happy_family, 1), (_This_guy_dancing, ...",0.2
2548,_Ewan_McGregor_on_a_greenscreen_set_in_a_green...,1,4,[(_Ewan_McGregor_on_a_greenscreen_set_in_a_gre...,0.25
2161,_RDJ_in_a_bunny_costume,1,4,"[(_RDJ_in_a_bunny_costume, 1), (_Peter_Dinklag...",0.25
427,_Dodging_a_ball,1,4,"[(_Dodging_a_ball, 1), (_This_eggplant_with_ar...",0.25
808,_Keanu_Reeves_sitting_awkwardly_on_a_rock_in_t...,1,4,[(_Keanu_Reeves_sitting_awkwardly_on_a_rock_in...,0.25
1466,_Jennifer_Lawrence_playing_basketball,1,4,"[(_Jennifer_Lawrence_playing_basketball, 1), (...",0.25
270,"_This_Cat,_Drinking_From_a_Bird_Bath",1,4,"[(_This_Cat,_Drinking_From_a_Bird_Bath, 1), (_...",0.25
910,_A_basket_ball_that_has_been_sprayed_with_Vant...,1,3,[(_A_basket_ball_that_has_been_sprayed_with_Va...,0.333333


In [423]:
centroid_dataframe[centroid_dataframe['num_total']>50]

Unnamed: 0,label,num_correct,num_total,all_labels_list,accuracy
1195,_Canadian_PM_Justin_Trudeau_hesitating_to_shak...,58,113,[(_Canadian_PM_Justin_Trudeau_hesitating_to_sh...,0.513274
1305,_This_picture_of_Hillary_Clinton_and_Barack_Obama,36,55,[(_This_picture_of_Hillary_Clinton_and_Barack_...,0.654545


In [424]:
sum(centroid_dataframe['num_correct'])/sum(centroid_dataframe['num_total'])

0.9515061728395062