In [2]:
import faiss
import os
import numpy as np
import pandas as pd

import torch
from torch import Tensor
from torchvision import models

from torchvision.transforms import Compose, transforms
from PIL import Image
import cv2
import sqlite3

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


### Load Model

In [36]:
# Pretrained ResNet-50 used purely as a frozen feature extractor.
model = models.resnet50(pretrained=True, progress=False)
# Freeze every parameter; no training happens in this notebook.
model.requires_grad_(False)
# Swap the final fully connected layer for an identity so the forward pass
# returns whatever was fed into that layer instead of class scores.
model.fc = torch.nn.Identity()
model.to(device)
model.eval()
# Printing an empty string keeps the cell from echoing the model repr.
print('', end='')



In [37]:
def transform(images: np.ndarray):
    """Convert one image array into a batched tensor.

    The last axis of ``images`` is reversed (the caller passes the result of
    ``cv2.imread``, which is BGR, while PIL expects RGB), the image is run
    through ``transforms.ToTensor()``, and a leading batch axis is added.

    NOTE(review): no resizing or mean/std normalisation is applied here;
    confirm this matches how the indexed features were produced.
    """
    to_tensor = Compose([transforms.ToTensor()])
    rgb_image = Image.fromarray(images[:, :, ::-1])
    return to_tensor(rgb_image).unsqueeze(0)
def as_numpy(val: Tensor) -> np.ndarray:
    """Return ``val`` as a NumPy array, detached from autograd and moved to the CPU."""
    detached = val.detach()
    on_cpu = detached.cpu()
    return on_cpu.numpy()

In [38]:
def model_output(image_path):
    """Run one image file through ``model`` and return the result as NumPy.

    Args:
        image_path: Path of the image, read with ``cv2.imread``.

    Returns:
        NumPy array holding the output of ``model`` for a batch that contains
        only this image.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising; fail here with a message that names the file.
        raise ValueError(f"could not read image: {image_path}")
    # transform() already adds the batch axis, so the tensor is
    # (1, channels, height, width) with values in [0, 1].
    batch = transform(img).to(device)
    with torch.no_grad():
        return as_numpy(model(batch))

In [39]:
# Directory holding the saved artefacts (FAISS index and catalog database).
SAVES_DIR = "/nethome/kravicha3/aryan/project/notebooks/redditdataset/saves"
INDEX_PATH = os.path.join(SAVES_DIR, "HNSW_dataindex.index")
CATALOG_DB_PATH = os.path.join(SAVES_DIR, "eva_catalog.db")

index = faiss.read_index(INDEX_PATH)
# Number of nearest neighbours requested per query image.
k = 10

# Keep the connection handle so it can be closed explicitly (conn.close())
# once the lookups are finished; `c` is the cursor used for the queries.
conn = sqlite3.connect(CATALOG_DB_PATH)
c = conn.cursor()

In [40]:
def get_similarity_results(image_path):
    """Look up the catalog entries of the nearest neighbours of an image.

    The image is embedded with ``model_output``, the global ``index`` is
    searched for ``k`` neighbours, and each neighbour id is resolved through
    the catalog table.

    Args:
        image_path: Path of the query image.

    Returns:
        List with the second column of the catalog row of each neighbour, in
        the order returned by the index search (the displayed results suggest
        these are image file paths).

    Raises:
        IndexError: If a returned neighbour id has no row in the catalog table.
    """
    output = model_output(image_path)
    # Only the neighbour ids are needed; the distances are discarded.
    _, neighbour_ids = index.search(output, k)
    results = []
    for row_id in neighbour_ids[0]:
        # sqlite3 cannot bind numpy integer types, so convert to a plain int.
        row_id = int(row_id)
        if row_id < 0:
            # FAISS pads the id list with -1 when fewer than k neighbours exist.
            continue
        # Bind the id as a parameter instead of formatting it into the SQL text.
        c.execute(
            "SELECT * FROM '192111ccbbbfc5042415841dfaa9f90a' WHERE _row_id=?",
            (row_id,),
        )
        row = c.fetchone()
        if row is None:
            raise IndexError(f"no catalog row for _row_id={row_id}")
        results.append(row[1])
    return results

In [6]:
# Root directory of the Reddit provenance dataset (trailing slash kept on purpose:
# other cells build paths by string concatenation).
REDDIT_DATA_HOME = "/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/"
# Names of everything directly under the dataset root; each is treated as one thread.
reddit_threads = os.listdir(REDDIT_DATA_HOME)

In [42]:
def get_examples():
    """Summarise nearest-neighbour results for the images of one thread directory.

    For every ``.jpg``/``.png`` file in the directory, the neighbours are
    fetched with ``get_similarity_results`` and the number of neighbours whose
    entry contains the directory path is counted.

    NOTE(review): the ``break`` stops after the first name in
    ``reddit_threads``; confirm that sampling a single thread is intended.

    Returns:
        DataFrame with columns ``image_name``, ``image_dir``, ``num_similar``
        and ``results`` (one row per image that could be processed).
    """
    columns = ['image_name', 'image_dir', 'num_similar', 'results']
    # Collect plain records and build the frame once; growing a DataFrame row
    # by row copies it on every insertion.
    records = []
    for dir_name in reddit_threads:
        img_dir = os.path.join(REDDIT_DATA_HOME, dir_name)
        for file in os.listdir(img_dir):
            if not file.endswith((".jpg", ".png")):
                continue
            fp = os.path.join(img_dir, file)
            try:
                result = get_similarity_results(fp)
            except Exception as exc:
                # Best effort: skip images that cannot be processed, but say so
                # and let KeyboardInterrupt/SystemExit propagate.
                print(f"skipping {fp}: {exc!r}")
                continue
            # A neighbour counts as "similar" when its entry contains this
            # thread's directory path.
            num_sim = sum(1 for name in result if img_dir in name)
            records.append({
                'image_name': file,
                'image_dir': img_dir,
                'num_similar': num_sim,
                'results': result,
            })
        break
    return pd.DataFrame(records, columns=columns)

In [43]:
# Build and display the neighbour summary returned by get_examples().
get_examples()

Unnamed: 0,image_name,image_dir,num_similar,results
0,g1327_czcqbl6.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
1,g1327_czcu1y7.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,3,[/nethome/kravicha3/aryan/project/dataset/Redd...
2,g1327_czd2m0n.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
3,g1327_czcrc83.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
4,g1327_czd40us.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,2,[/nethome/kravicha3/aryan/project/dataset/Redd...
5,g1327_czcsosm.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
6,g1327_czd0lca.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
7,g1327_czcy9ku.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
8,g1327_czclqxh.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
9,g1327_czcwgnr.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,0,[/nethome/kravicha3/aryan/project/dataset/Redd...


In [50]:
# Number of entries in the first listed thread directory.
len(os.listdir(os.path.join(REDDIT_DATA_HOME, reddit_threads[0])))

60

## Check DataFrame result
from mining_result.py

In [57]:
def find_ends_with(start_path = '.'):
    """Collect the distinct lower-cased file extensions found under ``start_path``.

    The tree is traversed with ``os.walk``; each extension is whatever
    ``os.path.splitext`` returns for the file (leading dot included, and the
    empty string for files without an extension).
    """
    return {
        os.path.splitext(os.path.join(root, filename))[-1].lower()
        for root, _subdirs, filenames in os.walk(start_path)
        for filename in filenames
    }

In [58]:
# List the distinct file extensions present under the dataset root.
find_ends_with(REDDIT_DATA_HOME)

{'.jpg', '.json', '.png'}

In [51]:
# Load the pickled results frame from the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('hnsw_all_image_results.pkl')

In [52]:
# Display the frame loaded from 'hnsw_all_image_results.pkl'.
df

Unnamed: 0,image_name,image_dir,num_similar,results
0,g1327_czcqbl6.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
1,g1327_czcu1y7.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,3,[/nethome/kravicha3/aryan/project/dataset/Redd...
2,g1327_czd40us.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,2,[/nethome/kravicha3/aryan/project/dataset/Redd...
3,g1327_czcsosm.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
4,g1327_czcy9ku.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
5,g1327_czcwgnr.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,0,[/nethome/kravicha3/aryan/project/dataset/Redd...
6,g1327_czcmuq8.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,0,[/nethome/kravicha3/aryan/project/dataset/Redd...
7,g1327_czdh9id.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,2,[/nethome/kravicha3/aryan/project/dataset/Redd...
8,g1327_czcthc7.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,0,[/nethome/kravicha3/aryan/project/dataset/Redd...
9,g1327_czcqur2.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...


In [53]:
import gc
# Collect unreachable Python objects first, so that any CUDA tensors they hold
# are handed back to PyTorch's caching allocator ...
gc.collect()
# ... and only then release the allocator's unused cached blocks to the driver.
torch.cuda.empty_cache()

1364

In [3]:
!nvidia-smi

Tue Mar  7 01:41:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P6000        On   | 00000000:02:00.0 Off |                  Off |
| 26%   52C    P0    61W / 250W |   1077MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro P6000        On   | 00000000:81:00.0 Off |                  Off |
| 26%   58C    P0    63W / 250W |    891MiB / 24576MiB |      0%      Default |
|       

In [12]:
# Find the thread directory with the most entries (first one wins on ties).
max1, oh = 0, 0
for thread_name in reddit_threads:
    n_entries = len(os.listdir(os.path.join(REDDIT_DATA_HOME, thread_name)))
    if n_entries > max1:
        max1, oh = n_entries, thread_name
print(max1, oh)

87 _This_picture_of_Hillary_Clinton_and_Barack_Obama


### Looking at results

In [57]:
# Reload the pickled results frame from the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('hnsw_all_image_results.pkl')

In [58]:
# Display the reloaded results frame.
df

Unnamed: 0,image_name,image_dir,results,image_features
0,g1327_czcqbl6.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[1, 8887, 1415, 2028, 8885, 9459, 4633, 8590, ...","[[0.7295329, 0.30392087, 0.7336758, 0.41025618..."
1,g1327_czcu1y7.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[2, 12, 4042, 2610, 7813, 7462, 9731, 5972, 11...","[[0.12240556, 0.14182492, 0.24656458, 0.213859..."
2,g1327_czd2m0n.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[3, 13, 4, 21, 26, 20, 9, 22, 50, 24, 54, 43, ...","[[0.10773073, 0.100702815, 0.010436808, 0.1696..."
3,g1327_czcrc83.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[4, 21, 13, 26, 20, 9, 3, 22, 50, 24, 54, 43, ...","[[0.09970431, 0.1035891, 0.0134712625, 0.17500..."
4,g1327_czd40us.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[5, 23, 4395, 7184, 4544, 7193, 7187, 7223, 72...","[[0.08443671, 0.16196343, 0.0846229, 0.1778144..."
...,...,...,...,...
10121,g1401_d4qbiny.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9797, 9784, 9793, 9785, 9786, 9800, 9788, 975...","[[0.27853826, 0.2937213, 0.50375164, 0.2408193..."
10122,g1401_d4qp62i.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9798, 4438, 4433, 4441, 4478, 4244, 4444, 735...","[[0.21785916, 0.5924403, 0.99710155, 0.2347397..."
10123,g1401_d4qm73e.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9799, 9788, 9786, 9800, 9758, 9787, 9801, 979...","[[0.22710375, 0.29315758, 0.43143603, 0.104876..."
10124,g1401_d4rfii4.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9786, 9800, 9788, 9758, 9793, 9787, 9801, 979...","[[0.21410389, 0.2253808, 0.5778347, 0.13573372..."


In [12]:
# Select the 'results' entries of the rows whose image_name is g1327_czd2m0n.jpg.
x = df.loc[df["image_name"] == "g1327_czd2m0n.jpg", "results"]

In [16]:
# Print every neighbour entry of the first selected row on its own line.
for neighbour in x.values[0]:
    print(neighbour)

/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_czd2m0n.jpg
/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_czcnbuc.png
/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_czcrc83.png
/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_root.jpg
/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_czcp79h.jpg
/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_czcmzll.png
/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_czclqxh.png
/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/_This_cat_plotting_to_kill_someone/g1327_czcrzkd

In [9]:
# Display the current contents of df.
# NOTE(review): the execution count suggests this cell was run in a different
# session/order than the cells around it, so which frame is shown depends on
# kernel state.
df

Unnamed: 0,image_name,image_dir,num_similar,results
0,g1327_czcqbl6.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
1,g1327_czcu1y7.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,3,[/nethome/kravicha3/aryan/project/dataset/Redd...
2,g1327_czd2m0n.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
3,g1327_czcrc83.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
4,g1327_czd40us.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,2,[/nethome/kravicha3/aryan/project/dataset/Redd...
...,...,...,...,...
9864,g1401_d4qbiny.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
9865,g1401_d4qp62i.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,1,[/nethome/kravicha3/aryan/project/dataset/Redd...
9866,g1401_d4qm73e.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...
9867,g1401_d4rfii4.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,10,[/nethome/kravicha3/aryan/project/dataset/Redd...


In [66]:
# Stack the per-row 'image_features' entries into a single NumPy array.
features = np.array(df['image_features'].to_list())

In [67]:
# Inspect the shape of the stacked feature array.
features.shape

(10126, 1, 2048)

In [69]:
# Drop the singleton middle axis: (n, 1, dim) -> (n, dim).  The sizes are read
# from the array itself so the cell keeps working when the pickle changes.
feat2 = features.reshape(features.shape[0], features.shape[2])

In [78]:
# Sanity check: dropping the singleton middle axis must not change any value.
# Done as one vectorised comparison so that no global names are rebound
# (notably `k`, the neighbour count used by get_similarity_results()).
if not np.array_equal(features[:, 0], feat2):
    print("false")

### Running clustering algorithms

In [86]:
# features from the dataframe
df = pd.read_pickle('hnsw_all_image_results.pkl')
features = np.array(df['image_features'].to_list())
features = features.reshape(features.shape[0], features.shape[2])
# cluster training
ncentroids = 256
niter = 50
verbose = True
d = features.shape[1]
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, gpu=True)
kmeans.train(features)


Clustering 10126 points in 2048D to 256 clusters, redo 1 times, 50 iterations
  Preprocessing in 0.01 s
  Iteration 49 (8.31 s, search 8.05 s): objective=331778 imbalance=1.505 nsplit=0       

331777.6875

In [87]:
# then add cluster_id to dataframe
D, I = kmeans.index.search(features, 1)
I.reshape(I.shape[0])
df['cluster_id'] = I

In [88]:
# Display df, which now carries the cluster_id column.
df

Unnamed: 0,image_name,image_dir,results,image_features,cluster_id
0,g1327_czcqbl6.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[1, 8887, 1415, 2028, 8885, 9459, 4633, 8590, ...","[[0.7295329, 0.30392087, 0.7336758, 0.41025618...",151
1,g1327_czcu1y7.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[2, 12, 4042, 2610, 7813, 7462, 9731, 5972, 11...","[[0.12240556, 0.14182492, 0.24656458, 0.213859...",13
2,g1327_czd2m0n.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[3, 13, 4, 21, 26, 20, 9, 22, 50, 24, 54, 43, ...","[[0.10773073, 0.100702815, 0.010436808, 0.1696...",69
3,g1327_czcrc83.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[4, 21, 13, 26, 20, 9, 3, 22, 50, 24, 54, 43, ...","[[0.09970431, 0.1035891, 0.0134712625, 0.17500...",69
4,g1327_czd40us.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[5, 23, 4395, 7184, 4544, 7193, 7187, 7223, 72...","[[0.08443671, 0.16196343, 0.0846229, 0.1778144...",154
...,...,...,...,...,...
10121,g1401_d4qbiny.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9797, 9784, 9793, 9785, 9786, 9800, 9788, 975...","[[0.27853826, 0.2937213, 0.50375164, 0.2408193...",105
10122,g1401_d4qp62i.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9798, 4438, 4433, 4441, 4478, 4244, 4444, 735...","[[0.21785916, 0.5924403, 0.99710155, 0.2347397...",26
10123,g1401_d4qm73e.png,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9799, 9788, 9786, 9800, 9758, 9787, 9801, 979...","[[0.22710375, 0.29315758, 0.43143603, 0.104876...",61
10124,g1401_d4rfii4.jpg,/nethome/kravicha3/aryan/project/dataset/Reddi...,"[9786, 9800, 9788, 9758, 9793, 9787, 9801, 979...","[[0.21410389, 0.2253808, 0.5778347, 0.13573372...",61


In [89]:
# Persist the frame (including cluster_id) to the current working directory.
df.to_pickle('hnsw_all_cluster.pkl')

In [None]:
# TODO: find which cluster belongs to which thread
#   - a single thread may span multiple clusters
#   - a cluster may contain images from multiple threads
# TODO: based on that mapping, compute the accuracy of the model

In [18]:
# Load the saved feature array.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code; only use it on trusted files.
# NOTE(review): `x` is also used earlier for a 'results' selection; this rebinds it.
x = np.load('/nethome/kravicha3/aryan/project/notebooks/redditdataset/saves/features_array.np', allow_pickle=True)

In [20]:
# Inspect the shape of the loaded feature array.
x.shape

(9869, 2048)

In [23]:
# Train k-means on the feature array loaded from disk (same settings as the
# DataFrame-based clustering cell).
ncentroids = 256  # number of k-means clusters
niter = 50  # number of k-means iterations
verbose = True
d = x.shape[1]  # dimensionality of one feature vector
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, gpu=True)
kmeans.train(x)




Clustering 9869 points in 2048D to 256 clusters, redo 1 times, 50 iterations
  Preprocessing in 0.03 s
  Iteration 49 (8.16 s, search 7.90 s): objective=313386 imbalance=1.560 nsplit=0       

313386.3125

In [30]:
# Assign every feature vector to its nearest centroid: I holds the centroid
# ids and D the corresponding distances, one column each.
D, I = kmeans.index.search(x, 1)

In [82]:
# View the centroid ids as a flat 1-D array (display only; I itself is unchanged).
I.reshape(I.shape[0])

array([ 37, 186, 102, ...,  28,  28,  28])

In [79]:
# Print "(centroid id, distance)" for every row of the search result.
for label_row, distance_row in zip(I, D):
    print(f"({label_row[0]}, {distance_row[0]})")

(37, 28.934616088867188)
(186, 23.01739501953125)
(102, 6.7139892578125)
(102, 5.9949493408203125)
(190, 17.715118408203125)
(210, 19.948638916015625)
(102, 17.25518798828125)
(135, 33.10516357421875)
(102, 5.396942138671875)
(137, 56.70831298828125)
(38, 29.226837158203125)
(186, 21.403472900390625)
(102, 5.92901611328125)
(217, 63.944427490234375)
(186, 34.26129150390625)
(105, 48.89599609375)
(60, 39.402008056640625)
(201, 40.42759704589844)
(60, 47.12005615234375)
(102, 5.56390380859375)
(102, 5.998504638671875)
(102, 5.34417724609375)
(190, 20.706085205078125)
(102, 5.1426849365234375)
(189, 35.835693359375)
(102, 6.2732086181640625)
(210, 58.8089599609375)
(222, 26.52178955078125)
(33, 33.359832763671875)
(210, 26.71417236328125)
(158, 126.894287109375)
(74, 53.61102294921875)
(186, 21.688995361328125)
(173, 14.33984375)
(8, 53.834564208984375)
(8, 25.3304443359375)
(102, 23.531158447265625)
(157, 52.177520751953125)
(95, 52.985260009765625)
(51, 30.589019775390625)
(149, 47.5508

In [45]:
# Root directory of the Reddit provenance dataset (trailing slash kept on purpose:
# other cells build paths by string concatenation).
REDDIT_DATA_HOME = "/nethome/kravicha3/aryan/project/dataset/Reddit_Provenance_Datasets/data/"
# Names of everything directly under the dataset root; each is treated as one thread.
reddit_threads = os.listdir(REDDIT_DATA_HOME)

In [None]:
# Empty frame with the columns intended for the per-image cluster results.
cluster_df = pd.DataFrame(columns=["dir_path", "img_name", "part_of_same_thread", "cluster_dir_path"])
def get_cluter_results(image_path):
    """Fetch the catalog entry for the first id in every row of the global ``I``.

    NOTE(review): ``image_path`` is accepted but never used, and ``I`` is
    whatever the most recently executed search cell left behind; this helper
    looks unfinished, so confirm the intended behaviour.

    Args:
        image_path: Currently unused.

    Returns:
        List with the second column of the catalog row looked up for the
        first id of each row of ``I``.

    Raises:
        IndexError: If an id has no row in the catalog table.
    """
    # The accumulator must exist before the first append.
    results = []
    for id_row in I:
        # sqlite3 cannot bind numpy integer types, so convert to a plain int.
        row_id = int(id_row[0])
        # Bind the id as a parameter instead of formatting it into the SQL text.
        c.execute(
            "SELECT * FROM '192111ccbbbfc5042415841dfaa9f90a' WHERE _row_id=?",
            (row_id,),
        )
        row = c.fetchone()
        if row is None:
            raise IndexError(f"no catalog row for _row_id={row_id}")
        results.append(row[1])
    return results