In [None]:
import threading
import queue
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import torch
import torchvision
from torchvision import transforms
# from datasets import load_dataset
import time
import random
import math
import os

In [None]:
# %pip install -U datasets -y
# %pip uninstall fsspec 
# %pip install fsspec==2023.9.2 -y 

In [None]:
# Identifier of the dataset this notebook works on.
dataset_name = "cifar10"

In [None]:
# Load the CIFAR-10 dataset.
# `load_dataset` is imported in this cell because the top-level import of it is
# commented out; without it the call below raises NameError.
from datasets import config, load_dataset

# NOTE(review): meant to disable caching temporarily -- confirm that the installed
# `datasets` version actually honours this attribute.
config.cache_dir = None

# Later cells index the loaded object as `dataset[...]`, so bind that name here.
dataset = load_dataset(dataset_name)
cifar10 = dataset  # alias kept for code that still uses the old name

# Image preprocessing applied by `extract_features`: resize, convert to a tensor,
# then normalize each channel with the given mean/std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


In [None]:
# Show the shape reported by the loaded dataset object.
print(dataset.shape)

In [None]:
def extract_features(img, resnet):
    """
    Run one image through ``resnet`` and return the output as a flat array.

        :param img: Image accepted by the module-level ``transform`` pipeline
        :param resnet: Model that is called on the preprocessed, batched image
        :return: 1D numpy array holding the model output
    """
    # Preprocess with the module-level pipeline and add a leading batch axis
    batch = transform(img).unsqueeze(0)

    # Forward pass only; gradient tracking is switched off
    with torch.no_grad():
        output = resnet(batch)

    # Drop singleton axes, hand over to numpy and collapse to one dimension
    return output.squeeze().numpy().flatten()

In [None]:
def extract_features_threaded_worker(img_queue, index_list, features_list, label_list, features_lock, model, event):
    """
    Worker loop: take ``(index, image, label)`` items from ``img_queue``, extract
    features with ``model`` and append the results to the shared lists.

    The loop never blocks indefinitely on the queue. Checking ``empty()`` and then
    calling a blocking ``get()`` lets two workers race for the last item, leaving
    the loser stuck forever; here every ``get`` is either non-blocking or bounded
    by a timeout.

    :param img_queue: ``queue.Queue`` holding ``(index, image, label)`` tuples
    :param index_list: Shared list receiving the index of each processed item
    :param features_list: Shared list receiving the extracted feature arrays
    :param label_list: Shared list receiving the labels
    :param features_lock: Lock guarding the three shared lists
    :param model: Model handed to ``extract_features``
    :param event: ``threading.Event`` the producer sets once everything is enqueued
    """
    poll_seconds = 0.1  # how long to wait for work before re-checking `event`
    processed_images = 0
    thread_id = threading.current_thread().ident

    while True:
        # Read the flag BEFORE touching the queue: if the producer had already
        # finished and the queue then turns out to be empty, no item can arrive
        # afterwards, so it is safe to stop.
        producer_done = event.is_set()
        try:
            if producer_done:
                index, img, label = img_queue.get_nowait()
            else:
                index, img, label = img_queue.get(timeout=poll_seconds)
        except queue.Empty:
            if producer_done:
                break
            continue

        # Extract the features from the image
        img_features = extract_features(img, model)

        # Append all three values under the lock so the lists stay aligned;
        # `with` releases the lock even if an append raises.
        with features_lock:
            features_list.append(img_features)
            index_list.append(index)
            label_list.append(label)

        processed_images += 1

        # Progress report every 1000 images handled by this thread
        if processed_images % 1000 == 0:
            print(f"Thread ID: {thread_id} | Processed images: {processed_images}")

    # Final per-thread summary
    print(f"Thread ID: {thread_id} | Processed images: {processed_images}")

In [None]:


def extract_features_resnet_threaded_cifar(cifar_dataset, num_threads=4 ):
    """
    Extract ResNet18 features for every item of ``cifar_dataset`` using worker threads.

    :param cifar_dataset: Sized iterable of mappings with ``"img"`` and ``"label"`` keys
    :param num_threads: Number of worker threads (each gets its own model copy)
    :return: DataFrame with ``Index``, ``Label`` and ``Features`` columns; rows are in
             the order the workers finished them, not in dataset order
    """
    # Set once every image has been enqueued so workers know when to stop
    event = threading.Event()

    num_imgs = len(cifar_dataset)

    # Queue feeding (index, image, label) tuples to the workers
    img_queue = queue.Queue()

    # Shared result lists, guarded by `features_lock`
    features_list = []
    label_list = []
    index_list = []
    features_lock = threading.Lock()

    # One model copy per thread. nn.Module instances start in training mode, where
    # BatchNorm normalizes with the statistics of the current (single-image) batch;
    # eval() makes it use the stored running statistics instead.
    models = []
    for _ in range(num_threads):
        model = torch.hub.load('pytorch/vision:v0.11.3', 'resnet18', pretrained=True)
        model.eval()
        models.append(model)

    # Start the threads
    threads = []
    for model in models:
        thread = threading.Thread(target=extract_features_threaded_worker, args=(img_queue, index_list, features_list, label_list, features_lock, model, event))
        thread.start()
        threads.append(thread)

    try:
        # Enqueue all the image data; `total` gives tqdm a real progress bar
        for i, img in tqdm(enumerate(cifar_dataset), total=num_imgs):
            img_queue.put((i, img["img"], img["label"]))
    finally:
        # Always signal completion, even if enqueueing failed, so that workers
        # waiting on the event are released instead of hanging forever.
        event.set()

    # Wait for all of the threads to finish
    for thread in threads:
        thread.join()

    # Create a DataFrame from the extracted features
    data = {'Index': index_list, 'Label': label_list, 'Features': features_list}
    return pd.DataFrame(data)


In [None]:
def get_features_dataset(dataset, model):
    """
    Extract features for every item of ``dataset`` sequentially.

    :param dataset: Sized iterable of mappings with ``"img"`` and ``"label"`` keys
    :param model: Model handed to ``extract_features``
    :return: DataFrame with ``Index``, ``Label`` and ``Features`` columns, one row
             per item, in dataset order
    """
    num_imgs = len(dataset)

    # Parallel result lists, one entry per image
    features_list = []
    label_list = []
    index_list = []

    # Passing `total` lets tqdm show a percentage and ETA; enumerate() alone
    # hides the length from it.
    for i, img in tqdm(enumerate(dataset), total=num_imgs):
        features_list.append(extract_features(img["img"], model))
        label_list.append(img["label"])
        index_list.append(i)
        # Wall-clock timestamp after every 1000th image
        if i % 1000 == 999:
            print(time.strftime('%X'))

    # Create a DataFrame from the extracted features
    data = {'Index': index_list, 'Label': label_list, 'Features': features_list}
    return pd.DataFrame(data)


In [None]:
def main(dataset):
    """
    Extract ResNet18 features for the ``"train"`` split of ``dataset``.

    :param dataset: Mapping with a ``"train"`` entry accepted by ``get_features_dataset``
    :return: DataFrame produced by ``get_features_dataset``
    """
    # Load the ResNet18 model
    resnet = torch.hub.load('pytorch/vision:v0.11.3', 'resnet18', pretrained=True)
    # Inference only: switch BatchNorm to its stored running statistics instead of
    # the statistics of each single-image batch.
    resnet.eval()
    train_dataset = dataset["train"]
    return get_features_dataset(train_dataset, resnet)

In [None]:
# Run the sequential feature extraction and keep the resulting DataFrame.
df = main(dataset)

In [None]:
# Show (rows, columns) of the features DataFrame.
print(df.shape)

In [None]:
# Load the ResNet18 model
# NOTE: the threaded extractor below loads its own model copies; this instance is
# not passed to it.
resnet = torch.hub.load('pytorch/vision:v0.11.3', 'resnet18', pretrained=True)
df = extract_features_resnet_threaded_cifar(dataset["train"])

# Rows come back in the order the worker threads finished. Sort them by the
# original image index and rebuild the row labels, otherwise label-based lookups
# such as df["Features"][0] still address the first *completed* row.
df = df.sort_values(by='Index').reset_index(drop=True)

In [None]:
# Type of one stored feature entry (positional access, independent of row labels)
print(type(df["Features"].iloc[0]))
import pickle

# Build the path from the dataset *name*: interpolating the `dataset` object
# would embed its repr in the path. Create the directory first so open() works.
os.makedirs(dataset_name, exist_ok=True)
with open(f"{dataset_name}/dataframe.pkl", "wb") as f:
    pickle.dump(df, f)

In [None]:
# Read the DataFrame back from the per-dataset directory. The path uses the
# dataset *name*; interpolating the `dataset` object would embed its repr.
with open(f"{dataset_name}/dataframe.pkl", "rb") as f:
    loaded_df = pickle.load(f)

# Type of one restored feature entry (positional access)
print(type(loaded_df["Features"].iloc[0]))

In [None]:
# Save the features DataFrame to a CSV file
filename = "features_cifar_check.csv"
df.to_csv(filename, index=False)

np_array = np.array(df['Features'])
flattened_array = np_array.tolist()
# Build the per-dimension table under its own name. Rebinding `df` here would
# drop the 'Features' column, so re-running this cell would fail.
features_df = pd.DataFrame(flattened_array)
# Save the feature matrix to a CSV file without row labels or a header
features_df.to_csv('np.csv', index=False, header=False)


In [None]:
import pickle

# Read back the pickled `subset_gen_time` object from the working directory.
# NOTE(review): this file is not written anywhere in this notebook; unpickling can
# execute arbitrary code, so only open a file from a trusted source.
with open("subset_gen_time.pkl", mode="rb") as f:
    subset_gen_time = pickle.loads(f.read())

In [None]:
# Show the object loaded from subset_gen_time.pkl.
print(subset_gen_time)

## Feature Generation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchvision
from tqdm import tqdm
import torchvision.transforms as transforms
from torchvision import datasets, transforms
from tqdm import tqdm 
import time
from torch.utils.data import random_split, Dataset, DataLoader
from torchvision.models.resnet import ResNet18_Weights
import pickle
import random
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
import statistics

In [None]:
# Preferred CUDA device index -- change to the GPU you want to use.
PREFERRED_CUDA_INDEX = 5

# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    # Fall back to GPU 0 when the preferred index does not exist on this machine,
    # instead of failing later when the model is moved to the device.
    cuda_index = PREFERRED_CUDA_INDEX if PREFERRED_CUDA_INDEX < torch.cuda.device_count() else 0
    device = f"cuda:{cuda_index}"
else:
    device = "cpu"

In [None]:
# Per-channel mean / std handed to Normalize in both pipelines
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random crop (with padding) and random horizontal flip,
# then tensor conversion and normalization
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Test pipeline: tensor conversion and normalization only
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Both splits share the same root folder and are downloaded when missing
cifar_kwargs = dict(root="../data", download=True)
train_dataset = datasets.CIFAR10(train=True, transform=transform_train, **cifar_kwargs)
test_dataset = datasets.CIFAR10(train=False, transform=transform_test, **cifar_kwargs)

In [None]:
# Pretrained ResNet-101 on the selected device.
resnet = torch.hub.load('pytorch/vision:v0.11.3', 'resnet101', pretrained=True).to(device)
# Inference only: eval() makes BatchNorm use its stored running statistics rather
# than per-batch statistics (and stops it from updating them).
resnet.eval()

In [None]:
# Batched loader over the training set. Defined here so this cell runs on a fresh
# kernel (it used the loader before the cell that creates it).
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=False, num_workers=2)

all_features = []
all_labels = []

resnet.eval()  # inference mode for BatchNorm; harmless if already set
# no_grad() keeps autograd from recording a graph for every batch, which would
# otherwise be kept alive by the tensors collected below.
with torch.no_grad():
    for images, labels in tqdm(train_dataloader):
        images = images.to(device)

        features = resnet(images)

        # Collect on the CPU: frees accelerator memory and lets the later
        # torch.cat(...).numpy() step work directly.
        all_features.append(features.cpu())
        all_labels.append(labels.cpu())

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=False, num_workers=2)

model = torch.hub.load('pytorch/vision:v0.11.3', 'resnet101', pretrained=True)

# Use only those of the requested GPUs that exist on this machine.
requested_device_ids = [0, 1, 2, 3]
device_ids = [i for i in requested_device_ids if i < torch.cuda.device_count()]
if device_ids:
    primary_device = f"cuda:{device_ids[0]}"
    model = nn.DataParallel(model.to(primary_device), device_ids=device_ids)
else:
    # No CUDA device available: run on whatever the device-selection cell picked.
    primary_device = device
    model = model.to(primary_device)

# Inference only: BatchNorm must use its stored running statistics.
model.eval()

all_features = []
all_labels = []

with torch.no_grad():
    for images, labels in tqdm(train_dataloader):
        images = images.to(primary_device)

        features = model(images)

        # Collect results on the CPU so they can be concatenated and converted to numpy
        all_features.append(features.cpu())
        all_labels.append(labels.cpu())

In [None]:
# Join the per-batch tensors along the first axis: features first, labels second.
stacked_features, stacked_labels = (
    torch.cat(batches, dim=0) for batches in (all_features, all_labels)
)

In [None]:
# One numpy row per image, so every DataFrame cell holds a full feature vector.
features_array = [row for row in stacked_features.numpy()]
labels_array = stacked_labels.numpy()

# Assemble the two columns into the features table.
df = pd.DataFrame(dict(Features=features_array, Label=labels_array))

In [None]:
# Length of the first feature vector, i.e. the number of values stored per image.
print(len(features_array[0]))

In [None]:
# Make sure the output directory exists; open() does not create it.
os.makedirs("./cifar10", exist_ok=True)
with open("./cifar10/dataframe3.pkl", "wb") as f:
    pickle.dump(df, f)

In [None]:
# Upgrade scikit-learn in the kernel's environment (no version is pinned).
%pip install -U scikit-learn

## Check: PCA and t-SNE visualization of the extracted features

In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import pickle
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the pickled features DataFrame written by the feature-generation section.
with open("cifar10/dataframe3.pkl", mode="rb") as f:
    data = pickle.loads(f.read())

In [None]:
# Project the feature vectors onto their first principal components.
n_components = 5
# Fix the seed: with the default solver choice PCA may use a randomized SVD on
# large inputs, which would make the projection differ between runs.
pca = PCA(n_components=n_components, random_state=42)
# Stack the per-row feature arrays into one 2D matrix (rows = samples)
features_matrix = np.stack(data['Features'].values)
reduced_features = pca.fit_transform(features_matrix)
# Store each reduced vector back next to its original row
data['Reduced_Features'] = list(reduced_features)

In [None]:
import inspect

# Column holding the vectors to embed
col = "Features"

# Keep each group's real label next to its rows so the legend can show it
groups = data.groupby('Label')
label_groups = list(groups)[5:9]
class_labels = [label for label, _ in label_groups]
dataframes = [group for _, group in label_groups]

# Seeds numpy's global generator, which pandas' sample() falls back to
np.random.seed(42)
n = 100  # samples drawn per class

all_samples = pd.concat([group_df.sample(n) for group_df in dataframes], ignore_index=True)
features = np.stack(all_samples[col].values)

# scikit-learn renamed TSNE's `n_iter` argument to `max_iter`; pass whichever
# name the installed version accepts.
tsne_iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE.__init__).parameters else "n_iter"
tsne = TSNE(n_components=2, verbose=1, perplexity=40, **{tsne_iter_kwarg: 300})
tsne_results = tsne.fit_transform(features)

# pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in recent releases
colors = plt.get_cmap('tab10', len(dataframes))  # Get a colormap with enough colors
fig, ax = plt.subplots()

# Samples were concatenated class by class, n rows each, so class i occupies
# rows [i*n, (i+1)*n). The legend shows the actual label value of each group.
for i, class_label in enumerate(class_labels):
    start_idx = i * n
    end_idx = start_idx + n
    ax.scatter(tsne_results[start_idx:end_idx, 0], tsne_results[start_idx:end_idx, 1], color=colors(i), label=f'class {class_label}', s=5)

ax.legend()
plt.title('t-SNE visualization of multiple DataFrames')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()