# Assignment 2: Dimensionality and PCA (10 pts)

In [None]:
# Install torchextractor for facilitating feature extraction
!pip install torchextractor

# Import packages
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter('ignore')

import torch
from torchvision import models, transforms
import torchextractor as tx
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.decomposition import PCA

In [None]:
# Download the dataset archive with gdown.
# NOTE(review): the argument looks like a Google Drive file ID, and the steps below
# assume the download is saved as natural_scenes_demo.tar.gz — confirm.
!gdown 1q5mPgOpEsWx4x_FeuZWu-8HfsdGr-aLS
# Create the natural_scenes_demo directory if it does not exist yet, then (only if
# that succeeded) unpack the gzipped tar archive into it
!mkdir -p natural_scenes_demo && tar -xzf natural_scenes_demo.tar.gz -C natural_scenes_demo
# Remove the downloaded archive (this runs whether or not the extraction succeeded)
!rm natural_scenes_demo.tar.gz

The layer activations from AlexNet fc7 for the 1000 images can be extracted with this demo code. Note that the library `torchextractor` is used here to facilitate feature extraction.

In [None]:
# Extract one activation vector per image from a single AlexNet layer and stack
# the vectors into `activations_fc7` (one row per row of the stimulus CSV).

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pre-trained model and switch it to evaluation mode
model = models.alexnet(weights='IMAGENET1K_V1')
model.to(device).eval()

# Define layers to extract
layers = ["classifier.4"]  # e.g. AlexNet fc7

# Wrap the model with torchextractor; calling the wrapped model returns
# (model_output, {layer_name: activation})
model = tx.Extractor(model, layers)

# Define image preprocessing
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Read the image information
df = pd.read_csv('natural_scenes_demo/stimulus_data.csv')
# Initialize list to hold activations
activations_list = []

# Loop over images in the dataset to extract activations
for img_path in tqdm(df['image_path']):
    # Open the image inside a context manager so the file handle is always closed.
    # Force 3-channel RGB: the Normalize step above is defined for exactly three
    # channels, so grayscale / RGBA / palette images would otherwise raise.
    with Image.open(img_path) as img:
        # Preprocess the image and add batch dimension
        image = preprocess(img.convert('RGB')).unsqueeze(0).to(device)
    # Forward pass through the model (no gradients needed for feature extraction)
    with torch.no_grad():
        _, activations = model(image)
    # Store the activations: drop the batch dimension and move the result to NumPy.
    # The key is read from `layers` so it cannot drift from the extracted layer.
    activations_list.append(activations[layers[0]].squeeze(0).detach().cpu().numpy())

# Convert list to array
activations_fc7 = np.array(activations_list)

#### ✏️ Do it yourself (2 pt):
Estimate the dimensionality of the fc7 (layer 7) activations by computing the rank of the activation matrix. \
_Hint: use `np.linalg.matrix_rank` to compute rank_

In [None]:
# Insert your code here.
# Your code must define `dim_rank` (the rank estimated from `activations_fc7`),
# because the print statement below reads it.


print(f'Rank of the matrix is: {dim_rank}')

This demo performs PCA on fc7 features of the 1000 images. 

In [None]:
# Create the PCA estimator with its default setting (no explicit cap on the
# number of components)
pca = PCA(n_components=None)
# Learn the principal components from the fc7 activation matrix
pca.fit(activations_fc7)

#### ✏️ Do it yourself (2 pt):
Estimate the dimensionality of the activations as the number of components needed to explain 85% of the total variance. \
_Hint: use `pca.explained_variance_ratio_`_

In [None]:
# Insert your code here.
# Your code must define `dim_thresh` (the number of components found for the
# 85% variance threshold), because the print statement below reads it.


print(f'Number of components to explain 85% variance: {dim_thresh}')

#### ✏️ Do it yourself (2 pt):
Estimate the dimensionality of the activations by computing their effective dimensionality. \
_Hint: use `pca.explained_variance_`_

In [None]:
# Insert your code here.
# Your code must define `dim_eff` (the effective dimensionality of the
# activations), because the print statement below reads it.


print(f'The effective dimensionality is: {dim_eff}')

#### ✏️ Do it yourself (4 pts):
Plot the reconstruction error as a function of the number of PCs, separately for the training and test sets. \
The first 800 images are the training set. The last 200 images are the test set. \
Only show the plot for the first 20 PCs. \
_Hint: Use `pca.transform` to project data onto the latent dimensions; use `pca.inverse_transform` to decode from the latent dimensions_

In [None]:
# Split the activation matrix by row: rows 0-799 form the training set and
# every row from index 800 onward forms the test set
X_train, X_test = activations_fc7[:800], activations_fc7[800:]

# Insert your code here
