# Perform PCA on VGG16 Embeddings

Convert 4,096-dimensional VGG16 embeddings into 2-dimensional embeddings with PCA.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
import json
import h5py

import dask
import dask.array as da
from dask_ml.decomposition import PCA, IncrementalPCA

Load in the array of size `n_images` by `embedding_shape` from the `hdf5` file.

In [8]:
f = h5py.File('data/vgg16images/embeddings.hdf5',"r")
d = f['mydataset']
n_images, embedding_shape = d.shape
n_images, embedding_shape

(132617, 4096)

Convert it to a dask array

In [9]:
embeddings = da.from_array(d, chunks=(1000, 1000))

Perform PCA using `dask-ml`. Reducing dimensions from `132,617 x 4,096` to `132,617 x 2`.

In [10]:
pca = PCA(n_components=2)
pca

PCA(n_components=2)

In [11]:
pca.fit(embeddings)

PCA(n_components=2)

In [12]:
pca.explained_variance_ratio_

array([0.07109076, 0.0390475 ])

In [13]:
pca_embeddings = pca.transform(embeddings)
pca_embeddings

Unnamed: 0,Array,Chunk
Bytes,2.12 MB,16.00 kB
Shape,"(132617, 2)","(1000, 2)"
Count,3971 Tasks,133 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.12 MB 16.00 kB Shape (132617, 2) (1000, 2) Count 3971 Tasks 133 Chunks Type float64 numpy.ndarray",2  132617,

Unnamed: 0,Array,Chunk
Bytes,2.12 MB,16.00 kB
Shape,"(132617, 2)","(1000, 2)"
Count,3971 Tasks,133 Chunks
Type,float64,numpy.ndarray


Save out the result to an HDF5 file.

In [17]:
da.to_hdf5('data/vgg16images/pca_embeddings.hdf5', '/pca_embeddings', pca_embeddings)