In [34]:
from sklearn.decomposition import PCA
import numpy as np
from tqdm import tqdm

In [35]:
# Seed NumPy's global random number generator with a fixed value.
seed = 42
np.random.seed(seed)

# Config

In [50]:
# Number of components requested from every PCA fit in this notebook.
reduced_dimensions = 150
# Label interpolated into the name of the output embeddings file.
experiment_name = "pca_v2"
# Path of the input embeddings text file (each line: a word followed by its vector values).
dataset = "data/embeddings/base/clipped.glove.6B.300d.txt"
# When True, PPA is applied before and after the PCA reduction; when False, only PCA is run.
use_ppa = True # PPA = Post Processing Algorithm from Mu & Viswanath

# Load & Prepare Embeddings

In [37]:
# Read the embeddings file: the first whitespace-separated field of every row
# is the word, the remaining fields are its vector values.
words = []
vectors = []
with open(dataset, "r", encoding='utf8') as embedding_file:
    for row in embedding_file:
        fields = row.split()
        words.append(fields[0])
        vectors.append(np.asarray(fields[1:], 'float32'))
# Stack the per-word vectors into one 2-D matrix (one row per word).
vectors = np.asarray(vectors)

In [51]:
# Display the dimensions of the loaded embedding matrix as a quick sanity check.
vectors.shape

(400000, 300)

# Model Time

In [52]:
def ppa(vectors, D=7):
    """Apply the Post Processing Algorithm (PPA) of Mu & Viswanath.

    The embeddings are centred by subtracting their per-dimension mean vector,
    then the projections onto the top ``D`` principal components are removed.

    Args:
        vectors: 2-D array with one embedding per row.
        D: Number of leading principal components to remove (default 7).
            PCA is fitted with the notebook-level ``reduced_dimensions``
            components, so at most that many components are available.

    Returns:
        A new ``np.ndarray`` with the same shape as ``vectors`` containing the
        post-processed embeddings.
    """
    vectors = np.asarray(vectors)
    # Subtract the mean *vector*: without ``axis=0`` np.mean collapses the whole
    # matrix to a single scalar, which does not centre the individual dimensions.
    mean_removed = vectors - np.mean(vectors, axis=0)
    # PCA is fitted only to obtain the principal directions; the projected
    # data itself is not needed.
    pca = PCA(n_components=reduced_dimensions)
    pca.fit(mean_removed)
    top_components = pca.components_[:D]
    # Remove the top-D components. The principal axes are orthonormal, so
    # stripping them one after another is the same as subtracting the
    # projection onto their span in a single matrix product.
    projections = mean_removed @ top_components.T
    return mean_removed - projections @ top_components

In [53]:
if use_ppa:
    # Pipeline (PPA -> PCA -> PPA) follows
    # https://github.com/vyraun/Half-Size/blob/master/algo.py
    # Apply PPA once on the full-size embeddings
    reduced_vectors = ppa(vectors, D=7)

    # Perform PCA. Centre with the per-dimension mean vector (axis=0) rather
    # than a single scalar mean; the array returned by ppa() is a fresh copy,
    # so the in-place subtraction does not touch `vectors`.
    pca = PCA(n_components=reduced_dimensions)
    reduced_vectors -= np.mean(reduced_vectors, axis=0)
    reduced_vectors = pca.fit_transform(reduced_vectors)

    # Apply PPA the 2nd time, now on the reduced embeddings
    reduced_vectors = ppa(reduced_vectors, D=7)
else:
    # Plain PCA baseline without any post-processing
    pca = PCA(n_components=reduced_dimensions)
    reduced_vectors = pca.fit_transform(vectors)

In [54]:
# Display the dimensions of the reduced embedding matrix as a quick sanity check.
reduced_vectors.shape

(400000, 150)

# Save Model & Embeddings

In [55]:
# Need to convert the latent embeddings into the glove format
# word dim1 dim2 dim3 dim4 ... dimX
# `total` is passed explicitly because a zip object has no length; with it
# tqdm can render a real progress bar and ETA instead of a bare counter.
lines = [
    ' '.join([word, *map(str, vector.tolist())])
    for word, vector in tqdm(zip(words, reduced_vectors), total=len(words))
]

400000it [00:56, 7076.27it/s]


In [56]:
# Write all serialised embedding rows to the experiment's output file,
# separated by newlines (no trailing newline after the last row).
# NOTE(review): the file name hard-codes "300d" while the saved vectors have
# `reduced_dimensions` columns -- confirm downstream loaders expect this name.
output_path = f"data/embeddings/trained/{experiment_name}.glove.6B.300d.txt"
with open(output_path, mode="w", encoding="utf-8") as output_file:
    output_file.write("\n".join(lines))