# Reduce Pretrained fastText

This is a short, simple notebook that reduces the size of the pretrained word vectors used by our fastText model. We do this by intersecting the large pretrained vocabulary with the set of words that appear at least once in our target dataset. After reducing the vocabulary this way, we use PCA to reduce the dimensionality of the vectors. We then save the reduced word-vector file so the fastText model can access it in training.

First we import our packages and load in our dataset and Word2Vec model from Ceph.

In [1]:
import os
import re
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
from gensim.models import Word2Vec
from dotenv import find_dotenv, load_dotenv
import boto3
import sys
import pandas as pd
from sklearn.decomposition import PCA
from collections import Counter

# Make the project's `preprocess` module importable: it is looked up under
# ../src/data, which is added to sys.path only if it is not already there
# (so re-running this cell does not add duplicate entries).
vocab_path = "../src/data"
if vocab_path not in sys.path:
    sys.path.insert(1, vocab_path)

# This import has to come after the sys.path change above, hence its late
# position and the lint suppression.
from preprocess import preprocess # noqa

# Fetch the NLTK "punkt" tokenizer data that nltk's word_tokenize relies on.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Locate a .env file and load its variables into the process environment;
# the cells below read their configuration from there.
load_dotenv(find_dotenv())

True

In [3]:
# Whether to read/write artifacts on Ceph (S3-compatible object storage) or
# on the local filesystem. USE_CEPH is expected to be "0" or "1"; when it is
# unset or empty we fall back to local storage instead of crashing on
# int(None) / int("").
use_ceph = bool(int(os.getenv("USE_CEPH") or "0"))

if use_ceph:
    # In Ceph mode the connection settings are mandatory, so index os.environ
    # directly: a missing variable raises a KeyError that names it.
    s3_endpoint_url = os.environ["OBJECT_STORAGE_ENDPOINT_URL"]
    s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
    s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
    s3_bucket = os.environ["OBJECT_STORAGE_BUCKET_NAME"]

    # Low-level client used by later cells for get_object / upload_file.
    s3 = boto3.client(
        service_name="s3",
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
        endpoint_url=s3_endpoint_url,
    )

In [4]:
# REPO_NAME selects the dataset: "org/repo" targets a single repository,
# while a bare name (no slash) targets a whole user/organization.
name = os.getenv("REPO_NAME")

# Fail early with an actionable message: an unset variable would otherwise
# surface as an opaque TypeError on the membership test below, and an empty
# one would silently produce an empty save name.
if not name:
    raise ValueError(
        "REPO_NAME must be set to an organization/user name or an 'org/repo' slug"
    )

if "/" in name:
    REPO = name
    USER = ""
else:
    USER = name
    REPO = ""

In [5]:
# Repository data is saved as {org_name}-_-{repo_name};
# organization data is saved as {org_name}.
savename = USER or REPO.replace("/", "-_-")
path = os.path.join("../data", savename + ".csv")
key = f"github-labeler/data/{savename}.csv"

# Pick the CSV source (Ceph object body or local file), then parse it once
# and drop exact duplicate rows.
if use_ceph:
    response = s3.get_object(Bucket=s3_bucket, Key=key)
    csv_source = response.get("Body")
else:
    csv_source = path

issues_df = pd.read_csv(csv_source).drop_duplicates()

In [6]:
# Collect the file names of the Word2Vec artifacts stored on Ceph under
# github-labeler/w2v/. In local mode there is nothing to list, so `keys`
# stays empty and the S3 settings (which only exist in Ceph mode) are never
# touched.
w2v_prefix = "github-labeler/w2v/"
keys = []

if use_ceph:
    buck = boto3.resource(
        service_name="s3",
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
        endpoint_url=s3_endpoint_url,
    )

    # Filter by prefix on the server instead of listing the whole bucket and
    # matching every key client-side.
    for obj in buck.Bucket(s3_bucket).objects.filter(Prefix=w2v_prefix):
        filename = os.path.basename(obj.key)
        # Keys ending in "/" have an empty basename and cannot be written as
        # a file under ../models/, so skip them.
        if filename:
            keys.append(filename)

In [7]:
if use_ceph:
    # Stream every listed Word2Vec artifact from the bucket into ../models/,
    # keeping the original file name.
    for key in keys:
        response = s3.get_object(Bucket=s3_bucket, Key=f"github-labeler/w2v/{key}")
        with open(f'../models/{key}', 'wb') as f:
            # Iterating the response body yields byte chunks; write them in order.
            f.writelines(response['Body'])

In [8]:
# Load the pretrained Word2Vec model from the models directory and keep its
# vocabulary as a set so it can be intersected with the dataset's words.
w = Word2Vec.load('../models/w2v.model')
pretrained_tokens = w.wv.index_to_key
full_vocab = set(pretrained_tokens)

Now we collect all the words that are mentioned in our dataset and intersect them with the vocabulary of the pre-trained Word2Vec model.

In [9]:
# Count every token that occurs in the preprocessed issue texts.
cnt = Counter()

# Walk the two columns directly instead of using iterrows(), which builds a
# Series object for every row.
for title, body in zip(issues_df["title"], issues_df["body"]):
    # Rows whose body is not a string (e.g. a missing value) contribute the
    # title only; otherwise title and body are joined with a " SEP " marker.
    text = title + " SEP " + body if isinstance(body, str) else title
    cnt.update(word_tokenize(preprocess(text)))

In [10]:
# Words that occur in the dataset AND have a pretrained vector
# (iterating the Counter yields its keys, i.e. the distinct tokens).
shared_vocab = full_vocab.intersection(cnt)

In [11]:
# Report how many words survive the vocabulary reduction.
print(f'{len(shared_vocab)} words in reduced model')

13622 words in reduced model


In [12]:
# Map each shared word to its pretrained vector.
shared_vocab_dict = {token: w.wv[token] for token in shared_vocab}

We perform PCA and keep the smallest number of leading components whose singular values add up to more than 70% of the sum of all singular values.

In [13]:
# Share of the summed singular values that the kept components must exceed.
SV_RATIO_THRESHOLD = 0.7

# Row i of pca_in is the vector of words[i]; both follow the dict's order.
words = list(shared_vocab_dict)
pca_in = np.stack(list(shared_vocab_dict.values()))

# First pass: fit with all components to obtain the full singular-value
# spectrum.
pca = PCA()
pca.fit(pca_in)

# Cumulative share of the singular values (non-decreasing, ends at 1.0).
# searchsorted(side="right") gives the first index whose share is strictly
# greater than the threshold; +1 turns that index into a component count.
sv_share = np.cumsum(pca.singular_values_, dtype=np.float64)
sv_share /= sv_share[-1]
dim = int(np.searchsorted(sv_share, SV_RATIO_THRESHOLD, side="right")) + 1

print(f'{dim} dimensions in reduced model')

# Second pass: project onto the leading `dim` components. The exact solver is
# requested explicitly because, depending on the scikit-learn version, the
# default "auto" policy may choose the randomized solver for a truncated fit,
# which is not reproducible without a fixed random_state.
pca = PCA(n_components=dim, svd_solver="full")
pca_out = pca.fit_transform(pca_in)

78 dimensions in reduced model


In [14]:
# Write the reduced vectors in the plain-text .vec layout: a header line
# "<number of words> <dimensions>" followed by one "<word> <v1> ... <vn>"
# line per word.
with open('vocab_reduced.vec', 'w', encoding='utf-8') as f:
    # Take the count from the list that is actually written below.
    f.write(f'{len(words)} {dim}\n')
    # The loop variable is deliberately not called `w`: that name holds the
    # loaded Word2Vec model, and rebinding it here would break any earlier
    # cell that is re-run afterwards.
    for word, vec in zip(words, pca_out):
        f.write(word + ' ' + ' '.join(str(component) for component in vec) + '\n')

We upload this reduced vector file to Ceph and delete the large Word2Vec files.

In [15]:
if use_ceph:
    # Publish the reduced vector file under this dataset's prefix in the
    # bucket (arguments are: local filename, bucket, object key).
    s3.upload_file(
        "vocab_reduced.vec",
        s3_bucket,
        f"github-labeler/{savename}/vocab_reduced.vec",
    )

In [17]:
if use_ceph:
    # Everything was fetched from / pushed to Ceph, so the local copies are
    # disposable: drop each entry in ../models/ whose name starts with "w2v"...
    w2v_pattern = re.compile('w2v.*')
    stale_files = [item for item in os.listdir('../models/') if w2v_pattern.match(item)]
    for item in stale_files:
        os.remove(os.path.join('../models', item))

    # ...and the reduced vector file that was just uploaded.
    os.remove('vocab_reduced.vec')