In this notebook we carry over the work from the models/train_models.ipynb notebook. The main difference is that we create a model class for a pre-trained fastText model to evaluate whether pre-training actually helps. We see that it did help a little, but it sometimes made models worse, and in its current state it is not worth the hassle to implement, especially since considerable work would have to be done to reduce the size of the model. Future work that aims to improve the fastText part of the model can use this notebook to run its own experiments with a different pre-trained model.

In [28]:
import re
import numpy as np
import os
from nltk.tokenize import word_tokenize
import nltk
from gensim.models import Word2Vec
from dotenv import find_dotenv, load_dotenv
import boto3
from tqdm.notebook import tqdm
import sys
import pandas as pd
from sklearn.model_selection import KFold
import fasttext
from skift import FirstColFtClassifier
from sklearn.decomposition import PCA
from collections import Counter

# Put the project's source folders on the import path for the local imports
# that follow.
vocab_path, models_path = "../src/data", "../models"
for _extra_dir in (vocab_path, models_path):
    # Each new entry goes into slot 1, i.e. right behind the first path entry.
    if _extra_dir not in sys.path:
        sys.path.insert(1, _extra_dir)

from model_classes import FtModel, SVM # noqa
from preprocess import preprocess, process #noqa
from w2v_preprocess import is_punc # noqa

# Download the NLTK "punkt" package (presumably the data word_tokenize relies on -- confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/atersaak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# REPO_NAME selects the data set: a value containing "/" is stored in REPO,
# any other value is stored in USER; the other variable is left empty.
# NOTE(review): this cell runs before the notebook's load_dotenv() cell, so a
# REPO_NAME defined only in .env may not be visible here yet -- confirm the
# intended cell order.
name = os.getenv("REPO_NAME")

if name is None:
    # Fail with an actionable message instead of the opaque
    # "argument of type 'NoneType' is not iterable" that `"/" in None` raises.
    raise RuntimeError(
        "REPO_NAME environment variable is not set; "
        "expected '<owner>/<repo>' or a user name"
    )

REPO, USER = (name, "") if "/" in name else ("", name)

In [3]:
# Locate a .env file with find_dotenv() and load it; the configuration cells
# in this notebook read their settings through os.getenv / os.environ.
load_dotenv(find_dotenv())

True

In [4]:
# whether to use ceph or store locally
# USE_CEPH is parsed as an integer flag (non-zero -> Ceph/S3, 0 -> local files).
# An unset or empty variable now means "local" instead of crashing in int().
use_ceph = bool(int(os.getenv("USE_CEPH") or "0"))

if use_ceph:
    # All four settings are mandatory once Ceph is enabled; os.environ[...]
    # raises a KeyError naming whichever one is missing.
    s3_endpoint_url = os.environ["OBJECT_STORAGE_ENDPOINT_URL"]
    s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
    s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
    s3_bucket = os.environ["OBJECT_STORAGE_BUCKET_NAME"]

    s3 = boto3.client(
        service_name="s3",
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
        endpoint_url=s3_endpoint_url,
    )

In [5]:
pattern = re.compile('github-labeler/w2v/.*')

# File names (without the prefix) of the objects to download; stays empty for
# local runs.
keys = []

# The credentials used below only exist when Ceph is enabled, so the bucket is
# queried only in that case (the unguarded version raised NameError locally).
if use_ceph:
    buck = boto3.resource(
        service_name="s3",
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
        endpoint_url=s3_endpoint_url,
    )

    # Keep the base name of every object whose key matches the w2v prefix.
    # Keys that end in "/" have an empty base name, i.e. no file name to save
    # under, so they are skipped.
    keys = [
        os.path.basename(obj.key)
        for obj in buck.Bucket(s3_bucket).objects.all()
        if pattern.match(obj.key) and os.path.basename(obj.key)
    ]

In [6]:
if use_ceph:
    # Copy each listed object from the bucket into the ../models folder.
    for key in keys:
        response = s3.get_object(Bucket=s3_bucket, Key=f"github-labeler/w2v/{key}")
        with open(f'../models/{key}', 'wb') as f:
            # tqdm wraps the response body, so progress advances once per
            # chunk written to disk.
            f.writelines(tqdm(response['Body']))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [13]:
# Load the gensim Word2Vec model stored at ../models/w2v.model.
w = Word2Vec.load('../models/w2v.model')

In [14]:
# Keyed vectors of the loaded model, and the size of a single vector.
vocabulary = w.wv
vec_size = vocabulary.vector_size

In [15]:
full_vocab = w.wv.index_to_key

with open('vocab.vec', 'w', encoding='utf-8') as f:
    # First line: number of words, then the vector size.
    f.write(f"{len(full_vocab)} {vec_size}\n")
    # One line per word: the word followed by its space-separated components.
    for word in tqdm(full_vocab):
        components = ' '.join(map(str, w.wv[word]))
        f.write(f"{word} {components}\n")

# From here on the vocabulary is only used for membership tests.
full_vocab = set(full_vocab)

  0%|          | 0/386705 [00:00<?, ?it/s]

In [16]:
# The data file is named after the user when one is set, otherwise after the
# repository with "/" replaced by "-_-".
savename = USER or REPO.replace("/", "-_-")
key = f"github-labeler/data/{savename}.csv"
path = os.path.join("../data", f"{savename}.csv")

if not use_ceph:
    issues_df = pd.read_csv(path)
else:
    response = s3.get_object(Bucket=s3_bucket, Key=key)
    issues_df = pd.read_csv(response.get("Body"))

# Exact duplicate rows are dropped whichever source was used.
issues_df = issues_df.drop_duplicates()

In [39]:
cnt = Counter()

# Count every token that occurs in the issues' titles and bodies.
for title, body in zip(issues_df["title"], issues_df["body"]):
    # Cells can hold non-string values (e.g. NaN for an empty CSV field), so
    # only real strings are concatenated. A missing title is treated as empty
    # text; a missing body leaves just the title, as before.
    title = title if isinstance(title, str) else ""
    text = f"{title} SEP {body}" if isinstance(body, str) else title
    cnt.update(word_tokenize(preprocess(text)))

In [49]:
# Tokens counted in the issues that are also present in the full vocabulary.
shared_vocab = set(full_vocab) & set(cnt)

In [72]:
def in_set(word):
    """Return ``word`` if ``full_vocab`` contains it, otherwise ``'_unknown_'``."""
    return word if word in full_vocab else '_unknown_'
    
def in_set_reduced(word):
    """Return ``word`` if ``shared_vocab`` contains it, otherwise ``'_unknown_'``."""
    return word if word in shared_vocab else '_unknown_'

In [52]:
# Vector lookup restricted to the shared vocabulary: word -> word2vec vector.
shared_vocab_dict = {v: w.wv[v] for v in shared_vocab}

In [61]:
# Stack the shared-vocabulary vectors into one matrix (one row per word) and
# keep the words in the same order so rows can be mapped back to them.
words = list(shared_vocab_dict)
pca_in = np.stack(list(shared_vocab_dict.values()))

# First pass: an unrestricted PCA, fitted only to read off the singular values.
pca = PCA()
pca.fit(pca_in)
sum_svs = sum(pca.singular_values_)

# Count leading components until their singular values add up to more than
# 70% of the total.
dim = 0
running = 0
for dim, sv in enumerate(pca.singular_values_, start=1):
    running += sv
    if running / sum_svs > 0.7:
        break

print(f'{dim} dimensions in reduced model')

# Second pass: refit with the chosen number of components and project.
pca = PCA(n_components=dim)
pca_out = pca.fit_transform(pca_in)

78 dimensions in reduced model


In [86]:
with open('vocab_reduced.vec', 'w', encoding='utf-8') as f:
    # The header has to announce the number of rows actually written below
    # (len(words)), not the size of the full word2vec vocabulary.
    f.write(f"{len(words)} {dim}\n")
    # The loop variable is deliberately not named `w`: that name holds the
    # loaded Word2Vec model and must survive this cell.
    # zip() has no length, so tqdm is given the total explicitly.
    for word, vec in tqdm(zip(words, pca_out), total=len(words)):
        f.write(word + ' ' + ' '.join(str(v) for v in vec))
        f.write('\n')

full_vocab = set(full_vocab)

0it [00:00, ?it/s]

In [87]:
# Width of the projected vectors.
pca_out.shape[1]

78

Below is the most important part; everything else is mainly copy & paste. Here we define the pre-trained fastText classes and the necessary methods. In the `__init__` methods, the files 'vocab.vec' and 'vocab_reduced.vec' contain the pre-trained vectors themselves (the full and the PCA-reduced set, respectively).

In [88]:
class FtModelPretrained(FirstColFtClassifier):
    """fastText classifier started from the full set of pre-trained vectors.

    Built on skift's ``FirstColFtClassifier``. Issues are supplied as
    DataFrames with ``title`` and ``body`` columns; :meth:`preprocess` turns
    them into the text given to fastText, replacing every word that is not in
    ``full_vocab`` by ``'_unknown_'`` (via ``in_set``).
    """

    def __init__(self, path=""):
        """Create an untrained model, or wrap a saved one.

        Args:
            path: Location of a saved fastText model. When empty (default) a
                new classifier is configured with
                ``pretrainedVectors='vocab.vec'`` and ``dim=vec_size``;
                otherwise the model stored at ``path`` is loaded.
        """
        if not path:
            super().__init__(pretrainedVectors='vocab.vec', dim=vec_size)
        else:
            # NOTE(review): the base-class initialiser is not run on this
            # path, so only the attribute set here exists on the instance.
            self.model = fasttext.load_model(path)

    def preprocess(self, x):
        """Build one cleaned text string per issue.

        Args:
            x: DataFrame with ``title`` and ``body`` columns; missing values
                are treated as empty strings.

        Returns:
            List of strings, one per row of ``x``: title and body joined by
            ``' SEP '``, passed through the module-level ``preprocess``,
            lower-cased, tokenised, stripped of punctuation tokens, with
            out-of-vocabulary words replaced by ``'_unknown_'``.
        """
        text = x.title.fillna('') + ' SEP ' + x.body.fillna('')
        text = text.apply(preprocess)
        text = text.apply(lambda s: s.lower())
        tokenised = text.apply(word_tokenize).values
        return [
            ' '.join(in_set(tok) for tok in issue if not is_punc(tok))
            for issue in tokenised
        ]

    def fit(self, x, y):
        """Fit the model on a DataFrame of issues and return ``self``."""
        input_ = np.array(self.preprocess(x)).reshape(-1, 1)
        super().fit(input_, y)
        # Returning the estimator follows the scikit-learn convention and
        # allows ``Model().fit(x, y).predict(...)``.
        return self

    def predict(self, x):
        """Predict one label per row of the DataFrame ``x``."""
        input_ = np.array(self.preprocess(x)).reshape(-1, 1)
        return super().predict(input_)

    def save(self, path):
        """Save the underlying fastText model to ``path``."""
        return self.model.save_model(path)

    def inference(self, title, body):
        """Predict the label of a single issue (used by the app).

        The issue goes through the same :meth:`preprocess` pipeline as the
        training data, so unknown words are mapped to ``'_unknown_'`` exactly
        as they were during :meth:`fit`.

        Args:
            title: Issue title (``None`` is treated as empty).
            body: Issue body (``None`` is treated as empty).

        Returns:
            The predicted value for this one issue.
        """
        frame = pd.DataFrame({"title": [title], "body": [body]})
        return self.predict(frame)[0]
    
class FtModelPretrainedReduced(FirstColFtClassifier):
    """fastText classifier started from the PCA-reduced pre-trained vectors.

    Built on skift's ``FirstColFtClassifier``. Issues are supplied as
    DataFrames with ``title`` and ``body`` columns; :meth:`preprocess` turns
    them into the text given to fastText, replacing every word that is not in
    ``shared_vocab`` by ``'_unknown_'`` (via ``in_set_reduced``).
    """

    def __init__(self, path=""):
        """Create an untrained model, or wrap a saved one.

        Args:
            path: Location of a saved fastText model. When empty (default) a
                new classifier is configured with
                ``pretrainedVectors='vocab_reduced.vec'`` and ``dim=dim``;
                otherwise the model stored at ``path`` is loaded.
        """
        if not path:
            super().__init__(pretrainedVectors='vocab_reduced.vec', dim=dim)
        else:
            # NOTE(review): the base-class initialiser is not run on this
            # path, so only the attribute set here exists on the instance.
            self.model = fasttext.load_model(path)

    def preprocess(self, x):
        """Build one cleaned text string per issue.

        Args:
            x: DataFrame with ``title`` and ``body`` columns; missing values
                are treated as empty strings.

        Returns:
            List of strings, one per row of ``x``: title and body joined by
            ``' SEP '``, passed through the module-level ``preprocess``,
            lower-cased, tokenised, stripped of punctuation tokens, with words
            outside the shared vocabulary replaced by ``'_unknown_'``.
        """
        text = x.title.fillna('') + ' SEP ' + x.body.fillna('')
        text = text.apply(preprocess)
        text = text.apply(lambda s: s.lower())
        tokenised = text.apply(word_tokenize).values
        return [
            ' '.join(in_set_reduced(tok) for tok in issue if not is_punc(tok))
            for issue in tokenised
        ]

    def fit(self, x, y):
        """Fit the model on a DataFrame of issues and return ``self``."""
        input_ = np.array(self.preprocess(x)).reshape(-1, 1)
        super().fit(input_, y)
        # Returning the estimator follows the scikit-learn convention and
        # allows ``Model().fit(x, y).predict(...)``.
        return self

    def predict(self, x):
        """Predict one label per row of the DataFrame ``x``."""
        input_ = np.array(self.preprocess(x)).reshape(-1, 1)
        return super().predict(input_)

    def save(self, path):
        """Save the underlying fastText model to ``path``."""
        return self.model.save_model(path)

    def inference(self, title, body):
        """Predict the label of a single issue (used by the app).

        The issue goes through the same :meth:`preprocess` pipeline as the
        training data, so words outside the shared vocabulary are mapped to
        ``'_unknown_'`` exactly as they were during :meth:`fit`.

        Args:
            title: Issue title (``None`` is treated as empty).
            body: Issue body (``None`` is treated as empty).

        Returns:
            The predicted value for this one issue.
        """
        frame = pd.DataFrame({"title": [title], "body": [body]})
        return self.predict(frame)[0]

In [75]:
def get_subdataset(label):
    """Build a balanced binary dataset for one label.

    Reads the module-level ``issues_df`` (columns used: ``id``, ``labels`` as
    a tab-separated string, ``num_labels``) and ``final_labels``.

    Positive samples are all labelled issues that carry ``label``. Negative
    samples are drawn from labelled issues that do not carry it, spread as
    evenly as possible over the other entries of ``final_labels``. Unlabelled
    issues are only used to fill a gap that remains afterwards.

    Args:
        label: Target label; must be an element of ``final_labels``.

    Returns:
        Tuple ``(x, y)``: ``x`` is a DataFrame holding the negative rows
        followed by the positive rows, ``y`` an int array with 0 for each
        negative and 1 for each positive row.

    Raises:
        ValueError: If ``label`` is not in ``final_labels``.
    """

    def carries(frame, name):
        """Return a boolean array: does each row of ``frame`` carry ``name``?"""
        # A real bool array keeps row filtering well-defined even when
        # ``frame`` has no rows left.
        return np.array(
            [name in labels.split("\t") for labels in frame.labels], dtype=bool
        )

    labelled = issues_df[~issues_df.labels.isna()]
    final_labels_ = final_labels.copy()
    final_labels_.remove(label)

    is_positive = carries(labelled, label)
    pos_samples = labelled[is_positive]
    n = len(pos_samples)
    # Negative candidates are the labelled issues WITHOUT the target label.
    # (The filter used to run on the ``id`` column, which holds no labels.)
    remaining = labelled[~is_positive]

    n_neg = 0
    # With no other label to draw from, nothing is sampled per label.
    per_label = n // len(final_labels_) if final_labels_ else 0
    neg_ids = set()
    # evenly sample if we can
    # if not enough samples for a label, throw them all in and increase the
    # remaining amount we need per label
    for i, lbl in enumerate(reversed(final_labels_)):
        has_lbl = carries(remaining, lbl)
        neg_samples = remaining[has_lbl]
        if len(neg_samples) >= per_label:
            neg_samples = neg_samples.sample(per_label)
            n_neg += per_label
        else:
            n_neg += len(neg_samples)
            if i != len(final_labels_) - 1:
                per_label = (n - n_neg) // (len(final_labels_) - i - 1)
        # An issue is considered for at most one of its labels.
        remaining = remaining[~has_lbl]
        neg_ids = neg_ids.union(set(neg_samples.id))

    # fill in potential gap with unlabelled issues, if needed
    if len(pos_samples) > len(neg_ids):
        missing = len(pos_samples) - len(neg_ids)
        unlabelled = issues_df.query("num_labels == 0")
        if len(unlabelled) > missing:
            neg_ids = neg_ids.union(set(unlabelled.sample(missing).id))
        else:
            # Still short of negatives: take every unlabelled issue and
            # down-sample the positives to keep the classes balanced.
            neg_ids = neg_ids.union(set(unlabelled.id))
            pos_samples = pos_samples.sample(len(neg_ids))

    final_neg_samples = issues_df[issues_df.id.isin(neg_ids)]
    x = pd.concat((final_neg_samples, pos_samples))
    y = np.concatenate((np.zeros(len(final_neg_samples)), np.ones(len(pos_samples))))
    return x, y.astype(int)

In [76]:
def predict_label(x, y, k=5, model_class=FtModel, label_name=None):
    """Cross-validate ``model_class`` on one label's dataset.

    Args:
        x: DataFrame of issues; folds are selected with ``.iloc``.
        y: NumPy array of 0/1 targets aligned with the rows of ``x``.
        k: Number of cross-validation folds.
        model_class: Zero-argument callable returning a model that offers
            ``fit(x, y)`` and ``predict(x)``.
        label_name: Value written to the ``label`` column of the result.
            Defaults to the module-level variable ``label``, which is what
            the function always used before this parameter existed.

    Returns:
        One-row DataFrame with the columns ``label``, ``n``, ``accuracy``,
        ``precision`` and ``recall``. Each metric is the mean over the folds;
        folds in which precision or recall is undefined are skipped, and the
        metric is NaN when it is undefined in every fold.
    """
    if label_name is None:
        label_name = label  # module-level variable set by the calling loop
    kf = KFold(n_splits=k, random_state=None, shuffle=True)
    accuracy = []
    precision = []
    recall = []
    for train_index, test_index in kf.split(x):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # A fresh model per fold (it used to be constructed twice).
        model = model_class()
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        accuracy.append(np.mean(preds == y_test))
        # Precision: share of predicted positives that are truly positive.
        precision_ = preds[preds == 1] == y_test[preds == 1]
        if len(precision_) != 0:
            precision.append(np.mean(precision_))
        # Recall: share of true positives that were predicted positive.
        recall_ = preds[y_test == 1] == y_test[y_test == 1]
        if len(recall_) != 0:
            recall.append(np.mean(recall_))
    cols = ["label", "n", "accuracy", "precision", "recall"]
    data = [
        label_name,
        len(x),
        np.mean(accuracy),
        # Same NaN result as np.mean([]) but without the RuntimeWarning.
        np.mean(precision) if precision else np.nan,
        np.mean(recall) if recall else np.nan,
    ]
    return pd.DataFrame([data], columns=cols)

In [68]:
# Store the output of process(title, body), wrapped in a NumPy array, in a
# "processed" column for every issue.
# NOTE(review): presumably read by the FtModel / SVM classes imported from
# model_classes -- confirm.
issues_df["processed"] = issues_df.apply(
    lambda row: np.array(process(row["title"], row["body"])), axis=1
)

In [69]:
def dummy_fun(doc):
    """Return ``doc`` unchanged (identity function)."""
    return doc

In [70]:
# Labels that are evaluated in this notebook. get_subdataset() also reads this
# list to pick the labels that negative samples are drawn from.
final_labels = ['kind/bug',
                'kind/test-flake',
                'component/cli',
                'kind/question',
                'area/usability',
                'component/build',
                'component/web',
                'area/tests',
                'component/apps',
                'component/kubernetes',
                'component/imageregistry',
                'component/networking',
                'component/auth',
                'sig/master',
                'area/techdebt',
                'component/routing',
                'component/restapi',
                'area/security',
                'component/storage',
                'component/internal-tools',
                'component/install',
                'sig/developer-experience',
                'kind/post-rebase',
                'component/composition',
                'component/image',
                'sig/security',
                'sig/networking',
                'area/documentation',
                'area/performance',
                'component/cluster-up',
                'area/infrastructure',
                'component/logging',
                'sig/pod',
                'help wanted',
                'sig/storage',
                'component/metrics',
                'kind/feature',
                'component/containers']

In [92]:
predicted_labels = []
# Models are evaluated in the order svm, ft, ft_pt, but their result rows are
# collected in the order ft, ft_pt, svm.
# The loop variable has to be called `label`: predict_label() reads that
# module-level name for its "label" column.
for label in tqdm(final_labels):
    x, y = get_subdataset(label)
    frames = {}
    for tag, cls in (("svm", SVM), ("ft", FtModel), ("ft_pt", FtModelPretrainedReduced)):
        frames[tag] = predict_label(x, y, model_class=cls)
    for tag in ("ft", "ft_pt", "svm"):
        frames[tag]["model"] = tag
    predicted_labels.extend(frames[tag] for tag in ("ft", "ft_pt", "svm"))

  0%|          | 0/38 [00:00<?, ?it/s]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [90]:
# One row per (label, model) pair, largest datasets first.
combined = pd.concat(predicted_labels)
results_df = combined.sort_values("n", ascending=False)
results_df = results_df.reset_index(drop=True)

results_df

Unnamed: 0,label,n,accuracy,precision,recall,model
0,kind/bug,3736,0.527573,0.525876,0.575949,ft
1,kind/bug,3736,0.520075,0.520516,0.553889,svm
2,kind/bug,3736,0.598509,0.685851,0.384167,ft_pt
3,kind/test-flake,2170,0.754378,0.743527,0.793650,ft
4,kind/test-flake,2170,0.773272,0.769515,0.782208,svm
...,...,...,...,...,...,...
109,kind/feature,110,0.736364,0.704456,0.796667,ft_pt
110,kind/feature,110,0.572727,0.620000,0.534603,svm
111,component/containers,104,0.508571,0.581313,0.789091,ft
112,component/containers,104,0.598095,0.607121,0.569167,ft_pt


Results can be expanded and read below.

In [91]:
# Let pandas print up to 114 rows (38 labels x 3 models) without truncating.
pd.options.display.max_rows = 114

results_df

Unnamed: 0,label,n,accuracy,precision,recall,model
0,kind/bug,3736,0.527573,0.525876,0.575949,ft
1,kind/bug,3736,0.520075,0.520516,0.553889,svm
2,kind/bug,3736,0.598509,0.685851,0.384167,ft_pt
3,kind/test-flake,2170,0.754378,0.743527,0.79365,ft
4,kind/test-flake,2170,0.773272,0.769515,0.782208,svm
5,kind/test-flake,2170,0.811982,0.816215,0.806374,ft_pt
6,component/cli,1746,0.620815,0.63158,0.606615,ft
7,component/cli,1746,0.638579,0.627236,0.682525,svm
8,component/cli,1746,0.727951,0.715331,0.755554,ft_pt
9,kind/question,1496,0.596899,0.638253,0.446936,ft


Some accuracies barely budged, some went up, and some went down. On average, I would say pre-training did help boost most of the accuracies, sometimes in a pretty big way, but not enough to make it worth implementing the very large model.