In [99]:
from google.cloud import bigquery
from google.oauth2 import service_account
import re
import datetime
import numpy as np
import os
from nltk.tokenize import word_tokenize
import nltk
from gensim.models import Word2Vec
from dotenv import find_dotenv, load_dotenv
import boto3
import sys
import pandas as pd
from model_classes import FtModel, SVM
from sklearn.model_selection import KFold

vocab_path = "../src/data"
if vocab_path not in sys.path:
    sys.path.insert(1, vocab_path)

from preprocess import preprocess, process
from w2v_preprocess import is_punc # noqa

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/atersaak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [58]:
name = os.getenv("REPO_NAME")

if "/" in name:
    REPO = name
    USER = ""
else:
    USER = name
    REPO = ""

In [9]:
load_dotenv(find_dotenv())

True

In [10]:
use_ceph = True

In [11]:
# whether to use ceph or store locally

use_ceph = True

if use_ceph:
    s3_endpoint_url = os.environ["OBJECT_STORAGE_ENDPOINT_URL"]
    s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
    s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
    s3_bucket = os.environ["OBJECT_STORAGE_BUCKET_NAME"]

    s3 = boto3.client(
        service_name="s3",
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
        endpoint_url=s3_endpoint_url,
    )

In [12]:
pattern = re.compile('github-labeler/w2v/.*')

buck = boto3.resource(
    service_name="s3",
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
    endpoint_url=s3_endpoint_url,
)

keys = []

for obj in buck.Bucket(s3_bucket).objects.all():
    if pattern.match(obj.key):
        keys.append(obj.key)

keys = [os.path.basename(key) for key in keys]

In [13]:
if use_ceph:
    for key in keys:
        response = s3.get_object(
            Bucket=s3_bucket,
            Key=f"github-labeler/w2v/{key}",
        )
        with open(f'../models/{key}' ,'wb') as f:
            for i in response['Body']:
                f.write(i)

w = Word2Vec.load('../models/w2v.model')

In [31]:
vocabulary = w.wv

In [29]:
vocabulary['slime']

array([ 9.23742183e-01,  2.32883527e-01,  2.37421694e-01,  1.72353471e-01,
       -2.18682564e-01,  1.82947338e-01, -2.46382981e-01,  1.44234826e-01,
       -2.99687221e-02,  4.11167992e-01, -8.03297702e-02, -1.31971191e-01,
        1.72190587e-01,  1.18970575e-01, -1.10150887e-01,  3.69050753e-01,
        3.36970819e-01,  3.11997259e-01,  3.05851019e-01,  2.47959185e-01,
        5.52929727e-02,  2.80811273e-04,  1.24124149e-01, -2.01001166e-01,
       -1.70063632e-02, -6.75225474e-02, -4.99156970e-02,  8.20819799e-02,
       -7.90059572e-02,  1.39661219e-01,  4.87351409e-01,  1.42484934e-01,
        1.62602014e-02, -3.32129484e-01,  2.60693022e-02, -6.16326118e-02,
       -3.24902504e-01,  9.79855351e-02, -9.46882363e-02,  7.94708150e-03,
        7.16395413e-02,  6.90607377e-02,  8.12420606e-02,  3.74224111e-02,
       -2.58287327e-02, -1.12986857e-01,  2.76075428e-01, -2.21995813e-01,
       -1.74412349e-01,  1.83697060e-01, -1.00397553e-01,  3.63611964e-01,
        1.79405002e-01,  

In [43]:
from tqdm.notebook import tqdm

In [117]:
vec_size = w.wv.vector_size

In [138]:
full_vocab = w.wv.vocab

with open('vocab.vec', 'w', encoding = 'utf-8') as f:
    f.write(str(len(full_vocab)) + ' ' + str(vec_size) + '\n')
    for word in tqdm(full_vocab):
        vector = w.wv[word]
        vector = [str(v) for v in vector]
        f.write(word + ' ' + ' '.join(vector))
        f.write('\n')

full_vocab = set(full_vocab)

  0%|          | 0/386705 [00:00<?, ?it/s]

In [53]:
from skift import FirstColFtClassifier

In [54]:
def in_set(word):
    """Check if the word is in our set."""
    if word in full_vocab:
        return word
    else:
        return '_unknown_'

In [118]:
class FtModel_Pretrained(FirstColFtClassifier):
    """This model is written over the skift column first classifier."""

    def __init__(self, path=""):
        """Initialize the model."""
        if not path:
            super().__init__(pretrainedVectors = 'vocab.vec', dim = vec_size)
        else:
            model = fasttext.load_model(path)
            setattr(self, "model", model)

    def preprocess(self, x):
        """Preprocess the text from a dataframe with processed column."""
        ret = x.title.fillna('') + ' SEP ' + x.body.fillna('')
        ret = ret.apply(preprocess)
        ret = ret.apply(lambda x: x.lower())
        ret = ret.apply(word_tokenize).values
        ret = [[word for word in issue if not is_punc(word)] for issue in ret]
        ret = [[in_set(w) for w in issue] for issue in ret]
        ret = [' '.join(issue) for issue in ret]
        return ret

    def fit(self, x, y):
        """Fit the model."""
        input_ = np.array(self.preprocess(x)).reshape(-1, 1)
        super().fit(input_, y)

    def predict(self, x):
        """Predict the output."""
        input_ = self.preprocess(x)
        return super().predict(input_)

    def save(self, path):
        """Save the model."""
        return self.model.save_model(path)

    def inference(self, title, body):
        """Inference for the app."""
        input_ = np.array(process(title, body)).reshape(1, -1)
        pred = super().predict(input_)
        return pred[0]

In [61]:
savename = USER if USER else REPO.replace("/", "-_-")
path = os.path.join("../data", savename + ".csv")
key = f"github-labeler/data/{savename}.csv"

if use_ceph:
    response = s3.get_object(Bucket=s3_bucket, Key=key)
    issues_df = pd.read_csv(response.get("Body")).drop_duplicates()
else:
    issues_df = pd.read_csv(path).drop_duplicates()

In [74]:
samp = issues_df.sample(10)

In [95]:
def get_subdataset(label):
    """
    pass in a label name and get back a dataframe of positive & negative samples for the label
    we avoid taking unlabelled data as negative samples
    negative samples are distributed evenly amongst the other labels
    """
    labelled = issues_df[~issues_df.labels.isna()]
    final_labels_ = final_labels.copy()
    final_labels_.remove(label)
    pos_samples = labelled[labelled.labels.apply(lambda x: label in x.split("\t"))]
    n = len(pos_samples)
    remaining = labelled[labelled.id.apply(lambda x: label not in x.split("\t"))]
    n_neg = 0
    per_label = n // len(final_labels_)
    neg_ids = set()
    # evenly sample if we can
    # if not enough samples for a label, throw them all in and increase the remaining amount we need per label
    for i, lbl in enumerate(reversed(final_labels_)):
        neg_samples = remaining[remaining.labels.apply(lambda x: lbl in x.split("\t"))]
        if len(neg_samples) >= per_label:
            neg_samples = neg_samples.sample(per_label)
            n_neg += per_label
        else:
            n_neg += len(neg_samples)
            if i != len(final_labels_) - 1:
                per_label = (n - n_neg) // (len(final_labels_) - i - 1)
        remaining = remaining[
            remaining.labels.apply(lambda x: lbl not in x.split("\t"))
        ]
        neg_ids = neg_ids.union(set(neg_samples.id))
    # fill in potential gap with unlabelled issues, if needed
    if len(pos_samples) > len(neg_ids):
        unlabelled = issues_df.query("num_labels == 0")
        if len(unlabelled) > len(pos_samples) - len(neg_ids):
            neg_ids = neg_ids.union(
                set(unlabelled.sample(len(pos_samples) - len(neg_ids)).id)
            )
        else:
            neg_ids = neg_ids.union(set(unlabelled.id))
            pos_samples = pos_samples.sample(len(neg_ids))
    final_neg_samples = issues_df[issues_df.id.apply(lambda x: x in neg_ids)]
    x = pd.concat((final_neg_samples, pos_samples))
    y = np.concatenate((np.zeros(len(final_neg_samples)), np.ones(len(pos_samples))))
    return x, y.astype(int)

In [96]:
def predict_label(label, k=5, model_class=FtModel):
    """
    validates fastText model on the given label, using k-fold cross validation
    """
    x, y = get_subdataset(label)
    kf = KFold(n_splits=k, random_state=None, shuffle=True)
    accuracy = []
    precision = []
    recall = []
    for train_index, test_index in kf.split(x):
        model = model_class()
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        accuracy.append(np.mean(preds == y_test))
        precision_ = preds[preds == 1] == y_test[preds == 1]
        if len(precision_) != 0:
            precision.append(np.mean(precision_))
        recall_ = preds[y_test == 1] == y_test[y_test == 1]
        if len(recall_) != 0:
            recall.append(np.mean(recall_))
    cols = ["label", "n", "accuracy", "precision", "recall"]
    data = [label, len(x), np.mean(accuracy), np.mean(precision), np.mean(recall)]
    return pd.DataFrame([data], columns=cols)

In [100]:
issues_df["processed"] = issues_df.apply(
    lambda row: np.array(process(row["title"], row["body"])), axis=1
)

In [139]:
final_labels = ['kind/bug', 'kind/feature']

predicted_labels = []
for label in final_labels:
    pred_1 = predict_label(label, model_class=FtModel)
    pred_2 = predict_label(label, model_class=FtModel_Pretrained)
    pred_1["model"] = "ft"
    pred_2["model"] = "ft_pt"
    predicted_labels.append(pred_1)
    predicted_labels.append(pred_2)

ValueError: FastTextClassifier methods must get a two-dimensional numpy array (or castable) as the X parameter.

In [120]:
with open('vocab.vec', 'r') as f:
    o = f.readlines()

In [121]:
len(o)

386705

In [126]:
sum(len(i) for i in o)

1440609435

In [128]:
m = FtModel()

In [134]:
y = np.array([0,0,0, 1, 1, 1])

In [135]:
y

array([0, 0, 0, 1, 1, 1])

In [136]:
m.fit(issues_df.sample(6), y)

In [137]:
m.save('here')