In [7]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, evaluation, losses, InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from datetime import datetime
from collections import defaultdict
from torch.utils.data import IterableDataset
from tqdm.notebook import tqdm

import numpy as np

import json
import time
import torch
import os
import logging

In [2]:
train_batch_size = 256
model_name = 'cross-encoder/ms-marco-TinyBERT-L-6'
model_save_path = 'models/crenc-exp1'

In [5]:
def get_triplets(Passage_dict):
    triplets = []
    for k, v in Passage_dict.items():
        for x in v[0]:
            for y in v[1]:
                triplets.append([k, x, y])

    return triplets

def get_dataset(triplets, corpus):
    dataset = []        
    for triplet in triplets:
        qid, pos_id, neg_id = triplet
        
        qid = str(qid)
        pos_id = str(pos_id)
        neg_id = str(neg_id)

        query_text = corpus[qid]
        pos_text = corpus[pos_id]
        neg_text = corpus[neg_id]

        pos_instance = InputExample(texts=[query_text, pos_text],label=1)
        neg_instance = InputExample(texts=[query_text, neg_text],label=0)

        dataset.append(pos_instance)
        dataset.append(neg_instance)

    return dataset


with open('./data/generated/train_passage.json', 'r') as f:
    train_passage = json.load(f)

with open('./data/generated/train_corpus.json', 'r') as f:
    train_corpus = json.load(f)

with open('./data/generated/val_passage.json', 'r') as f:
    val_passage = json.load(f)

with open('./data/generated/val_corpus.json', 'r') as f:
    val_corpus = json.load(f)

train_triplets = get_triplets(train_passage)
train_dataset = get_dataset(train_triplets, train_corpus)

val_triplets = get_triplets(val_passage)
val_dataset = get_dataset(val_triplets, val_corpus)

In [4]:
logging.basicConfig(
    format='- %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    handlers=[LoggingHandler()]
)

model = CrossEncoder(model_name)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
evaluator = CEBinaryClassificationEvaluator.from_input_examples(val_dataset, name='cross_encoder_val')

- Use pytorch device: cuda


In [5]:
warmup_steps = int(len(train_dataloader) * 5 * 0.1)

model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=2,
    evaluation_steps=int(len(train_dataloader) / 2),
    warmup_steps=warmup_steps,
    save_best_model=True,
    output_path=model_save_path
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3122 [00:00<?, ?it/s]

- CEBinaryClassificationEvaluator: Evaluating the model on cross_encoder_val dataset in epoch 0 after 1561 steps:
- Accuracy:           74.08	(Threshold: 0.5425)
- F1:                 76.21	(Threshold: 0.2190)
- Precision:          67.27
- Recall:             87.90
- Average Precision:  83.74

- Save model to models/crenc-exp1
- CEBinaryClassificationEvaluator: Evaluating the model on cross_encoder_val dataset in epoch 0 after 3122 steps:
- Accuracy:           73.26	(Threshold: 0.0613)
- F1:                 76.00	(Threshold: 0.0065)
- Precision:          67.16
- Recall:             87.52
- Average Precision:  82.89

- CEBinaryClassificationEvaluator: Evaluating the model on cross_encoder_val dataset after epoch 0:
- Accuracy:           73.26	(Threshold: 0.0613)
- F1:                 76.00	(Threshold: 0.0065)
- Precision:          67.16
- Recall:             87.52
- Average Precision:  82.89



Iteration:   0%|          | 0/3122 [00:00<?, ?it/s]

- CEBinaryClassificationEvaluator: Evaluating the model on cross_encoder_val dataset in epoch 1 after 1561 steps:
- Accuracy:           73.53	(Threshold: 0.0043)
- F1:                 75.99	(Threshold: 0.0012)
- Precision:          68.92
- Recall:             84.67
- Average Precision:  82.23

- CEBinaryClassificationEvaluator: Evaluating the model on cross_encoder_val dataset in epoch 1 after 3122 steps:
- Accuracy:           73.53	(Threshold: 0.0021)
- F1:                 76.10	(Threshold: 0.0003)
- Precision:          67.53
- Recall:             87.17
- Average Precision:  82.01

- CEBinaryClassificationEvaluator: Evaluating the model on cross_encoder_val dataset after epoch 1:
- Accuracy:           73.53	(Threshold: 0.0021)
- F1:                 76.10	(Threshold: 0.0003)
- Precision:          67.53
- Recall:             87.17
- Average Precision:  82.01



In [7]:
del model
torch.cuda.empty_cache()

In [3]:
model = CrossEncoder('./models/crenc-exp1/')

In [6]:
positives = []
negatives = []

for triplet in val_triplets:
    query = val_corpus[triplet[0]]
    pos = val_corpus[str(triplet[1])]
    neg = val_corpus[str(triplet[2])]

    positives.append([query, pos])
    negatives.append([query, neg])

positive_scores = model.predict(positives)
negative_scores = model.predict(negatives)

In [11]:
positive_out = np.where(positive_scores < 0.5)[0]
negative_out = np.where(negative_scores >= 0.5)[0]

# sample some bad positive samples
for idx in np.random.choice(positive_out, 10, replace=False):
    score = positive_scores[idx]
    query = val_corpus[val_triplets[idx][0]]
    text = val_corpus[str(val_triplets[idx][1])]

    print(f'Query: {query}\nText: {text}\nScore:{score:.4f}\n')

Query: deployment of the pod in the aws kops cluster
Text: arkade: set up service as a LoadBalancer
Score:0.0356

Query: DockerFile and environment variable
Text: symfony can't install assets the first time
Score:0.2428

Query: Problem in installing nvidia-docker in windows 10 system
Text: Keras Model stops training without indication as to why and how to enable GPU-acceleration
Score:0.0137

Query: How do you print to console from a docker file during build?
Text: Dockerfile for NGINX Web server
Score:0.3939

Query: Docker container to bring up DB fails with connection refused error
Text: Saving database state from library/postgres
Score:0.3410

Query: Kubernetes to find Pod IP from another Pod
Text: Unable to install Spinnaker via Helm
Score:0.0768

Query: Mount volumes with Secrets using Python Kubernetes API
Text: Airflow pull docker image from private google container repository
Score:0.0093

Query: Unable to send messages to port-forwarded Kafka pod
Text: How can I use ingress-ng

In [12]:
# sample some bad negative samples
for idx in np.random.choice(negative_out, 10, replace=False):
    score = negative_scores[idx]
    query = val_corpus[val_triplets[idx][0]]
    text = val_corpus[str(val_triplets[idx][2])]

    print(f'Query: {query}\nText: {text}\nScore:{score:.4f}\n')

Query: Distributed JMeter in Kubernetes increase heap size
Text: Kubernetes Scheduler Extenders - when are they invoked?
Score:0.7508

Query: Can I map gpu drivers of host machine (windows) inside docker container?
Text: Docker mysql environment
Score:0.8025

Query: Unable to connect docker nginx with docker ubuntu
Text: Getting 'didn't match node selector' when running Docker Windows container in Azure AKS
Score:0.5311

Query: Docker - Failing to get PGP Keys
Text: What is the use for CRD status?
Score:0.5260

Query: Running tests for .NET Core in Docker during local dev
Text: What is the practical purpose of VOLUME in Dockerfile?
Score:0.7636

Query: Dask - Kubernetes - Tutorial example
Text: How to deploy frontends on kubernetes and how this can work with AWS Cloudfront?
Score:0.6133

Query: Production ready Kubernetes redis
Text: Kubernetes Prometheus metric for HPA (horizontal pod autoscaler) `currentCPUUtilizationPercentage`?
Score:0.8053

Query: Kubernetes monitoring and self-he

In [14]:
positive_good = np.where(positive_scores > 0.8)[0]
negative_good = np.where(negative_scores < 0.2)[0]

# sample some good positive samples
for idx in np.random.choice(positive_good, 10, replace=False):
    score = positive_scores[idx]
    query = val_corpus[val_triplets[idx][0]]
    text = val_corpus[str(val_triplets[idx][1])]

    print(f'Query: {query}\nText: {text}\nScore:{score:.4f}\n')

Query: Kubernetes Internal Ingress
Text: How can I use ingress-nginx via Helm on custom k8s install without LoadBalancer support?
Score:0.9078

Query: Best practices for values in global section of helm values.yaml
Text: How to pass extra configuration to RabbitMQ with Helm?
Score:0.9927

Query: Run Postgres migration with Docker/Docker compose
Text: Why doesn't postgres official docker repo start db service at build time?
Score:0.9922

Query: AWS MWAA (Managed Apache Airflow); Programmatically enable DAGs
Text: Create user with LDAP authentification in airflow 2.1.4
Score:0.9923

Query: Can't change kafka broker-id in Incubator Helm chart?
Text: Setup Ingress whith Helm using the chart stable/nginx-ingress
Score:0.9109

Query: Whats exactly the difference between a programming language only image and a OS plus programming language docker image?
Text: Best way to build a Docker image
Score:0.9492

Query: How does Kubernetes know on which node to schedule its POD when PVs are backed by 

In [15]:
# sample some good negative samples
for idx in np.random.choice(negative_good, 10, replace=False):
    score = negative_scores[idx]
    query = val_corpus[val_triplets[idx][0]]
    text = val_corpus[str(val_triplets[idx][1])]

    print(f'Query: {query}\nText: {text}\nScore:{score:.4f}\n')

Query: Amazon Web Services: NoCredentialsError: Unable to locate credentials
Text: Serverless NodeJS / Native node_modules
Score:0.0041

Query: Configure prometheus to collect custom metrics from dockerized nodejs pod
Text: Kubernetes Internal Ingress
Score:0.0087

Query: the tensorflow docker gpu image doesn't detect my GPU
Text: Keras Model stops training without indication as to why and how to enable GPU-acceleration
Score:0.0190

Query: Delete helm chart from *registry* (not uninstall from cluster; not repository)
Text: Push existing image to another registry (without mounting docker.sock or using docker:dind)
Score:0.0035

Query: Why won't docker generate a password when creating a new mysql container?
Score:0.0071

Query: Docker mysql official image
Text: Docker mysql environment
Score:0.0057

Query: Can't access Prometheus Postgres exporter installed in kubernetes: connection refused
Text: How to isolate data of one persistent volume claim from another
Score:0.1147

Query: How t