In [1]:
import os
import logging
from transformers import logging as transformers_logging
import tensorflow as tf
from mongo_db_client import MongoDbClient
from embedding_dataset import EmbeddingDataset
import numpy as np
from embedding_generator import EmbeddingGenerator
import matplotlib.pyplot as plt
import numpy as np
from typing import Iterator, List, Literal, TypedDict, Any
from embedding_generator import EmbeddingPairBatch, EmbeddingPair
import random
import pandas as pd
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from tqdm import tqdm
import itertools

random.seed(42)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
transformers_logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
CodeSearchNetLanguage = Literal['python', 'go', 'java', 'javascript', 'php', 'ruby']
CodeSearchNetSplit = Literal['train', 'test', 'validation']

class CodeSearchNetSample(TypedDict):
  repository_name: str
  func_path_in_repository: str
  func_name: str
  whole_func_string: str
  language: CodeSearchNetLanguage
  func_code_string: str
  func_code_tokens: List[str]
  func_documentation_string: str
  func_documentation_string_tokens: List[str]
  split_name: CodeSearchNetSplit
  func_code_url: str

In [None]:
from functools import partial
from datasets import Dataset
test_dataset = load_dataset('code_search_net', 'python', split='train', streaming=True).take(2).map(lambda _, index: { "index": index }, with_indices=True)
[sample['index'] for sample in test_dataset]

# Transform IterableDataset to Dataset

In [None]:
def gen(iterable_ds):
  yield from iterable_ds

ds = Dataset.from_generator(partial(gen, test_dataset), features=test_dataset.features)
ds.save_to_disk('../datasets/test_dataset')

In [None]:
loaded_ds = Dataset.load_from_disk(dataset_path='../datasets/test_dataset/')
[sample['id'] for sample in loaded_ds]

# Crime and punish example

In [17]:
crime_ds = load_dataset('crime_and_punish', split='train[:1000]')

In [18]:
from numpy.random import default_rng


neg_samples_per_sample = 3
dataset_len = 100
indexes = np.random.randint(dataset_len-1, size=dataset_len)
full_ds = []
random_gen = default_rng(seed=42)

def remove_empty_lines(sample) -> bool:
  line: str = sample['line']
  return len(line.split()) > 0

for sample in crime_ds.filter(remove_empty_lines):
  neg_indexes = random_gen.choice(indexes, neg_samples_per_sample, replace=False)
  neg_samples = crime_ds.select(neg_indexes)
  full_ds.append({
    "positive": sample['line'],
    "negatives": [neg_sample['line'] for neg_sample in neg_samples]
  })

len(full_ds)

Filter: 100%|██████████| 1000/1000 [00:00<00:00, 228522.61 examples/s]


883

In [20]:
# Find equals
eq_count = 0
for sample in full_ds:
  positive = sample['positive']
  for neg in sample['negatives']:
    if neg == positive:
      eq_count += 1
eq_count

3

-----

# Faiss index example

In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import torch
torch.set_grad_enabled(False)
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

In [None]:
from datasets import load_dataset
ds = load_dataset('crime_and_punish', split='train[:100]')
next(iter(ds))

In [None]:
ds_with_embeddings = ds.map(lambda example: {'embeddings': ctx_encoder(**ctx_tokenizer(example["line"], return_tensors="pt"))[0][0].numpy()})

In [None]:
next(iter(ds_with_embeddings)).keys()

In [None]:
ds_with_embeddings.add_faiss_index(column='embeddings', faiss_verbose=True)

In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

question = "Is it serious ?"
question_embedding = q_encoder(**q_tokenizer(question, return_tensors="pt"))[0][0].numpy()
scores, retrieved_examples = ds_with_embeddings.get_nearest_examples('embeddings', question_embedding, k=10)

In [None]:
ds_with_embeddings.save_faiss_index('embeddings', '../datasets/faiss_index')
ds_with_embeddings.drop_index('embeddings')
ds_with_embeddings.save_to_disk('../datasets/')

In [None]:
x = Dataset.load_from_disk('../datasets/indexes.faiss/')
x.load_faiss_index('embeddings', '../datasets/faiss_index.faiss')
next(iter(x))