# FOR COLAB

In [None]:
# Import GitHub branch
!git clone -b chunk https://github.com/aycankatitas/vectorsearch-applications.git
%cd /content/vectorsearch-applications
# Unzip finetuned models 
!unzip /content/vectorsearch-applications/models
# Enable third-party widgets
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
!pip install -r requirements.txt

# Data Creation, Indexing and Reranking Pipeline

This notebook runs the full pipeline — dataset creation, Weaviate indexing, and retrieval evaluation with reranking — for either an original or a fine-tuned sentence-transformers embedding model.

In [31]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import json
import os
import time
from typing import List
from math import ceil

import pandas as pd
import numpy as np
from rich import print
from torch import cuda
from tqdm.notebook import tqdm

# Dataset Creation and Indexing
from preprocessing import FileIO
import tiktoken
from llama_index.text_splitter import SentenceSplitter
from sentence_transformers import CrossEncoder,SentenceTransformer
from weaviate_interface import WeaviateClient, WeaviateIndexer
from class_templates import impact_theory_class_properties
from pipeline import split_contents, encode_content_splits, join_metadata, create_dataset, retrieval_evaluation

# Retrieval Evaluation
from retrieval_evaluation import calc_hit_rate_scores, calc_mrr_scores, record_results, add_params, execute_evaluation
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from weaviate_interface import WeaviateClient
from getpass import getpass
import openai

# ReRanker
from reranker import ReRanker

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set Constants

For Open Source Models

In [32]:
# --- Fixed settings: leave unchanged so results stay comparable to the golden dataset ---

# Chunk size handed to the text splitter
chunk_size = 256

# Tokenizer: the tiktoken encoding used for gpt-3.5-turbo
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Sentence-based text splitter with no overlap between chunks
splitter = SentenceSplitter(
    chunk_size=chunk_size,
    chunk_overlap=0,
    tokenizer=encoding.encode,
)

# --- Embedding model ---
# Alternative model name: "sentence-transformers/all-mpnet-base-v2"
model_name = "models/finetuned-all-MiniLM-L6-v2-300"
model = SentenceTransformer(model_name)


## Set Dataset Parameters

In [33]:
# Build the output path prefix for the dataset files:
#   <data_path>impact-theory_<short model name>_<date>
data_path = "./data/"
date = "1216"
# Use the last path component as the short model name. Indexing the split with [1]
# raises IndexError for a name without "/" and returns a directory for nested paths;
# [-1] always yields the final component, and rstrip("/") keeps it non-empty when the
# name ends with a slash.
model_name_short = model_name.rstrip("/").split("/")[-1]
outpath = f"{data_path}impact-theory_{model_name_short}_{date}"
print(outpath)

## Configure Index

In [None]:
# Name of the Weaviate class to create and query
class_name = "Impact_theory_finetunedminiLM_reranker_256"

# Values used for the vector index settings below
ef = 64
efConstruction = 128
maxConnection = 32

# Schema definition containing a single class
class_config = {
    'classes': [
        {
            "class": class_name,
            "description": "Episodes of Impact Theory up to Nov 2023",
            "vectorIndexType": "hnsw",
            # Settings specific to the vector index
            "vectorIndexConfig": {
                "ef": ef,
                "efConstruction": efConstruction,
                "maxConnections": maxConnection,
            },
            "vectorizer": "none",
            # Property mappings predefined in class_templates
            "properties": impact_theory_class_properties,
        }
    ]
}

print(class_config)

## Load Dataset

In [35]:
# Load the raw corpus from its JSON file
data = FileIO().load_json("data/impact_theory_data.json")

## Create Dataset

In [37]:
# Create the dataset from the corpus using the embedding model and the splitter;
# `outpath` is passed as the prefix for the output file.
docs = create_dataset(
    data,
    model,
    splitter,
    file_outpath_prefix=outpath,
)

Creating dataset using chunk_size: 256


  0%|          | 0/384 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Load golden dataset

In [13]:
# Read the golden evaluation dataset from its JSON file
golden_dataset = EmbeddingQAFinetuneDataset.from_json("data/golden_100.json")

# Report how many queries the golden dataset holds
print(f'Num queries in Golden Dataset: {len(golden_dataset.queries)}')

## Create Weaviate Client

In [16]:
# Credentials and endpoint are read from environment variables
# (a missing variable raises KeyError)
api_key = os.environ["WEAVIATE_API_KEY"]
url = os.environ["WEAVIATE_ENDPOINT"]
openai_key = os.environ["OPENAI_API_KEY"]

# Instantiate the client
client = WeaviateClient(
    api_key,
    url,
    model_name,
    openai_key,
)

# Check that the Weaviate instance is live and ready
client.is_live(), client.is_ready()

(True, True)

In [17]:
# Indexer used later to batch-upload the dataset through the client
indexer = WeaviateIndexer(
    client,
    batch_size=200,
    num_workers=2,
)

## Initialize ReRanker

In [None]:
# Name of the cross-encoder model used for reranking
reranker_model = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
# Reranker instance passed to the retrieval evaluation
reranker = ReRanker(model_name=reranker_model)

## Load Chunked Dataset for Weaviate

In [22]:
# Path of the chunked dataset file: <outpath>-<chunk_size>.parquet
file_path = f"{outpath}-{chunk_size}.parquet"
print(file_path)
# Load the parquet file
data_with_vectors = FileIO().load_parquet(file_path)

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


In [None]:
# Display the first element of the loaded dataset as a quick sanity check
data_with_vectors[0]

## Create Weaviate class and show its configuration

In [None]:
# Create the class defined in class_config on the Weaviate instance
client.schema.create(class_config)

# Check that the class was created by printing its configuration
print(client.show_class_config(class_name))

## Build Index on Weaviate

In [25]:
# Build the index
indexer.batch_index_data(data=data_with_vectors,
                         class_name=class_name)

100%|██████████| 26448/26448 [01:36<00:00, 274.74it/s]


Batch job completed in 1.65 minutes.
{'class': 'Impact_theory_finetunedminiLM_256', 'name': 'qSkqqkM5B2cy', 'objectCount': 26448, 'vectorIndexingStatus': 'READY', 'vectorQueueLength': 0}


## Retrieval Evaluation

We will judge the quality of the retriever and reranker on the golden dataset.

In [27]:
# Run the retrieval evaluation on the golden dataset with hybrid search and the reranker.
# NOTE(review): presumably retrieve_limit is the number of candidates retrieved and
# top_k the number kept for scoring — confirm against retrieval_evaluation.execute_evaluation.
retrieval_results = execute_evaluation(
    golden_dataset,
    class_name,
    client,
    reranker,
    alpha=0.5,
    retrieve_limit=100,
    top_k=5,
    search_type="hybrid",
    include_miss_info=True,
)

Queries:   0%|          | 0/100 [00:00<?, ?it/s]

Total Processing Time: 0.51 minutes


In [28]:
# Print the evaluation results (`print` is rich.print, imported at the top of the notebook)
print(retrieval_results)