In [5]:
from pymilvus import MilvusClient
from llm import StarEncoder, CodeBERT
import torch
from llm2vec import LLM2Vec
from huggingface_hub import login
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from evaluate import evaluate

# Database connection and setup

In [7]:
client = MilvusClient(uri="http://localhost:19530", user="root", password="Milvus")

In [10]:
models = ["bert", "starcoder"]
for current_model in models:
    collection_name = current_model

    client.create_collection(
        collection_name=collection_name,
        dimension=768,
        auto_id=True,
    )
client.create_collection(
    collection_name="llama",
    dimension=2048,
    auto_id=True,
)

# Refresh databases

In [5]:
models = ["bert", "starcoder"]
for current_model in models:
    collection_name = current_model
    client.drop_collection(
        collection_name=collection_name,
    )
    client.create_collection(
        collection_name=collection_name,
        dimension=768,
        auto_id=True,
    )

client.drop_collection(
    collection_name=collection_name,
)
client.create_collection(
    collection_name="llama",
    dimension=2048,
    auto_id=True,
)

# Evaluate

In [13]:
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

EVAL_CONFIGS = {
    "starencoder": {
        "max_input_len": 10000,
        "maximum_token_len": 1024,
        "device": DEVICE,
    },
    "codebert": {
        "max_input_len": 10000,
        "maximum_token_len": 512,
        "device": DEVICE,
    },
}

In [8]:
dataset = load_dataset("C:\\CSN")

In [33]:
bert = CodeBERT(**EVAL_CONFIGS.get("codebert"))

In [36]:
evaluate(bert, client, dataset, "bert")
print("done")

  4%|▎         | 1000/26909 [12:47<5:31:26,  1.30it/s]

done





In [6]:
starencoder = StarEncoder(**EVAL_CONFIGS.get("starencoder"))

# Enter CSN-Dataset into Milvus

In [10]:
evaluate(starencoder, client, dataset, "starcoder")
print("done")

  0%|          | 0/26909 [00:00<?, ?it/s]

  4%|▎         | 1000/26909 [07:00<3:01:35,  2.38it/s]


insert complete


  4%|▎         | 1000/26909 [04:54<2:07:22,  3.39it/s]

Search complete
0.36581221498550975
done





In [2]:
from llm2vec import LLM2Vec

import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import PeftModel

# Loading base Mistral model, along with custom code that enables bidirectional connections in decoder-only LLMs. MNTP LoRA weights are merged into the base model.
tokenizer = AutoTokenizer.from_pretrained("McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp")
config = AutoConfig.from_pretrained(
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp", trust_remote_code=True
)
llama = AutoModel.from_pretrained(
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
    trust_remote_code=True,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
)
llama = PeftModel.from_pretrained(
    llama,
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
)
llama = llama.merge_and_unload()  # This can take several minutes on cpu

# Loading supervised model. This loads the trained LoRA weights on top of MNTP model. Hence the final weights are -- Base model + MNTP (LoRA) + supervised (LoRA).
llama = PeftModel.from_pretrained(
    llama, "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised"
)

# Wrapper for encoding and pooling operations
l2v = LLM2Vec(llama, tokenizer, pooling_mode="mean", max_length=512)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Some weights of the model checkpoint at princeton-nlp/Sheared-LLaMA-1.3B were not used when initializing LlamaEncoderModel: ['lm_head.weight']
- This IS expected if you are initializing LlamaEncoderModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaEncoderModel from the checkpoint of a model that you expect to be exactly identical (init

In [11]:
evaluate(l2v, client, dataset, "llama", normalize=True)
print("done")

  0%|          | 26/26909 [09:56<171:21:16, 22.95s/it]


KeyboardInterrupt: 

In [25]:
from transformers import AutoModel, AutoTokenizer


class T5Model:
    def __init__(self):
        checkpoint = "Salesforce/codet5p-110m-embedding"

        self.tokenizer = AutoTokenizer.from_pretrained(
            checkpoint, trust_remote_code=True
        )
        self.model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(
            DEVICE
        )

    def encode(self, input, **kwargs):
        inputs = self.tokenizer.encode(input[0], return_tensors="pt").to(DEVICE)
        return self.model(inputs)


# Dimension of the embedding: 256, with norm=1.0
t5 = T5Model()

embedding = t5.encode(["test"])
print(embedding)

tensor([[-0.0631, -0.0606, -0.1400, -0.1095, -0.1198, -0.0427, -0.0992,  0.0744,
         -0.0153, -0.0794,  0.0794, -0.0768, -0.0969, -0.0132,  0.1037,  0.0398,
          0.0278, -0.0455, -0.0083,  0.1054, -0.0510, -0.0771,  0.0448,  0.0434,
         -0.0634, -0.0336, -0.1503, -0.0752, -0.0279,  0.0346, -0.0009, -0.1264,
         -0.0031, -0.0578, -0.1237, -0.0370,  0.1132,  0.0315,  0.0004,  0.0976,
         -0.1366, -0.1307,  0.0403, -0.0684,  0.0225, -0.0048, -0.0428, -0.0904,
         -0.1097, -0.0945, -0.0162,  0.0425, -0.0623, -0.0406,  0.0217, -0.0119,
          0.0549, -0.0464,  0.0322, -0.0136,  0.0578, -0.0714, -0.0087,  0.0203,
         -0.0264,  0.0559, -0.0647, -0.0505,  0.0594, -0.1144,  0.0190,  0.0210,
          0.0238, -0.0238, -0.0548,  0.0379, -0.1142, -0.0304,  0.0314, -0.0958,
         -0.0453, -0.0317,  0.0392, -0.0827, -0.0095, -0.0157,  0.0982,  0.0632,
          0.0061, -0.0097,  0.0488, -0.0624,  0.2343, -0.1506, -0.0135,  0.1040,
          0.0195,  0.0477,  

In [15]:
client.create_collection(
    collection_name="t5",
    dimension=256,
    auto_id=True,
)

In [26]:
evaluate(t5, client, dataset, "t5")

  0%|          | 2/26909 [00:01<3:32:32,  2.11it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (715 > 512). Running this sequence through the model will result in indexing errors
  1%|          | 228/26909 [03:53<8:40:30,  1.17s/it] [93m[describe_collection] retry:4, cost: 0.27s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: Connection refused>[0m
[93m[describe_collection] retry:5, cost: 0.81s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: Connection refused>[0m
[93m[describe_collection] retry:6, cost: 2.43s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: Connection refused>[0m
[93m[describe_collection] retry:7, cost: 3.00s, reason: <_MultiThre

KeyboardInterrupt: 