In [2]:
from pymilvus import MilvusClient
from llm import StarEncoder, CodeBERT, T5
import torch
from llm2vec import LLM2Vec
from huggingface_hub import login
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from evaluate import evaluate
from llm2vec import LLM2Vec
from transformers import BitsAndBytesConfig

import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import PeftModel

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Setup

In [10]:
client = MilvusClient(uri="http://localhost:19530", user="root", password="Milvus")

In [3]:
EVAL_CONFIGS = {
    "starencoder": {
        "max_input_len": 10000,
        "maximum_token_len": 1024,
        "device": DEVICE,
    },
    "t5": {
        "max_input_len": 10000,
        "maximum_token_len": 512,
        "device": DEVICE,
    },
}

In [3]:
dataset = load_dataset("D:\\CSN")

In [4]:
dataset["test"][0]

{'repository_name': 'ReactiveX/RxJava',
 'func_path_in_repository': 'src/main/java/io/reactivex/internal/observers/QueueDrainObserver.java',
 'func_name': 'QueueDrainObserver.fastPathOrderedEmit',
 'whole_func_string': 'protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {\n        final Observer<? super V> observer = downstream;\n        final SimplePlainQueue<U> q = queue;\n\n        if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\n            if (q.isEmpty()) {\n                accept(observer, value);\n                if (leave(-1) == 0) {\n                    return;\n                }\n            } else {\n                q.offer(value);\n            }\n        } else {\n            q.offer(value);\n            if (!enter()) {\n                return;\n            }\n        }\n        QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);\n    }',
 'language': 'java',
 'func_code_string': 'protected final void fastPathOr

# Prepare Collections

In [12]:
collections = (
    {"name": "starcoder", "dim": 768},
    {"name": "llama", "dim": 2048},
    {"name": "t5", "dim": 256},
)


for current in collections:

    client.create_collection(
        collection_name=current["name"],
        dimension=current["dim"],
        auto_id=True,
    )

RPC error: [create_collection], <MilvusException: (code=65535, message=create duplicate collection with different parameters, collection: bert)>, <Time:{'RPC start': '2024-06-12 19:55:12.606544', 'RPC error': '2024-06-12 19:55:12.615753'}>
Failed to create collection: bert


MilvusException: <MilvusException: (code=65535, message=create duplicate collection with different parameters, collection: bert)>

# Refresh databases

In [13]:
collections = [
    {"name": "starcoder", "dim": 768},
    {"name": "llama", "dim": 2048},
    {"name": "t5", "dim": 256},
]
for current in collections:
    client.drop_collection(
        collection_name=current["name"],
    )
    client.create_collection(
        collection_name=current["name"],
        dimension=current["dim"],
        auto_id=True,
    )

# Evaluate

# StarEncoder

In [9]:
starencoder = StarEncoder(**EVAL_CONFIGS.get("starencoder"))

In [10]:
evaluate(starencoder, client, dataset, "starcoder")
print("done")

  4%|▎         | 1000/26909 [00:31<13:36, 31.73it/s]


insert complete


  4%|▎         | 1000/26909 [00:38<16:40, 25.88it/s]

Search complete
0.36582743468258516
done





# Llama-2

In [11]:
# Loading base Mistral model, along with custom code that enables bidirectional connections in decoder-only LLMs. MNTP LoRA weights are merged into the base model.
tokenizer = AutoTokenizer.from_pretrained("McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp")
config = AutoConfig.from_pretrained(
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp", trust_remote_code=True
)
llama = AutoModel.from_pretrained(
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
    trust_remote_code=True,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
)
llama = PeftModel.from_pretrained(
    llama,
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
)
llama = llama.merge_and_unload()  # This can take several minutes on cpu

# Loading supervised model. This loads the trained LoRA weights on top of MNTP model. Hence the final weights are -- Base model + MNTP (LoRA) + supervised (LoRA).
llama = PeftModel.from_pretrained(
    llama, "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised"
)

# Wrapper for encoding and pooling operations
l2v = LLM2Vec(llama, tokenizer, pooling_mode="mean", max_length=512)

Some weights of the model checkpoint at princeton-nlp/Sheared-LLaMA-1.3B were not used when initializing LlamaEncoderModel: ['lm_head.weight']
- This IS expected if you are initializing LlamaEncoderModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaEncoderModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
class LLM2VecWrapper:
    def __init__(self, model):
        self.model = model

    def encode(self, input):
        encoding = self.model.encode(input, show_progress_bar=False)
        return torch.nn.functional.normalize(encoding, p=2, dim=1).detach().numpy()

In [13]:
evaluate(LLM2VecWrapper(l2v), client, dataset, "llama")
print("done")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  4%|▎         | 1000/26909 [05:13<2:15:29,  3.19it/s]


insert complete


  4%|▎         | 1000/26909 [03:45<1:37:17,  4.44it/s]

Search complete
0.8296693368434425
done





# T5

In [14]:
t5 = T5(**EVAL_CONFIGS.get("t5"))

In [15]:
evaluate(t5, client, dataset, "t5")

  4%|▎         | 1000/26909 [00:32<14:14, 30.31it/s]


insert complete


  4%|▎         | 1000/26909 [00:40<17:31, 24.65it/s]

Search complete
0.8800751834847025



