# Install dependencies

In [None]:
!pip3 install \
  transformers \
  sentence-transformers \
  pinecone-client \
  datasets \
  accelerate \
  einops \
  langchain \
  xformers \
  bitsandbytes \
  langchain-community \
  pinecone

In [None]:
# Install SPLADE straight from its GitHub repository; the sparse-vector cell
# imports `Splade` from `splade.models.transformer_rep`.
!pip3 install git+https://github.com/naver/splade.git

# Initialize Hugging Face (HF) embedding pipeline

In [None]:
from torch import cuda
# Import from `langchain_community` (installed in the dependency cell) rather than
# the deprecated `langchain.embeddings.huggingface` shim, which newer LangChain
# releases no longer ship.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used for dense embeddings.
embed_model_id = 'sentence-transformers/msmarco-bert-base-dot-v5'

# Use the currently selected CUDA device when one is available, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# LangChain wrapper around the embedding model; the same device is used for
# loading the model and for encoding, with texts encoded 32 at a time.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

# Building the Pinecone index

In [None]:
from pinecone import Pinecone, ServerlessSpec
import os

from getpass import getpass

# Never hard-code credentials in the notebook: read the key from the environment
# and fall back to an interactive prompt (nothing is echoed or stored in the file).
pc = Pinecone(
    api_key=os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')
)
index_name = 'bigvul-single-ctx'

# Only create the index when it does not exist yet, so that re-running this cell
# (e.g. after a kernel restart) does not fail on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must match the dense embedding size written by the upsert cells
        # (768 here; NOTE(review): confirm against the dense model's dimension).
        dimension=768,
        # Dot-product metric; the notebook stores dense + sparse values together.
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Initialize Pinecone index

In [None]:
# Open a handle to the index by name; the stats call is the cell's last
# expression, so its result is displayed as the cell output.
index = pc.Index(index_name)
index.describe_index_stats()

# Load data

In [None]:
import pandas as pd
import uuid

# Read the CSV dataset and replace every missing value with an empty string
df = pd.read_csv('data.csv').fillna('')

# Create vector db context

In [None]:
def create_combined_string(row):
    """Render one dataset row as a single newline-separated context string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            'CVE ID', 'CWE ID', 'Summary', 'func_before' and 'func_after'.

    Returns:
        A string with five labelled lines, in the order listed above.
    """
    sections = [
        f"Vulnerability: {row['CVE ID']}",
        f"Weakness: {row['CWE ID']}",
        f"Vulnerability Summary: {row['Summary']}",
        # No space after the colon here; kept exactly as in the existing format.
        f"Vulnerable Function:{row['func_before']}",
        f"Vulnerable Function Fix: {row['func_after']}",
    ]
    return "\n".join(sections)

In [None]:
# Build the combined context string for every row, then wrap each string in a
# record carrying a freshly generated random UUID as its id.
df['Data'] = df.apply(create_combined_string, axis=1)
data = [
    {'id': str(uuid.uuid4()), 'context': context}
    for context in df['Data']
]

# Dense Vectors

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Default to the CPU and switch to CUDA only when it is available.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
else:
    # Warn up front: encoding the whole dataset on CPU is slow.
    print(
        "==========\n"
        "WARNING: You are not running on GPU so this may be slow.\n"
        "If on Google Colab, go to top menu > Runtime > Change "
        "runtime type > Hardware accelerator > 'GPU' and rerun "
        "the notebook.\n=========="
    )

# Sentence-transformers model that produces the dense vectors.
dense_model = SentenceTransformer('msmarco-bert-base-dot-v5', device=device)

# Size of the dense embeddings produced by the model.
dim = dense_model.get_sentence_embedding_dimension()

# Sparse Vectors

In [None]:
from transformers import AutoTokenizer
from splade.models.transformer_rep import Splade

# Checkpoint shared by the SPLADE model and its tokenizer.
sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# Build the SPLADE model with 'max' aggregation, move it to the selected device
# and switch it to evaluation mode.
sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to(device)  
sparse_model.eval()

# Tokenizer loaded from the same checkpoint as the sparse model.
tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

# Index entire dataset

# Builder

In [None]:
from pinecone import Pinecone


def builder(records: list):
    """Turn a batch of records into upsert payloads with dense and sparse values.

    Args:
        records: List of dicts, each with an 'id' and a 'context' string.

    Returns:
        A list with one dict per record, holding 'id', 'values' (dense vector as
        a list), 'sparse_values' ({'indices': [...], 'values': [...]}) and
        'metadata' ({'context': <original text>}). Empty input yields [].
    """
    if not records:
        return []

    ids = [x['id'] for x in records]
    contexts = [x['context'] for x in records]

    # Dense embeddings for the whole batch.
    dense_vecs = dense_model.encode(contexts).tolist()

    # Sparse (SPLADE) representations for the whole batch.
    input_ids = tokenizer(
        contexts, return_tensors='pt',
        padding=True, truncation=True
    )
    with torch.no_grad():
        sparse_vecs = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep']

    # Keep one row per record. A bare .squeeze() would also drop the batch
    # dimension when the batch holds a single record (e.g. the last batch),
    # which breaks the per-record iteration below.
    sparse_vecs = sparse_vecs.reshape(len(records), -1).cpu()

    upserts = []
    for _id, dense_vec, sparse_vec, context in zip(ids, dense_vecs, sparse_vecs, contexts):
        # as_tuple=True always gives a 1-D index tensor, so a vector with exactly
        # one non-zero entry still produces lists rather than bare scalars.
        indices = sparse_vec.nonzero(as_tuple=True)[0]
        upserts.append({
            'id': _id,
            'values': dense_vec,
            'sparse_values': {
                "indices": indices.tolist(),
                "values": sparse_vec[indices].tolist()
            },
            'metadata': {'context': context}
        })
    return upserts

# Upsert chunks

In [None]:
from tqdm.auto import tqdm

# Number of records embedded and upserted per request.
batch_size = 32

# Walk the dataset in fixed-size batches. Slicing past the end of a list is
# clamped automatically, so the final (possibly shorter) batch needs no special
# handling.
for i in tqdm(range(0, len(data), batch_size)):
    batch = data[i:i + batch_size]
    index.upsert(builder(batch))

# Querying

In [None]:
def encode(text: str):
    """Encode a query string into a dense vector and a sparse vector.

    Args:
        text: The query text.

    Returns:
        A tuple ``(dense_vec, sparse_dict)`` where ``dense_vec`` is the dense
        embedding as a list and ``sparse_dict`` is
        ``{"indices": [...], "values": [...]}`` for the non-zero sparse entries.
    """
    dense_vec = dense_model.encode(text).tolist()

    # Truncate over-long queries, consistent with how documents are tokenized
    # when the index is built.
    input_ids = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        sparse_vec = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep']

    # Flatten the single-query output to one 1-D vector.
    sparse_vec = sparse_vec.reshape(-1).cpu()

    # as_tuple=True always gives a 1-D index tensor, so a query with exactly one
    # non-zero entry still produces lists rather than bare scalars.
    indices = sparse_vec.nonzero(as_tuple=True)[0]
    sparse_dict = {
        "indices": indices.tolist(),
        "values": sparse_vec[indices].tolist()
    }

    return dense_vec, sparse_dict

In [None]:
# Example hybrid query: encode the text into dense + sparse vectors and fetch
# the five best matches together with their stored metadata.
query = "Vulnerability: CVE-2014-3173 and Weakness: CWE-119"

dense, sparse = encode(query)

xc = index.query(vector=dense, sparse_vector=sparse, top_k=5, include_metadata=True)