# Install dependencies

In [None]:
!pip3 install \
  transformers \
  sentence-transformers \
  pinecone-client \
  datasets \
  accelerate \
  einops \
  langchain \
  xformers \
  bitsandbytes \
  langchain-community \
  pinecone

In [None]:
# Install the SPLADE package straight from the naver/splade GitHub repository.
!pip3 install git+https://github.com/naver/splade.git

# Initialize Hugging Face embedding model

In [None]:
from torch import cuda
# `langchain.embeddings.huggingface` is a deprecated re-export path; the class
# is provided by the `langchain-community` package installed earlier.
from langchain_community.embeddings import HuggingFaceEmbeddings

embed_model_id = 'sentence-transformers/msmarco-bert-base-dot-v5'

# Use the currently selected CUDA device when one is available, else the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# LangChain embedding wrapper around the sentence-transformers checkpoint;
# the same device is passed for model loading and for encoding.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

# Building the Pinecone index

In [None]:
from pinecone import Pinecone, ServerlessSpec
import os

# Prefer a key supplied through the PINECONE_API_KEY environment variable so
# the secret does not have to be written into the notebook; 'XXX' remains as
# the placeholder fallback.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY', 'XXX'))
index_name = 'metadata-embedding'

# Creating an index whose name already exists is rejected by Pinecone, so the
# creation is skipped when this cell is re-run against an existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Expected to equal the dense model's embedding size -- TODO confirm
        # against dense_model.get_sentence_embedding_dimension().
        dimension=768,
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Initialize Pinecone index

In [None]:
# Open a handle to the index named `index_name` and request its statistics
# (presumably to confirm the index is reachable before upserting).
index = pc.Index(index_name)
index.describe_index_stats()

# Load data

In [None]:
import pandas as pd
import uuid

# Read the CSV dataset and replace every missing value with an empty string.
df = pd.read_csv('data.csv').fillna('')

# Create vector db context

In [None]:
# One record per dataframe row: a fresh random UUID as the id, plus the
# columns that are later encoded and stored as metadata.
data = [
    {
        'id': str(uuid.uuid4()),
        'cve': entry['CVE ID'],
        'cwe': entry['CWE ID'],
        'Summary': entry['Summary'],
        'func_before': entry['func_before'],
        'func_after': entry['func_after'],
    }
    for _, entry in df.iterrows()
]

# Dense Vectors

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device != 'cuda':
    print(
        "==========\n"
        "WARNING: You are not running on GPU so this may be slow.\n"
        "If on Google Colab, go to top menu > Runtime > Change "
        "runtime type > Hardware accelerator > 'GPU' and rerun "
        "the notebook.\n=========="
    )

dense_model = SentenceTransformer('msmarco-bert-base-dot-v5', device=device)

# Length of the vectors produced by the dense model.
dim = dense_model.get_sentence_embedding_dimension()

# Sparse Vectors

In [None]:
from transformers import AutoTokenizer
from splade.models.transformer_rep import Splade

sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# The tokenizer and the SPLADE model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to(device)  # same device as the dense model
sparse_model.eval()  # evaluation mode: the model is only used for inference here

# Index entire dataset

# Builder

In [None]:
from collections import defaultdict

def encode_field(text):
    """Encode a single text field with both the dense and the sparse model.

    Args:
        text: The string to embed.

    Returns:
        A ``(dense_vec, sparse_dict)`` tuple. ``dense_vec`` is the dense
        embedding as a list of floats. ``sparse_dict`` is
        ``{"indices": [...], "values": [...]}`` holding the positions and
        values of the non-zero entries of the sparse representation; both
        entries are always lists, including when exactly one entry is
        non-zero.
    """
    dense_vec = dense_model.encode([text]).tolist()[0]
    input_ids = tokenizer([text], return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        sparse_vec = sparse_model(d_kwargs=input_ids.to(device))['d_rep'].squeeze()
    # sparse_vec is expected to be 1-D here, so nonzero() yields a (k, 1)
    # tensor. Dropping only dim 1 keeps a 1-D result for every k; a bare
    # squeeze() would collapse k == 1 to a scalar and tolist() would then
    # return a plain number instead of a one-element list.
    nonzero_idx = sparse_vec.nonzero().squeeze(1)
    indices = nonzero_idx.cpu().tolist()
    values = sparse_vec[nonzero_idx].cpu().tolist()
    sparse_dict = {"indices": indices, "values": values}
    return dense_vec, sparse_dict

def encode_metadata(row):
    """Build one dense and one sparse vector for a whole record.

    The ``cve``, ``cwe``, ``Summary``, ``func_before`` and ``func_after``
    fields of ``row`` are each encoded with ``encode_field``. The dense
    vectors are averaged element-wise so the result keeps the size of a
    single embedding; the sparse vectors are merged by summing the values
    that share an index.

    Args:
        row: Mapping that provides the five fields listed above.

    Returns:
        ``(averaged_dense_vec, sparse)`` where ``averaged_dense_vec`` is a
        list of floats and ``sparse`` is
        ``{"indices": [...], "values": [...]}``.
    """
    field_names = ('cve', 'cwe', 'Summary', 'func_before', 'func_after')
    encoded = [encode_field(row[name]) for name in field_names]

    # Element-wise mean over the per-field dense vectors.
    dense_vecs = [dense for dense, _ in encoded]
    count = len(dense_vecs)
    averaged_dense_vec = [sum(column) / count for column in zip(*dense_vecs)]

    # Accumulate sparse weights per index; an index seen in several fields
    # ends up with the sum of its values.
    merged = defaultdict(float)
    for _, sparse in encoded:
        for idx, val in zip(sparse['indices'], sparse['values']):
            merged[idx] += val

    return averaged_dense_vec, {
        "indices": list(merged),
        "values": list(merged.values()),
    }


def builder(records: list):
    """Convert records into the list of dicts handed to ``index.upsert``.

    Args:
        records: Records providing ``id`` plus the ``cve``, ``cwe``,
            ``Summary``, ``func_before`` and ``func_after`` fields.

    Returns:
        One dict per record with the keys ``id``, ``values`` (dense vector),
        ``sparse_values`` (sparse indices/values dict) and ``metadata`` (a
        copy of the five text fields).
    """
    metadata_keys = ('cve', 'cwe', 'Summary', 'func_before', 'func_after')

    def to_upsert(record):
        # Encode first, then copy the raw fields alongside the vectors.
        dense_vec, sparse_dict = encode_metadata(record)
        metadata = {key: record[key] for key in metadata_keys}
        return {
            'id': record['id'],
            'values': dense_vec,
            'sparse_values': sparse_dict,
            'metadata': metadata,
        }

    return [to_upsert(record) for record in records]

# Upsert chunks

In [None]:
from tqdm.auto import tqdm

batch_size = 32

# Encode and upsert the records in consecutive slices of `batch_size`.
for i in tqdm(range(0, len(data), batch_size)):
    # Clamp the end so the final, possibly shorter, batch stays in range.
    i_end = min(i + batch_size, len(data))
    batch = data[i:i_end]
    # Upsert the slice computed above rather than slicing `data` a second time.
    index.upsert(builder(batch))

# Querying

In [None]:
def encode(text: str):
    """Encode a query string with both the dense and the sparse model.

    Args:
        text: The query text.

    Returns:
        A ``(dense_vec, sparse_dict)`` tuple. ``dense_vec`` is the dense
        embedding as a list of floats. ``sparse_dict`` is
        ``{"indices": [...], "values": [...]}`` holding the positions and
        values of the non-zero entries of the sparse representation; both
        entries are always lists, including when exactly one entry is
        non-zero.
    """
    dense_vec = dense_model.encode(text).tolist()
    # truncation=True keeps over-long queries within the tokenizer's maximum
    # length, matching how encode_field tokenizes the indexed documents.
    input_ids = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        sparse_vec = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()

    # sparse_vec is expected to be 1-D here, so nonzero() yields a (k, 1)
    # tensor. Dropping only dim 1 keeps a 1-D result for every k; a bare
    # squeeze() would collapse k == 1 to a scalar and tolist() would then
    # return a plain number instead of a one-element list.
    nonzero_idx = sparse_vec.nonzero().squeeze(1)
    indices = nonzero_idx.cpu().tolist()
    values = sparse_vec[nonzero_idx].cpu().tolist()
    sparse_dict = {"indices": indices, "values": values}

    return dense_vec, sparse_dict

In [None]:
query = "Vulnerability: CVE-2014-3173 and Weakness: CWE-119"

# Encode the query into its dense and sparse parts, then ask the index for
# the five best matches together with their stored metadata.
dense, sparse = encode(query)

xc = index.query(vector=dense, sparse_vector=sparse, top_k=5, include_metadata=True)