<a href="https://colab.research.google.com/github/ben-ogden/musiccaps/blob/main/init-pinecone-index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies

In [None]:
!pip install sentence_transformers pinecone-client datasets

## Load dataset

In [None]:
from datasets import load_dataset

# read the 'train' split of google/MusicCaps from its public CSV file ...
musiccaps_split = load_dataset(
    'google/MusicCaps',
    data_files='musiccaps-public.csv',
    split='train',
)
# ... and work with it as a pandas DataFrame from here on
df = musiccaps_split.to_pandas()


In [None]:

# show the loaded DataFrame as the cell output for a quick visual check
df

## Initialize NER Model

In [None]:
import torch

# use the index of the current CUDA device when a GPU is present;
# otherwise leave the device unset (None)
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = None

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# checkpoint that supplies both the tokenizer and the NER model weights
model_id = 'dslim/bert-base-NER'

# load the tokenizer and the token-classification model for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

# combine both into a NER pipeline running on the selected device
nlp = pipeline(
    'ner', model=model, tokenizer=tokenizer,
    aggregation_strategy='max', device=device,
)

Verify that the NER pipeline extracts entities as expected by running it on a sample sentence.

In [None]:
text = 'The song is a classic Rock and Roll and the narration is a Documentary'
# run the NER pipeline on the sample sentence; the extracted entities are
# shown as the cell output
nlp(text)

## Initialize Retriever

In [None]:
from sentence_transformers import SentenceTransformer

# sentence-embedding model, loaded by name onto the selected device
retriever = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base', device=device)
# show the loaded model as the cell output
retriever

## Prepare to connect to Pinecone

In [64]:
import os
from getpass import getpass

import pinecone

# Read the credentials from the environment rather than pasting them into the
# notebook (where they could be committed or shared by accident). When a
# variable is not set, prompt for the value instead; getpass keeps the key
# out of the cell output.
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')
pinecone_environment = (
    os.environ.get('PINECONE_ENVIRONMENT') or input('Pinecone environment (region): ')
)

# connect to pinecone environment
pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_environment
)

## Create Pinecone Index

In [65]:
index_name = 'music-caps-index'

# Take the vector size from the retriever so the index always matches the
# embeddings that get upserted (this replaces a hard-coded 768).
embedding_dim = retriever.get_sentence_embedding_dimension()

# create the index only if it does not exist yet, so re-running the cell is safe
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=embedding_dim,
        metric='cosine'
    )

# connect to the index named by index_name
index = pinecone.Index(index_name)

## Generate Embeddings and populate index

In [66]:
def extract_named_entities(text_batch):
    """Return the named-entity strings found in each text of a batch.

    Args:
        text_batch: a list of texts, or a single text given as a str.

    Returns:
        A list with one inner list per input text; each inner list holds the
        'word' value of every entity the NER pipeline reported for that text.
        A single str input is treated as a batch of one, so the result is a
        list containing one inner list.
    """
    single_text = isinstance(text_batch, str)
    # extract named entities using the NER pipeline
    extracted_batch = nlp(text_batch)
    # For a single text the pipeline result is one flat list of entity dicts
    # rather than a list of per-text lists. Wrap it so the comprehension below
    # always walks per-text lists (iterating a flat result would index into
    # the entity dicts' keys and fail).
    if single_text or (extracted_batch and isinstance(extracted_batch[0], dict)):
        extracted_batch = [extracted_batch]
    # keep only the entity names
    return [
        [entity['word'] for entity in text_entities]
        for text_entities in extracted_batch
    ]

In [None]:
from tqdm.auto import tqdm

# number of captions embedded and upserted per request
batch_size = 128

total_rows = len(df)
for start in tqdm(range(0, total_rows, batch_size)):
    # positional slice; iloc stops at the end of df for the last, shorter batch
    batch = df.iloc[start:start + batch_size]
    stop = start + len(batch)
    # one embedding (as a plain Python list) per caption in the batch
    vectors = retriever.encode(batch['caption'].tolist()).tolist()
    # every column of a row is stored as that vector's metadata
    records = batch.to_dict(orient='records')
    # row positions, rendered as strings, serve as the vector IDs
    row_ids = [str(position) for position in range(start, stop)]
    # upsert/insert the (id, embedding, metadata) triples to pinecone
    _ = index.upsert(vectors=list(zip(row_ids, vectors, records)))

# check that we have all vectors in index
index.describe_index_stats()

## Query the Index

In [76]:
from pprint import pprint

def search_pinecone(query, top_k=10):
    """Embed a free-text query and return its nearest matches from the index.

    Args:
        query: text to search for.
        top_k: number of matches to request (defaults to 10).

    Returns:
        The response of ``index.query`` for the embedded query, with each
        match's metadata included.
    """
    # create embeddings for the query
    query_vector = retriever.encode(query).tolist()
    # query the pinecone index; the vector is passed by keyword so the call
    # does not depend on the positional order of query()'s parameters
    return index.query(vector=query_vector, top_k=top_k, include_metadata=True)

In [78]:
# example free-text query; the matches returned for it are shown as the
# cell output
query = 'happy balkan music'
search_pinecone(query)

{'matches': [{'id': '2192',
              'metadata': {'aspect_list': "['balkan music', 'instrumental', "
                                          "'virtual sounds', 'accordion', "
                                          "'bass guitar', 'acoustic sounding "
                                          "drums', 'cheerful', 'upbeat', 'folk "
                                          "dance']",
                           'audioset_positive_labels': '/m/0mkg',
                           'author_id': 9.0,
                           'caption': 'This is a folk music piece from the '
                                      'Balkans. It is an instrumental piece '
                                      'performed with virtual sounds. The '
                                      'leading tune is played by a realistic '
                                      'accordion sound. There is an upbeat '
                                      'bass guitar following the rhythm of a '
                            

## Clean up

In [57]:
# delete the index this notebook created; index_name ('music-caps-index') is
# the name it was created under
pinecone.delete_index(index_name)