In [18]:
# Import libraries
import pandas as pd
from fastembed import TextEmbedding
import numpy as np
from pinecone import Pinecone, ServerlessSpec

In [19]:
# Load data from Excel file
csv_file = "100_review.csv"
df = pd.read_csv(csv_file)

In [20]:
# Check for NaN values
print(df.isna().sum())

Unnamed: 0    0
rating        0
reviewText    0
summary       0
review        0
word_count    0
dtype: int64


In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary,review,word_count
0,0,5,This book was the very first bookmobile book I...,50 + years ago...,Rating: 5 Title: 50 + years ago... Content: Th...,69
1,3,5,I don't normally buy 'mystery' novels because ...,Very good read.,Rating: 5 Title: Very good read. Content: I do...,73
2,4,5,"This isn't the kind of book I normally read, a...",Great Story!,Rating: 5 Title: Great Story! Content: This is...,77
3,6,3,I bought this book because I loved the cover a...,Hot Civil War Read... I wanted more Romance,Rating: 3 Title: Hot Civil War Read... I wante...,123
4,7,5,This was a book that I thoroughly enjoyed from...,Wow and wonderful read with a twist,Rating: 5 Title: Wow and wonderful read with a...,81


In [22]:
# Extract review column to list
review_list = df['review'].tolist()

In [23]:
# Check list result
review_list[:2]

['Rating: 5 Title: 50 + years ago... Content: This book was the very first bookmobile book I bought when I was in the school book club. I loved the story then and I bet a dollar to a donut I will love it again. If my memory serves, I bought this book in 5th grade. That would have been about 1961. I am looking forward to reliving the memories.',
 "Rating: 5 Title: Very good read. Content: I don't normally buy 'mystery' novels because I just don't like them.  However, this time I decided to take a chance and I am glad I did.I found the story engrossing, the characters engaging, and it was well written.  I will buy more books from this author in this series.I still do not especially care for 'mystery' novels - I consider this book series an exception."]

In [24]:
def generate_embeddings(model_name, documents):
    '''function generate embedding from model'''
    model = TextEmbedding(model_name=model_name)
    embeddings = list(model.embed(documents))
    return embeddings

In [25]:
# Generate embeddings
embeddings = generate_embeddings("nomic-ai/nomic-embed-text-v1.5-Q", review_list)

In [26]:
embeddings

[array([ 5.27420163e-01,  5.66166461e-01, -3.90618968e+00, -1.40270400e+00,
        -1.05087459e+00,  8.01171362e-01,  3.05084493e-02,  9.52222407e-01,
        -2.66676426e-01, -1.51718095e-01, -4.65079814e-01, -6.98104408e-03,
         2.74186760e-01,  1.03771758e+00, -1.60182923e-01, -1.49561882e+00,
        -3.56268048e-01, -4.37810063e-01,  1.05052240e-01,  6.06091976e-01,
        -2.76009273e-02, -1.16442412e-01, -1.17079377e+00,  7.07980037e-01,
         8.69172931e-01,  5.70643991e-02, -1.98920631e+00, -8.05060565e-01,
        -1.42324245e+00,  4.68596131e-01,  5.17051220e-01, -1.24581599e+00,
        -8.54310691e-01,  4.69683707e-01, -1.06972706e+00, -2.73176402e-01,
         1.05008876e+00,  2.03657895e-01, -2.35090151e-01,  4.26598310e-01,
         1.73521817e+00,  1.52653611e+00,  5.52823305e-01, -3.41410458e-01,
         1.70008409e+00, -6.56324804e-01,  2.11713701e-01, -1.09101629e+00,
         1.25762331e+00,  9.38047707e-01,  4.33979444e-02,  2.05075949e-01,
         7.7

In [27]:
pc = Pinecone(api_key="xxxx")

In [28]:
# Create a serverless index
index_name = "review-embedding-100"

pc.create_index(
    name=index_name,
    dimension=768, # Model dimensions
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [29]:
# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

# Prepare vectors for upserting
vectors = []
for i, (review, embedding) in enumerate(zip(review_list, embeddings)):
    vectors.append({
        "id": str(i),  # Use row index as unique ID
        "values": embedding,  # Directly use the embedding (no need for ['values'])
        "metadata": {'review': review}  # Store the original review as metadata
    })

In [34]:
# Upsert the vectors into Pinecone
index.upsert(
    vectors=vectors,
    namespace="review-100"  # Optional namespace to organize data
)

{'upserted_count': 100}

In [35]:
# Check index
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'review-100': {'vector_count': 100}},
 'total_vector_count': 100}


In [36]:
# Sample query
query = "Give me review for rating 5 stars"

# Generate embedding for the query using the same model as before
query_embedding = generate_embeddings("nomic-ai/nomic-embed-text-v1.5-Q", query)

In [37]:
# Ensure the query_embedding is a list and not an ndarray
if isinstance(query_embedding, list) and isinstance(query_embedding[0], np.ndarray):
    query_embedding = [embedding.tolist() for embedding in query_embedding]  # Convert each ndarray in the list to a list

# Perform a similarity search in Pinecone to find similar reviews
results = index.query(
    vector=query_embedding,  # Use the query's embedding vector
    top_k=5,  # Number of most similar reviews to retrieve
    include_metadata=True,
    namespace="review-100"  
)

# Print out the most similar reviews
print(results)

{'matches': [{'id': '90',
              'metadata': {'review': 'Rating: 5 Title: My fairy story book '
                                     'Content: I absolutely LOVE THIS BOOK. I '
                                     'turn on the read to me feature and '
                                     'listen to these stories every night. I '
                                     'am so familiar with the stories in this '
                                     'book that I know them by heart. This '
                                     'book gets all stars in my opinion.'},
              'score': 0.656612813,
              'values': []},
             {'id': '82',
              'metadata': {'review': 'Rating: 5 Title: Surviving the Fog '
                                     'Review Content: Stan Morris - did a '
                                     'great job with this book: good '
                                     'description of characters, plot was '
                                     'gre

In [42]:
# Sample query
query2 = "Rating: 1 Title: Sucked Content: This book should not have gotten pass my erotica block - from the second chapter to the 6th(and that was all I could take)it was raunchy talk and explicited sex.  Not at all what I look for in a book. The only story line was getting the woman in bed which he did by chapter 5(less then 25% of the book according to my kindle)and the characters was a little more then non-believable. This was a waste of my time."

# Generate embedding for the query using the same model as before
query_embedding2 = generate_embeddings("nomic-ai/nomic-embed-text-v1.5-Q", query2)

In [43]:
# Ensure the query_embedding is a list and not an ndarray
if isinstance(query_embedding2, list) and isinstance(query_embedding2[0], np.ndarray):
    query_embedding2 = [embedding.tolist() for embedding in query_embedding2]  # Convert each ndarray in the list to a list

# Perform a similarity search in Pinecone to find similar reviews
results2 = index.query(
    vector=query_embedding2,  # Use the query's embedding vector
    top_k=5,  # Number of most similar reviews to retrieve
    include_metadata=True,
    namespace="review-100"  
)

# Print out the most similar reviews
print(results2)

{'matches': [{'id': '7',
              'metadata': {'review': "Rating: 1 Title: I'm being kind with 1 "
                                     'star Content: The only thing that got me '
                                     'to the end of this story was sheer '
                                     'cussedness.  The storyline was flat, '
                                     'there was absolutely no character '
                                     'development, and the dialog was from a '
                                     'cheesy porn.  I found myself '
                                     'increasingly disgusted with the female '
                                     'lead and her whining.  In no way did I '
                                     'find myself sympathetic to the '
                                     'protagonist nor did I find myself even '
                                     'caring what made him who he was.  There '
                                     'was no back sto