In [None]:
# Importing Required Libraries
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer

In [None]:
# Load Amazon and Walmart datasets
# Both CSV files are read via relative paths, i.e. from the current working directory.
amazon_df = pd.read_csv("Amazon.csv")
walmart_df = pd.read_csv("Walmart.csv")

In [None]:
# Replace missing review text with an empty string in both frames,
# so every entry in the Reviews column is a usable string afterwards.
for reviews_frame in (amazon_df, walmart_df):
    reviews_frame['Reviews'] = reviews_frame['Reviews'].fillna('')

In [None]:
# Converting Walmart Product IDs to String
# The example lookup further down filters with a string ID ("10"), so the column is cast to str
# to make that comparison match.
walmart_df['Productid'] = walmart_df['Productid'].astype(str)

In [None]:
# Confirm the Walmart product IDs are now held as strings (pandas reports this as object dtype)
walmart_df['Productid'].dtype

dtype('O')

In [None]:
# Check the Amazon product ID dtype as well, so both frames can be filtered with string IDs
amazon_df['Productid'].dtype

dtype('O')

In [None]:
# Filter rows based on product_id
def get_product_details(product_id, df):
    """Return the rows of `df` whose 'Productid' equals `product_id`.

    A row matches when its ID equals `product_id` directly or when both sides
    have the same text form. The second check lets an integer such as 10 find a
    column that stores the string "10" (and the other way round).

    Args:
        product_id: The product identifier to look up.
        df: DataFrame containing a 'Productid' column.

    Returns:
        A DataFrame with the matching rows, or None (after printing a notice)
        when no row matches.
    """
    id_column = df['Productid']
    # Keep the plain equality test so every previously matching row still matches,
    # then widen it with a string comparison to tolerate int/str type mismatches.
    is_match = (id_column == product_id) | (id_column.astype(str) == str(product_id))
    product_details = df[is_match]
    if product_details.empty:
        print(f"Product ID {product_id} not found.")
        return None
    return product_details

In [None]:
# Example lookup: pull the rows for one sample product from each retailer
amazon_product_id = "B001E4KFG0"
amazon_details = get_product_details(amazon_product_id, amazon_df)

walmart_product_id = "10"
walmart_details = get_product_details(walmart_product_id, walmart_df)

In [None]:
# Show the Amazon rows returned by the example lookup
amazon_details

Unnamed: 0,Productid,Ratings,Reviews
0,B001E4KFG0,5,Good Quality Dog Food


In [None]:
# Show the Walmart rows returned by the example lookup
walmart_details

Unnamed: 0,Productid,Ratings,Reviews
9,10,1,I bought a 75 inch tv online using Walmart. Ri...


In [None]:
# Collect the review texts to embed; a failed lookup (None) yields an empty list
if amazon_details is None:
    amazon_reviews = []
else:
    amazon_reviews = amazon_details['Reviews'].tolist()

if walmart_details is None:
    walmart_reviews = []
else:
    walmart_reviews = walmart_details['Reviews'].tolist()

In [None]:
#generate embeddings
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

In [None]:
# Load Hugging Face model and tokenizer
# AutoModel loads the checkpoint without a task-specific head; generate_embeddings pools its
# last hidden state into one vector per review.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Function to generate embeddings
def generate_embeddings(text_list, batch_size=32, max_length=128, device="cpu"):
    """Embed each text as the mean of the model's last-hidden-state token vectors.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text_list: Sequence of strings to embed.
        batch_size: Number of texts tokenized and run through the model per step.
        max_length: Token limit per text; longer inputs are truncated.
        device: Torch device string the model and inputs are moved to.

    Returns:
        A 2-D NumPy array with one row per input text. An empty `text_list`
        yields an array with zero rows instead of raising.
    """
    model.eval()
    model.to(device)

    # np.vstack raises ValueError on an empty list, and the review lists built
    # earlier are empty whenever a product lookup fails.
    if len(text_list) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []

    with torch.no_grad():
        for i in range(0, len(text_list), batch_size):
            batch = text_list[i:i + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            inputs = {key: value.to(device) for key, value in inputs.items()}
            outputs = model(**inputs)
            hidden_states = outputs.last_hidden_state

            # Average only over real tokens. A plain mean over the sequence axis also
            # averages the padding positions, so the vector for a short text would
            # change depending on which longer texts share its batch.
            token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            token_counts = token_mask.sum(dim=1).clamp(min=1)
            pooled = (hidden_states * token_mask).sum(dim=1) / token_counts

            embeddings.append(pooled.cpu().numpy())

    return np.vstack(embeddings)

In [None]:
# Embed both review lists, preferring the GPU when PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

amazon_embeddings = generate_embeddings(amazon_reviews, device=device)
walmart_embeddings = generate_embeddings(walmart_reviews, device=device)

In [None]:
! pip install "pinecone[grpc]"

Collecting pinecone[grpc]
  Downloading pinecone-5.4.0-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<3.0.0,>=2.0.0 (from pinecone[grpc])
  Downloading pinecone_plugin_inference-2.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone[grpc])
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting lz4>=3.1.3 (from pinecone[grpc])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting protoc-gen-openapiv2<0.0.2,>=0.0.1 (from pinecone[grpc])
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-2.0.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
# The `pinecone[grpc]` install in the previous cell already provides the client imported here.
# `pinecone-client` is not installed on top of it: doing so uninstalled the newer
# pinecone-plugin-inference and left pip reporting a dependency-resolver error.
import os
from getpass import getpass

from pinecone import Pinecone

# Never embed the API key in the notebook. Read it from the PINECONE_API_KEY environment
# variable and prompt (without echo) only when it is not set. Any key that was previously
# committed in this cell should be treated as compromised and rotated.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

# Initialize Pinecone with the key taken from the environment
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], environment="us-east-1")

# List all available indexes
indexes = pc.list_indexes()

# Print out the list of indexes
print(indexes.names())  # This will print the names of available indexes

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pinecone-plugin-inference, pinecone-client
  Attempting uninstall: pinecone-plugin-inference
    Found existing installation: pinecone-plugin-inference 2.0.1
    Uninstalling pinecone-plugin-inference-2.0.1:
      Successfully uninstalled pinecone-plugin-inference-2.0.1
[31mERROR: pip's dependency resolver does not currently take into 

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone

# Initialize Pinecone without a hard-coded secret: the key comes from the PINECONE_API_KEY
# environment variable, with a no-echo prompt as fallback. Any key that was previously
# committed in this cell should be treated as compromised and rotated.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], environment="us-east-1")

# Handle to the index that the upsert and query cells operate on
index = pc.Index("product-analysis")


In [None]:
# Upsert embeddings into Pinecone
def upsert_embeddings_with_metadata(embeddings, ids, ratings, reviews, index, namespace, batch_size=100):
    """Upsert one vector per ID into `index`, attaching rating and review as metadata.

    Args:
        embeddings: Indexable collection of vectors, aligned with `ids`
            (for example the rows of a 2-D NumPy array).
        ids: Record identifiers; each is stored as a string and is also copied
            into the metadata as 'product_id'.
        ratings: Ratings aligned with `ids`.
        reviews: Review texts aligned with `ids`.
        index: Object exposing `upsert(vectors=..., namespace=...)`.
        namespace: Namespace passed through to every upsert call.
        batch_size: Maximum number of vectors sent per upsert call.

    Returns:
        None. Nothing is sent when `ids` is empty.

    NOTE(review): the record ID is the product ID, so several reviews of the
    same product would share one ID and presumably overwrite each other in the
    index; confirm whether a per-review ID is wanted.
    """
    vectors = []
    for i in range(len(ids)):
        values = embeddings[i]
        # Send plain Python lists rather than NumPy rows so the payload is
        # serializable whichever client transport is in use.
        values = values.tolist() if hasattr(values, "tolist") else list(values)
        record_id = str(ids[i])
        vectors.append(
            {
                "id": record_id,
                "values": values,
                "metadata": {
                    "product_id": record_id,
                    "rating": ratings[i],
                    "review": reviews[i]
                }
            }
        )

    # An empty request carries no work, so skip the call entirely.
    if not vectors:
        return

    # Send in bounded chunks so a large input does not become one oversized request.
    step = max(1, int(batch_size))
    for start in range(0, len(vectors), step):
        index.upsert(vectors=vectors[start:start + step], namespace=namespace)

In [None]:
# Upsert Amazon data
# get_product_details returns None when the product is not found; indexing None would raise
# a TypeError, so the upsert is skipped in that case (matching the None handling used when
# the review list was built).
if amazon_details is not None:
    upsert_embeddings_with_metadata(
        embeddings=amazon_embeddings,
        ids=amazon_details['Productid'].tolist(),
        ratings=amazon_details['Ratings'].tolist(),
        reviews=amazon_details['Reviews'].tolist(),
        index=index,
        namespace="amazon"
    )
else:
    print("No Amazon product details found; nothing to upsert.")


In [None]:
# Upsert Walmart data
# get_product_details returns None when the product is not found; indexing None would raise
# a TypeError, so the upsert is skipped in that case (matching the None handling used when
# the review list was built).
if walmart_details is not None:
    upsert_embeddings_with_metadata(
        embeddings=walmart_embeddings,
        ids=walmart_details['Productid'].tolist(),
        ratings=walmart_details['Ratings'].tolist(),
        reviews=walmart_details['Reviews'].tolist(),
        index=index,
        namespace="walmart"
    )
else:
    print("No Walmart product details found; nothing to upsert.")

In [None]:
# Query with metadata handling
def query_embeddings_with_metadata(query, namespace, top_k=5):
    """Embed `query` and return the `top_k` closest records from `namespace`, metadata included.

    Relies on the notebook-level `generate_embeddings` function and `index` handle.
    """
    # generate_embeddings works on a list of texts, so wrap the query and take its single row
    embedded_batch = generate_embeddings([query])
    query_vector = embedded_batch[0].tolist()
    return index.query(
        vector=query_vector,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,
    )

# Run the same question against both namespaces, then print every match
query = "What are the positive aspects of this product?"
amazon_results = query_embeddings_with_metadata(query, namespace="amazon")
walmart_results = query_embeddings_with_metadata(query, namespace="walmart")

# Each heading is paired with its result set; the Walmart heading keeps its leading blank line
for heading, result_set in (
    ("Amazon Results:", amazon_results),
    ("\nWalmart Results:", walmart_results),
):
    print(heading)
    for match in result_set["matches"]:
        # A match may come back without metadata, so fall back to placeholder text
        match_metadata = match.get("metadata", {})
        rating_text = match_metadata.get('rating', 'No rating')
        review_text = match_metadata.get('review', 'No review')
        print(f"ID: {match['id']}, Score: {match['score']}, "
              f"Rating: {rating_text}, Review: {review_text}")


Amazon Results:
ID: B001E4KFG0, Score: 0.686207533, Rating: 5.0, Review: Good Quality Dog Food

Walmart Results:
ID: 1, Score: 0.681684911, Rating: No rating, Review: No review
ID: 10, Score: 0.653563917, Rating: 1.0, Review: I bought a 75 inch tv online using Walmart. Right after buying the tv, my bank card was hacked and charged $700 after I just bought a tv using Walmart online. Never use your bank card. ALWAYS use a prepaid card when buying online.


In [None]:
!pip install transformers
from transformers import pipeline

# Define classify_sentiment function using transformers pipeline
# Checkpoint the sentiment pipeline previously fell back to by default; naming it
# explicitly keeps results stable and removes the "No model was supplied" warning.
_SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
_sentiment_classifier = None


def _get_sentiment_classifier():
    """Build the sentiment pipeline on first use and reuse it afterwards."""
    global _sentiment_classifier
    if _sentiment_classifier is None:
        _sentiment_classifier = pipeline("sentiment-analysis", model=_SENTIMENT_MODEL)
    return _sentiment_classifier


def classify_sentiment(text):
    """Return the sentiment label the classifier assigns to `text`.

    The pipeline is created once and cached; constructing it on every call
    reloaded the model for each review.

    Args:
        text: The review text to classify.

    Returns:
        The 'label' field of the first prediction (e.g. "POSITIVE" or "NEGATIVE").
    """
    classifier = _get_sentiment_classifier()
    # truncation=True keeps reviews longer than the model's input limit from raising
    result = classifier(text, truncation=True)[0]
    return result['label']

# Query with metadata handling
def query_embeddings_with_metadata(query, namespace, top_k=5):
    """Embed `query` and fetch its `top_k` nearest records, with metadata, from `namespace`.

    NOTE(review): this repeats the definition from the earlier query cell and
    shadows it once this cell runs.
    """
    # generate_embeddings is expected to be defined by an earlier cell
    query_vector = generate_embeddings([query])[0].tolist()
    search_options = {
        "top_k": top_k,
        "namespace": namespace,
        "include_metadata": True,  # metadata is needed to show rating and review text
    }
    return index.query(vector=query_vector, **search_options)

# Process Amazon and Walmart reviews, safely accessing metadata
def _classify_matches(matches):
    """Turn query matches into dicts with id, review, rating and sentiment.

    Matches without review text keep the "No review" placeholder and get the
    sentiment "No sentiment". Previously the placeholder itself was sent to the
    classifier, which reported a misleading NEGATIVE for records with no review.
    """
    classified = []
    for res in matches:
        metadata = res.get('metadata', {}) or {}
        review = metadata.get('review', "No review")
        # Only real, non-blank review text is worth classifying
        has_review_text = 'review' in metadata and bool(str(review).strip())
        classified.append(
            {
                "id": res['id'],
                "review": review,
                "rating": metadata.get('rating', "No rating"),
                "sentiment": classify_sentiment(review) if has_review_text else "No sentiment",
            }
        )
    return classified


amazon_classified = _classify_matches(amazon_results["matches"])
walmart_classified = _classify_matches(walmart_results["matches"])

# Display Classified Results
for heading, classified_items in (
    ("Amazon Classified Results:", amazon_classified),
    ("\nWalmart Classified Results:", walmart_classified),
):
    print(heading)
    for item in classified_items:
        print(f"ID: {item['id']}, Rating: {item['rating']}, Review: {item['review']}, Sentiment: {item['sentiment']}")



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Amazon Classified Results:
ID: B001E4KFG0, Rating: 5.0, Review: Good Quality Dog Food, Sentiment: POSITIVE

Walmart Classified Results:
ID: 1, Rating: No rating, Review: No review, Sentiment: NEGATIVE
ID: 10, Rating: 1.0, Review: I bought a 75 inch tv online using Walmart. Right after buying the tv, my bank card was hacked and charged $700 after I just bought a tv using Walmart online. Never use your bank card. ALWAYS use a prepaid card when buying online., Sentiment: NEGATIVE


In [None]:
!pip install ipywidgets


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Example classified results, hard-coded as literal values
# NOTE(review): these assignments rebind amazon_results / walmart_results, which the earlier
# query cell used for the Pinecone responses; re-running the classification cell after this
# one would index a list with "matches". Consider distinct names.
amazon_results = [
    {"id": "B001E4KFG0", "rating": 5.0, "review": "Good Quality Dog Food", "sentiment": "POSITIVE"}
]

walmart_results = [
    {"id": "1", "rating": "No rating", "review": "No review", "sentiment": "NEGATIVE"},
    {"id": "10", "rating": 1.0, "review": "I bought a 75 inch tv online using Walmart. Right after buying the tv, my bank card was hacked and charged $700 after I just bought a tv using Walmart online. Never use your bank card. ALWAYS use a prepaid card when buying online.", "sentiment": "NEGATIVE"}
]

# Function to display results based on selection
def display_results(company):
    """Print the stored results for `company` ("Amazon" or "Walmart").

    Reads the notebook-level `amazon_results` and `walmart_results` lists.
    Any other company name prints a short notice; previously it failed with
    UnboundLocalError because no branch assigned the text to print.

    Args:
        company: Name of the company whose results should be shown.
    """
    results_by_company = {"Amazon": amazon_results, "Walmart": walmart_results}
    if company not in results_by_company:
        print(f"No results available for {company}.")
        return

    lines = [f"{company} Results:"]
    lines.extend(
        f"ID: {res['id']}, Rating: {res['rating']}, Review: {res['review']}, Sentiment: {res['sentiment']}"
        for res in results_by_company[company]
    )
    # The trailing newline reproduces the blank line that followed the last entry
    print("\n".join(lines) + "\n")

# Dropdown whose options are the two company names display_results handles
company_dropdown = widgets.Dropdown(
    options=["Amazon", "Walmart"],
    description="Select Company:",
)

# Bind the dropdown to display_results; as the cell's last expression the
# resulting interactive widget is also what gets rendered
widgets.interactive(display_results, company=company_dropdown)


interactive(children=(Dropdown(description='Select Company:', options=('Amazon', 'Walmart'), value='Amazon'), …

In [None]:
!pip install streamlit
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading streamlit-1.40.1-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
%%writefile app.py
import streamlit as st

# Hard-coded sample of classified reviews, one dict per review
amazon_results = [
    {"id": "B001E4KFG0", "rating": 5.0, "review": "Good Quality Dog Food", "sentiment": "POSITIVE"}
]

walmart_results = [
    {"id": "1", "rating": "No rating", "review": "No review", "sentiment": "NEGATIVE"},
    {"id": "10", "rating": 1.0, "review": "I bought a 75 inch tv online using Walmart. Right after buying the tv, my bank card was hacked and charged $700 after I just bought a tv using Walmart online. Never use your bank card. ALWAYS use a prepaid card when buying online.", "sentiment": "NEGATIVE"}
]

# Each selectable company mapped to the reviews shown for it
results_by_company = {'Amazon': amazon_results, 'Walmart': walmart_results}

# Fields rendered for every review, as (bold label, dictionary key) pairs
review_fields = (("ID", "id"), ("Rating", "rating"), ("Review", "review"), ("Sentiment", "sentiment"))

# Streamlit App Layout
st.title("AMAZON Vs WALMART")

# Dropdown to select company
company = st.selectbox(
    'Select the company:',
    ['Amazon', 'Walmart']
)

# Render the reviews of the selected company (there is no button; the page
# re-renders whenever the selection changes)
selected_results = results_by_company.get(company)
if selected_results is not None:
    st.subheader(f"{company} Product Reviews:")
    for review_entry in selected_results:
        for label, field in review_fields:
            st.write(f"**{label}**: {review_entry[field]}")
        st.write("---")


Writing app.py


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never embed the ngrok auth token in the notebook. Read it from the NGROK_AUTHTOKEN
# environment variable and prompt (without echo) only when it is not set. Any token that was
# previously committed in this cell should be treated as compromised and rotated.
if not os.environ.get("NGROK_AUTHTOKEN"):
    os.environ["NGROK_AUTHTOKEN"] = getpass("ngrok auth token: ")

ngrok.set_auth_token(os.environ["NGROK_AUTHTOKEN"])



In [None]:
from pyngrok import ngrok
import os
import subprocess

STREAMLIT_PORT = 8501

# Run the Streamlit app in the background. os.system blocked the cell until the server
# exited, and the server was only started after the tunnel had already been opened.
streamlit_process = subprocess.Popen(
    [
        "streamlit", "run", "app.py",
        "--server.port", str(STREAMLIT_PORT),
        "--server.headless", "true",
    ]
)

# Set up the tunnel to the Streamlit port
public_url = ngrok.connect(STREAMLIT_PORT)

# Print the tunnel's URL rather than the tunnel object's repr
print(f"Streamlit app is live at: {public_url.public_url}")

Streamlit app is live at: NgrokTunnel: "https://6bfc-34-27-247-85.ngrok-free.app" -> "http://localhost:8501"
