# **Sentiment Mining with Pinecone**

In [None]:
! pip install sentence_transformers pinecone-client datasets seaborn matplotlib

# **Load and Prepare Dataset**

In [None]:
from datasets import load_dataset

In [None]:
df = load_dataset('ashraq/hotel-reviews',split='train').to_pandas()

In [None]:
df

In [None]:
# we keep only the first 800 characters of each review
df["review"] = df['review'].str[:800]
df.head()

# **Initializing the sentiment model**

In [None]:
import torch

In [None]:
# Use the current CUDA device index when a GPU is visible, otherwise the string 'cpu'.
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = 'cpu'
device

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from transformers import pipeline,AutoTokenizer,AutoModelForSequenceClassification

In [None]:
# Hugging Face checkpoint used for sentiment classification.
model_id = "cardiffnlp/twitter-roberta-base-sentiment"

# Tokenizer and classification model (three output labels) from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=3)

# Wrap both in a sentiment-analysis pipeline running on the selected device.
nlp = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [None]:
# Maps the pipeline's generic label ids to readable sentiment names.
labels = dict(
    LABEL_0='negative',
    LABEL_1='neutral',
    LABEL_2='positive',
)

In [None]:
test = df['review'][241]
test

In [None]:
# applying the nlp model
nlp(test)

# **Initializing the retrieval model**

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model loaded from the Hugging Face Hub onto `device`.
retriever = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device,
)
retriever

# **Initializing the Pinecone Index**

In [None]:
!pip uninstall -y pinecone-client

In [None]:
! pip install pinecone

In [None]:
import os
from getpass import getpass

# SECURITY: never store the API key in the notebook. It is read from the
# PINECONE_API_KEY environment variable, with an interactive (non-echoing)
# prompt as a fallback. The key that used to be hard-coded in this cell has
# been exposed and should be revoked in the Pinecone console.
key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

In [None]:
import pinecone
from pinecone import Pinecone

# **sentiment-analysis-vector** 

In [None]:
from pinecone import Pinecone, ServerlessSpec

from pinecone import Pinecone

# Name of the Pinecone index this notebook reads from and writes to.
index_name = "sentiment-analysis-vector"

# Create the client with the API key and get a handle on that index.
pc = Pinecone(api_key=key)
index = pc.Index(index_name)


In [None]:
def get_sentiment(reviews):
    """Classify `reviews` with the `nlp` pipeline.

    Returns a pair of parallel lists: the readable sentiment name for each
    prediction (looked up in `labels`) and the prediction's score.
    """
    predictions = nlp(reviews)
    sentiment_names = []
    confidences = []
    for prediction in predictions:
        sentiment_names.append(labels[prediction["label"]])
    for prediction in predictions:
        confidences.append(prediction["score"])
    return sentiment_names, confidences

In [None]:
get_sentiment(df['review'][:10].tolist())

In [None]:
df.dtypes

In [None]:
import dateutil.parser

In [None]:
def get_timestamp(dates):
    """Parse every entry of `dates` with dateutil and return the list of
    `.timestamp()` values of the resulting datetimes, in the same order."""
    parsed = map(dateutil.parser.parse, dates)
    return [moment.timestamp() for moment in parsed]

In [None]:
# Sanity check: convert the date of the review at label 0 to a timestamp.
first_review_date = df["review_date"][0]
get_timestamp([first_review_date])[0]

In [None]:
from tqdm.auto import tqdm

batch_size = 64

# Number of leading rows of `df` to index. The previous loop iterated up to
# 64*50 rows but clamped every batch end at 64*20, so only the first 64*20 rows
# were ever upserted and each later iteration built an empty batch that merely
# produced an error message. One shared bound keeps the loop and slice in sync.
max_rows = min(64 * 20, len(df))

for i in tqdm(range(0, max_rows, batch_size), desc="Processing batches"):
    try:
        # Define batch range
        i_end = min(i + batch_size, max_rows)
        # Copy the slice so the new columns are added to an independent frame
        # instead of a view of `df` (avoids SettingWithCopy problems).
        batch = df.iloc[i:i_end].copy()

        # Encode reviews into embeddings
        emb = retriever.encode(batch["review"].tolist()).tolist()

        # Get timestamps and sentiment analysis
        timestamp = get_timestamp(batch['review_date'].tolist())
        label, score = get_sentiment(batch['review'].tolist())

        # Add new data to batch. The timestamp was previously computed and then
        # dropped; storing it makes it available as numeric metadata.
        batch['label'], batch['score'] = label, score
        batch['timestamp'] = timestamp

        # Prepare metadata and IDs (the row position doubles as the vector id)
        meta = batch.to_dict(orient="records")
        ids = [str(idx) for idx in range(i, i_end)]

        # Prepare data for upsert and push to index
        to_upsert = list(zip(ids, emb, meta))
        _ = index.upsert(vectors=to_upsert)

    except Exception as e:
        # Best effort: report the failing batch and carry on with the next one.
        print(f"❗ Error processing batch {i // batch_size + 1}: {e}")

# Display index stats after all batches
stats = index.describe_index_stats()
print("✅ Index stats:", stats)


# **Room size and satisfaction (`RSS`)**

In [None]:
# Natural-language question, embedded with the same model used for the reviews.
query = "are customers satisfied with the room sizees of hotels in london ?"
xq = retriever.encode(query).tolist()

# Ask the index for the 1000 closest vectors and return their metadata too.
search_kwargs = dict(vector=xq, top_k=1000, include_metadata=True)
result = index.query(**search_kwargs)


In [None]:
data = result['matches']

In [None]:
import pandas as pd 

In [None]:
RSS_data = pd.DataFrame([
    {
        'id': r['id'],
        'similarity_score': r['score'],
        'hotel_name': r['metadata']['hotel_name'],
        'label': r['metadata']['label'],
        'review': r['metadata']['review'],
        'review_date': r['metadata']['review_date'],
        'sentiment_score': r['metadata']['score']
    }
    for r in data
])

In [None]:
RSS_data.head()

In [None]:
import seaborn as sns 

In [None]:
# Bar chart of how many retrieved reviews fall under each sentiment label.
sns.countplot(
    data=RSS_data,
    x='label',
    hue='label',
    palette='Paired',
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Reviews per sentiment label; value_counts() orders the result by frequency.
sentiment_count = RSS_data['label'].value_counts()

# Bind each colour to its label. A positional colour list follows the frequency
# order above, so e.g. a majority of negative reviews would be drawn in green.
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'gray'}
sentiment_count.plot(
    kind='pie',
    autopct='%1.1F%%',
    startangle=90,
    colors=[sentiment_colors.get(name, 'lightgray') for name in sentiment_count.index],
)
plt.title('Sentiment Proportions')
plt.ylabel('')

In [None]:
sentiment_count.index

In [None]:
hotel_names = RSS_data['hotel_name'].unique()
hotel_names

In [None]:
pip install nltk wordcloud

In [None]:
palette_color = sns.color_palette('Paired')
# Give every label one fixed colour so the same sentiment looks the same in
# every panel (a positional palette would follow each hotel's frequency order).
label_colors = dict(zip(sorted(RSS_data['label'].unique()), palette_color))

# One panel per hotel, two per row. A fixed 1x2 grid raised IndexError as soon
# as the query returned more than two hotels.
n_cols = 2
n_rows = max(1, -(-len(hotel_names) // n_cols))  # ceiling division
fig, axes_grid = plt.subplots(n_rows, n_cols, figsize=(12, 6 * n_rows), squeeze=False)
ax = axes_grid.ravel()  # flat sequence of axes, indexable as ax[i]

for i, hotel in enumerate(hotel_names):
    sentiment_count = RSS_data[RSS_data['hotel_name'] == hotel]['label'].value_counts()
    # `explode` needs exactly one entry per slice; a hotel may have fewer than
    # three labels. Pull out only the last (least frequent) slice.
    explode = [0] * (len(sentiment_count) - 1) + [0.1]
    ax[i].pie(
        sentiment_count,
        labels=sentiment_count.index,
        colors=[label_colors[name] for name in sentiment_count.index],
        explode=explode,
        autopct='%1.1F%%',
    )
    # '\n' (not '/n') puts the subtitle on its own line.
    ax[i].set_title(f'{hotel} \n Customer sentiment', size=8)

# Hide panels left over when the hotel count does not fill the grid.
for spare in ax[len(hotel_names):]:
    spare.axis('off')
plt.tight_layout()
    

In [None]:
import nltk
import string
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
from wordcloud import WordCloud

In [None]:
from nltk.corpus import stopwords

In [None]:
# Only go to the network when the stop-word corpus is not already on disk,
# so the notebook can be re-run offline once the corpus has been fetched.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character; built once.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Load NLTK's English stop-word list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


def clean_text(text, stop_words=None):
    """Strip punctuation, lowercase, and drop stop words from `text`.

    Args:
        text: The string to clean.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop words are used; they are loaded once
            and cached instead of being re-read from disk for every call.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    text = text.translate(_PUNCTUATION_TABLE).lower()
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
RSS_data['review']=RSS_data['review'].apply(clean_text)

In [None]:
pos_RSS_reviews = " ".join(RSS_data[RSS_data['label']=='positive']['review'])
neg_RSS_reviews = " ".join(RSS_data[RSS_data['label']=='negative']['review'])

In [None]:
# Settings common to both clouds; only the colormap differs between them.
cloud_options = dict(
    stopwords=set(stopwords.words('english')),
    width=800,
    height=400,
    background_color='white',
)

pos_word_cloud = WordCloud(colormap='Blues', **cloud_options).generate(pos_RSS_reviews)
neg_word_cloud = WordCloud(colormap='inferno', **cloud_options).generate(neg_RSS_reviews)

In [None]:
plt.figure(figsize=(18, 9))

# Left panel: positive reviews; right panel: negative reviews.
panels = (
    (pos_word_cloud, 'Wordcloud of positive reviews'),
    (neg_word_cloud, 'Wordcloud of negative reviews'),
)
for position, (cloud, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(heading, size=14)

plt.tight_layout()
plt.show()

# **sentiment overview**

In [None]:
# Topic name -> question to search with. The previous cell did not parse (a
# literal mixing `key: value` pairs with bare strings, a `for` without a colon
# or body) and encoded the stale `query` variable instead of the loop item.
# The empty query was dropped and "stuff" corrected to "staff".
queries = {
    "food_Quality": "are customers satisfied with the food Quality",
    "staff": "are Customers satisfied with the staff?",
}

# Query results keyed by topic name.
results = {}
for topic, q in queries.items():
    xq = retriever.encode(q).tolist()

    result = index.query(
        vector=xq, top_k=1000,
        include_metadata = True,
    )
    results[topic] = result