<a href="https://colab.research.google.com/github/adnangithubbd/Generative-AI/blob/main/review_parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# dataset downloaded from the link below
# https://www.kaggle.com/datasets/parve05/customer-review-dataset

In [None]:
import os
import google.generativeai as genai
# from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List, Optional, Literal
import json

In [None]:
# SECURITY: API keys must never be committed to source control. The key is
# read from the GOOGLE_API_KEY environment variable instead; any key that was
# previously hard-coded in this notebook should be treated as leaked and revoked.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=_api_key)

# Shared model handle used by every generate_content call below.
model = genai.GenerativeModel("gemini-2.0-flash")


# Candidate model names:
#   gemini-2.0-flash
#   gemini-1.5-pro

In [None]:
class Review(BaseModel):
    # Schema for the structured data extracted from one review.
    # The first three fields are required (``...``); the rest default to None.
    key_themes: List[str] = Field(
        ...,
        description="Extract key themes discussed in the review.",
    )
    summary: str = Field(
        ...,
        description="Provide a brief summary of the review.",
    )
    sentiment: Literal["positive", "negative", "neutral", "mixed"] = Field(
        ...,
        description="Determine the sentiment of the review.",
    )
    pros: Optional[List[str]] = Field(
        None,
        description="List all the pros.",
    )
    cons: Optional[List[str]] = Field(
        None,
        description="List all the cons.",
    )
    reviewer_name: Optional[str] = Field(
        None,
        description="Extract the name of the reviewer.",
    )


In [None]:
import re

def extract_json(text):
    """Return the JSON payload contained in a model reply.

    If the reply wraps its answer in a Markdown code fence, the content of the
    first fence is returned. The fence may be tagged ``json`` in any letter
    case or carry no language tag at all. Without a fence, the whole reply is
    returned with surrounding whitespace removed.

    Args:
        text: Raw text of the model reply.

    Returns:
        The extracted string, ready to be passed to ``json.loads``.
    """
    # DOTALL lets the payload span several lines; IGNORECASE accepts "JSON".
    match = re.search(
        r"```(?:json)?\s*(.*?)\s*```",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if match:
        return match.group(1)
    return text.strip()

In [None]:
def get_structured_review(review_text: str) -> dict:
    """Ask the Gemini model to convert a free-text review into structured data.

    Args:
        review_text: The raw review text to analyse.

    Returns:
        On success, the validated ``Review`` fields as a plain dict. If the
        reply cannot be read, parsed as JSON, or validated against ``Review``,
        a dict of the form
        ``{"error": <message>, "raw_response": <reply text or None>}``.

    Raises:
        Any exception raised by ``model.generate_content`` (for example a
        rate-limit error). The API call is intentionally left outside the
        error handling so that callers can apply their own retry policy.
    """
    prompt = f"""
    Extract structured information from the following review. Return the output as valid JSON.

    Review: "{review_text}"

    The JSON format should match this schema:
    {{
        "key_themes": ["theme1", "theme2"],
        "summary": "brief summary",
        "sentiment": "positive/negative/neutral/mixed",
        "pros": ["pro1", "pro2"],
        "cons": ["con1", "con2"],
        "reviewer_name": "John Doe"
    }}
    """

    response = model.generate_content(prompt)

    # The ``text`` accessor is documented to raise ValueError when the reply
    # carries no text part (e.g. a blocked response). Read it once, up front,
    # so the error path below never has to touch ``response.text`` again.
    try:
        raw_response = response.text
    except ValueError as e:
        return {"error": str(e), "raw_response": None}

    try:
        raw_text = extract_json(raw_response)  # Remove Markdown code block
        structured_output = json.loads(raw_text)  # Convert to dictionary
        validated_output = Review(**structured_output)  # Validate with Pydantic
        # pydantic v2 renamed ``.dict()`` to ``.model_dump()``; prefer the new
        # name when it exists and fall back to the v1 spelling otherwise.
        dump = getattr(validated_output, "model_dump", None) or validated_output.dict
        return dump()  # Return structured data
    except Exception as e:
        # Deliberate best-effort boundary: one malformed reply must not abort
        # a batch run, so the failure is reported in the returned dict.
        return {"error": str(e), "raw_response": raw_response}

In [None]:

# Sample input: a long, mixed-sentiment phone review that ends with the
# reviewer's name, used to try out get_structured_review on a single text.
review_text = """
I recently upgraded to the Samsung Galaxy S24 Ultra, and I must say, it’s an absolute powerhouse! The Snapdragon 8 Gen 3 processor makes everything lightning fast—whether I’m gaming, multitasking, or editing photos. The 5000mAh battery easily lasts a full day even with heavy use, and the 45W fast charging is a lifesaver.

The S-Pen integration is a great touch for note-taking and quick sketches, though I don't use it often. What really blew me away is the 200MP camera—the night mode is stunning, capturing crisp, vibrant images even in low light. Zooming up to 100x actually works well for distant objects, but anything beyond 30x loses quality.

However, the weight and size make it a bit uncomfortable for one-handed use. Also, Samsung’s One UI still comes with bloatware—why do I need five different Samsung apps for things Google already provides? The $1,300 price tag is also a hard pill to swallow.


Insanely powerful processor (great for gaming and productivity)
Stunning 200MP camera with incredible zoom capabilities
Long battery life with fast charging
S-Pen support is unique and useful

Review by Nitish Singh

"""


# Shorter alternative input, kept for quick experiments:
# review_text = "I love the camera quality and battery life, but the software is buggy. - John Doe"
# Result is either the structured review dict or an {"error", "raw_response"} dict.
output = get_structured_review(review_text)

In [None]:
# Show the dict returned by get_structured_review for the sample review.
print(output)

In [None]:
# Ad-hoc free-form request to the model; the reply is stored in `response`
# and is not displayed or used elsewhere in the visible cells.
response = model.generate_content("Hello, world! in java?")



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the review CSV. The /content/ path is presumably the Colab working
# directory; adjust it when running elsewhere.
# The file is read as latin-1; 'iso-8859-1' or 'cp1252' are alternatives to try.
df = pd.read_csv("/content/redmi6.csv", encoding='latin-1')

In [None]:
# Inspect the column names of the loaded data.
df.columns

In [None]:
# Keep only the two text columns used for the analysis. ``.copy()`` makes
# new_df an independent DataFrame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning or depend on view-vs-copy behaviour.
new_df = df[['Review Title', 'Comments']].copy()

In [None]:
# Count missing values per column.
new_df.isnull().sum()

In [None]:
# Join title and comment into one text per review, separated by a space.
# Without ``na_rep``, str.cat returns NaN for any row where either part is
# missing, discarding the part that is present; ``na_rep=""`` treats a missing
# part as empty text. Rows with both parts present are unchanged.
new_df["Combined"] = new_df["Review Title"].str.cat(
    new_df["Comments"], sep=" ", na_rep=""
)


In [None]:
# Display the combined title + comment text.
new_df['Combined']

In [None]:
import time
from google.api_core.exceptions import TooManyRequests

def safe_get_structured_review(text, max_retries=None, wait_seconds=30):
    """Call ``get_structured_review`` and retry when the API is rate limited.

    Retries run in a loop rather than by recursion, so a long series of
    rate-limit errors cannot exhaust the call stack.

    Args:
        text: Review text forwarded to ``get_structured_review``.
        max_retries: Maximum number of retries after a rate-limit error.
            ``None`` (the default) retries indefinitely.
        wait_seconds: Seconds to sleep before each retry.

    Returns:
        Whatever ``get_structured_review`` returns.

    Raises:
        TooManyRequests: If ``max_retries`` is set and has been exceeded.
    """
    retries = 0
    while True:
        try:
            return get_structured_review(text)
        except TooManyRequests:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise
            print(f"Rate limit exceeded. Retrying in {wait_seconds} seconds")
            time.sleep(wait_seconds)

In [None]:
# Run the structured extraction on every combined review, one call per row;
# each cell of 'categorized' receives the value returned for that row.
new_df['categorized'] = new_df['Combined'].apply(safe_get_structured_review)


In [None]:
# Display the per-review extraction results.
new_df['categorized']

In [None]:
# Write the results to a CSV file without the index column.
# NOTE(review): 'categorized' holds Python objects, which are presumably
# written in their string form rather than as JSON; confirm before reading
# the file back.
new_df.to_csv('categorized_redmi6.csv', index=False)

In [None]:
# Display the extraction results once more after saving.
new_df['categorized']