# Web Scraping: Flipkart Reviews

**Objective:** Scrape 100-200 reviews (Review Text, Rating, Date) for a single product from Flipkart.
**Output:** Save to `../data/raw/scraped_reviews.csv`.

> **Note:** This scraped data is used **only for validation**, not for model training.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from pathlib import Path
import os

# Path setup
# The notebook may be launched from the project root or from its
# "notebooks" folder, so the root is derived from the working directory.
current_dir = Path(os.getcwd())
PROJECT_ROOT = (
    # Launched from inside notebooks/: the root is one level up.
    current_dir.parent
    if current_dir.name == "notebooks"
    # Launched from the root itself: it contains a notebooks/ entry.
    else current_dir
    if (current_dir / "notebooks").exists()
    # Anywhere else: fall back to the parent of the working directory.
    else Path("..").resolve()
)

# Raw scraped data lives in <root>/data/raw; create it on first run.
DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")
DATA_DIR.mkdir(exist_ok=True, parents=True)
OUTPUT_FILE = DATA_DIR.joinpath("scraped_reviews.csv")

# Echo the resolved locations so a wrong working directory is spotted early.
print(
    f"Project Root: {PROJECT_ROOT}",
    f"Data Directory: {DATA_DIR}",
    f"Output File: {OUTPUT_FILE}",
    sep="\n",
)

In [None]:
# Target URL (Flipkart product reviews page).
# It already carries a query string; the scraper appends the page number to it.
BASE_URL = "https://www.flipkart.com/tibra-collection-men-kurta-pyjama-set/product-reviews/itm82cbf59f5a028?pid=ETHFXSKHQPGZESBN&lid=LSTETHFXSKHQPGZESBNJOPCLT&marketplace=FLIPKART"

# Headers to mimic a real browser.
# "br" (Brotli) is intentionally NOT advertised in Accept-Encoding:
# requests/urllib3 can only decode Brotli when an optional brotli package is
# installed. Without it a Brotli-compressed response reaches BeautifulSoup as
# undecoded bytes and every selector silently finds nothing.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

In [None]:
# CSS classes of the <div> that wraps one review, tried in order.
# Flipkart changes these when it updates its UI, so they may need adjusting.
_REVIEW_BLOCK_CLASSES = (
    "_27M-vq",                # Common wrapper for a review
    "col _2wzgFH K0kLPL",     # Older but still seen
)

# Column order of the returned DataFrame (also used for an empty result).
_REVIEW_COLUMNS = ("rating", "review_text", "date", "source")


def _page_url(base_url, page):
    """Return *base_url* with a ``page=<page>`` query parameter appended."""
    # Use "?" when the base URL has no query string yet, "&" otherwise.
    separator = "&" if "?" in base_url else "?"
    return f"{base_url}{separator}page={page}"


def _find_review_blocks(soup):
    """Return the review wrapper tags found with the first matching class."""
    for css_class in _REVIEW_BLOCK_CLASSES:
        blocks = soup.find_all("div", class_=css_class)
        if blocks:
            return blocks
    return []


def _parse_review_block(card):
    """Extract one review dict from a wrapper tag, or None if incomplete."""
    rating_tag = card.find("div", class_="_3LWZlK")
    text_tag = card.find("div", class_="t-ZTKy")
    if rating_tag is None or text_tag is None:
        return None

    rating = rating_tag.text.strip()
    # Collapsed reviews carry a trailing 'READ MORE' label; drop it.
    review_text = text_tag.get_text(separator=" ").replace("READ MORE", "").strip()
    if not rating or not review_text:
        return None

    # The date sits in a <p class="_2sc7ZR">; there are usually two of these
    # per review (name, then date), so the second one is taken.
    meta_tags = card.find_all("p", class_="_2sc7ZR")
    date_str = meta_tags[1].text.strip() if len(meta_tags) >= 2 else None

    return {
        "rating": rating,
        "review_text": review_text,
        "date": date_str,
        "source": "Flipkart",
    }


def scrape_flipkart_reviews(base_url, target_count=150, timeout=30):
    """Scrape paginated Flipkart reviews until *target_count* is reached.

    Pages are requested one after another (page 1, 2, ...) with a random
    2-5 second pause between requests. Scraping stops early when a request
    fails, a page returns a non-200 status, or a page yields no reviews.

    Args:
        base_url: Reviews page URL; the page number is appended as a
            ``page`` query parameter.
        target_count: Minimum number of reviews to collect. Whole pages are
            added at a time, so the result may contain slightly more.
        timeout: Seconds to wait for each HTTP request before giving up, so
            a stalled connection cannot hang the scrape indefinitely.

    Returns:
        A DataFrame with the columns ``rating``, ``review_text``, ``date``
        and ``source``. Ratings and dates are kept as the text shown on the
        page. The frame is empty (but keeps its columns) when nothing was
        collected.
    """
    reviews_data = []
    page = 1

    while len(reviews_data) < target_count:
        url = _page_url(base_url, page)
        print(f"Scraping page {page}: {url}...")

        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as e:
            # Network problems end the scrape but keep what was collected.
            print(f"Error on page {page}: {e}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        blocks = _find_review_blocks(soup)
        if not blocks:
            print("Warning: Standard review containers not found. Output might be empty. Check CSS selectors.")

        # Only reviews that have both a rating and a text are kept.
        page_reviews = [
            review
            for review in map(_parse_review_block, blocks)
            if review is not None
        ]
        reviews_data.extend(page_reviews)
        found_on_page = len(page_reviews)

        print(f"Found {found_on_page} reviews on page {page}. Total: {len(reviews_data)}")

        if found_on_page == 0:
            print("No reviews found on this page. Ending scrape.")
            break

        page += 1
        if len(reviews_data) < target_count:
            # Polite delay, only needed when another page will be requested.
            time.sleep(random.uniform(2, 5))

    return pd.DataFrame(reviews_data, columns=list(_REVIEW_COLUMNS))

In [None]:
# Run Scraping
# Collect the reviews and, if any were found, persist them as CSV.
if __name__ == "__main__":
    print("Starting scraping...")
    df_reviews = scrape_flipkart_reviews(BASE_URL, target_count=150)

    if not df_reviews.empty:
        # Write first and report afterwards, so "Saved" is only printed
        # once the file has actually been written.
        df_reviews.to_csv(OUTPUT_FILE, index=False)
        print(f"Saved {len(df_reviews)} reviews to {OUTPUT_FILE}")
        print(df_reviews.head())
    else:
        print("No reviews collected. Please check the selectors or URL.")