In [1]:
from time import sleep
from random import random
import pandas as pd
import requests
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.parse as urlparse
from urllib.parse import parse_qs

In [2]:
BASE_URL = 'https://www.flipkart.com/'
SEARCH_QUERY = "headphones"
TOP_N_PRODUCTS = 10
REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT = 100

In [3]:
SAMPLE_URL = "https://www.flipkart.com/boat-rockerz-400-bluetooth-headset/product-reviews/itm14d0416b87d55?pid=ACCEJZXYKSG2T9GS&lid=LSTACCEJZXYKSG2T9GSVY4ZIC&marketplace=FLIPKART&page=1"
r = requests.get(SAMPLE_URL)    
soup = BeautifulSoup(r.content, 'html.parser') 
print(soup.prettify()[:500])

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://rukminim1.flixcart.com" rel="preconnect"/>
  <link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app_modules.chunk.905c37.css" rel="stylesheet"/>
  <link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app.chunk.104e9a.css" rel="stylesheet"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content=


In [4]:
rows = soup.find_all('div',attrs={'class':'col _2wzgFH K0kLPL'})
print(f"Count of rows(reviews):{len(rows)}\n\n\n")
# iteration over all blocks
for row in rows:
    # Print a sample row(review html block)
    # print(f"row:\n{row} \n\n")
    
    # finding all rows within the block
    sub_row = row.find_all('div',attrs={'class':'row'})
        
    # extracting text from 1st and 2nd row
    rating = sub_row[0].find('div').text
    print(f"rating:{rating} \n\n")
    
    summary = sub_row[0].find('p').text
    print(f"summary:{summary} \n\n")
    
    review = sub_row[1].find_all('div')[2].text
    print(f"review:{review} \n\n")
    
    location = sub_row[3].find('p',attrs={'class':'_2mcZGG'}).find_all('span')[1].text
    location = "".join(location.split(",")[1:]).strip()
    print(f"location:{location} \n\n")
    date = sub_row[3].find_all('p',attrs={'class':'_2sc7ZR'})[1].text
    print(f"date:{date} \n\n")
    
    
    sub_row_2 = row.find_all('div',attrs={'class':'_1e9_Zu'})[0].find_all('span',attrs={'class':'_3c3Px5'})
    
    upvotes = sub_row_2[0].text
    print(f"upvotes:{upvotes} \n\n")
    
    downvotes = sub_row_2[1].text
    print(f"downvotes:{downvotes} \n\n")
    
    break

Count of rows(reviews):10



rating:5 


summary:Worth every penny 


review:It was nice produt. I like it's design a lot.  It's easy to carry. And.   Looked stylish. 


location:Kadirur 


date:Jan, 2020 


upvotes:3864 


downvotes:294 




In [7]:
def get_popular_product_s_titles_and_urls(search_query : str, popular_products_count_limit : int = None):
    
    search_url = f"{BASE_URL}search?q={search_query}&sort=popularity"
    search_response = requests.get(search_url)
    
    # Pause the loop for 1-3 seconds to simulate natural setting not overwhelm the server with back to back requests without any pause
    # sleep(randint(1,3))
    
    search_html_soup = BeautifulSoup(search_response.content, 'html.parser')
    search_results_products = search_html_soup.find_all('div',attrs={'class':'_4ddWXP'})
    
    product_titles, product_urls = [],[]
    
    product_count = 0
    
    for product in tqdm(search_results_products, desc="Search Results Iteration", position=0, leave=True):
        
        ad_mention_subrow = product.find("div", attrs={"class":"_4HTuuX"})
        
        is_ad = not not ad_mention_subrow
        
        if not is_ad:
            title_mention_subrow = product.find("a", attrs={"class":"s1Q9rs"})
            
            product_title = title_mention_subrow["title"]
            product_relative_url = title_mention_subrow["href"]
            product_url = urljoin(BASE_URL,product_relative_url)
            
            parsed_url = urlparse.urlparse(product_url)
            parsed_url_path = parsed_url.path
            parsed_url_path_split = parsed_url_path.split("/")
            parsed_url_path_split[2] = "product-reviews"
            parsed_url_path_modified = "/".join(parsed_url_path_split)
            parsed_url_modified = parsed_url._replace(path=parsed_url_path_modified)
            product_url = parsed_url_modified.geturl()
            
            product_titles.append(product_title)
            product_urls.append(product_url)
            
            product_count += 1
            
            if popular_products_count_limit and (product_count >= popular_products_count_limit):
                break
                
    return product_titles, product_urls

In [8]:
product_titles, product_urls = get_popular_product_s_titles_and_urls(SEARCH_QUERY, TOP_N_PRODUCTS);

HBox(children=(HTML(value='Search Results Iteration'), FloatProgress(value=0.0, max=40.0), HTML(value='')))




In [9]:
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["# Products", "# Reviews Per Page", "# Pages", "# Total Reviews Count"]
x.add_row([len(product_urls), 10, REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT, len(product_urls)*10*REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT])
print(x)

+------------+--------------------+---------+-----------------------+
| # Products | # Reviews Per Page | # Pages | # Total Reviews Count |
+------------+--------------------+---------+-----------------------+
|     10     |         10         |   100   |         10000         |
+------------+--------------------+---------+-----------------------+


In [10]:
dataset = []

for idx, url in enumerate(tqdm(product_urls, desc='products')):
    # iterating over review pages
    for i in tqdm(range(1,REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT+1), desc="review pages", position=0, leave=False):
        parsed = urlparse.urlparse(url)
        pid = parse_qs(parsed.query)['pid'][0]
        URL = f"{url}&page={i}"
        
        r = requests.get(URL)
        
        # Pause the loop for 0-1 seconds to simulate natural setting not overwhelm the server with back to back requests without any pause
        sleep(random())
        soup = BeautifulSoup(r.content, 'html.parser') 

        rows = soup.find_all('div',attrs={'class':'col _2wzgFH K0kLPL'})

        for row in rows:

            # finding all rows within the block
            sub_row = row.find_all('div',attrs={'class':'row'})
            rating = sub_row[0].find('div').text
            summary = sub_row[0].find('p').text
            summary = summary.strip()
            review = sub_row[1].find_all('div')[2].text
            review = review.strip()
            location=""
            location_row = sub_row[3].find('p',attrs={'class':'_2mcZGG'})
            if location_row:
                location_row = location_row.find_all('span')
                if len(location_row)>=2:
                    location = location_row[1].text
                    location = "".join(location.split(",")[1:]).strip()
            date = sub_row[3].find_all('p',attrs={'class':'_2sc7ZR'})[1].text

            sub_row_2 = row.find_all('div',attrs={'class':'_1e9_Zu'})[0].find_all('span',attrs={'class':'_3c3Px5'})

            upvotes = sub_row_2[0].text
            downvotes = sub_row_2[1].text

            # appending to data
            dataset.append({'product_id':pid, 'product_title':product_titles[idx], 'rating': rating, 'summary': summary, 'review': review, 'location' : location, 'date' : date, 'upvotes' : upvotes, 'downvotes' : downvotes})

HBox(children=(HTML(value='products'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='review pages'), FloatProgress(value=0.0), HTML(value='')))




ChunkedEncodingError: ("Connection broken: ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)", ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [11]:
df = pd.DataFrame(dataset)

with pd.option_context('display.max_colwidth', -1):
    display(df.head(5))
    display(df.tail(5))

Unnamed: 0,product_id,product_title,rating,summary,review,location,date,upvotes,downvotes
0,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Simply awesome,Amazing product Loved this earbuds Happy to purchasedAwesome design and lightweight White colour looks stunning Good sound quality and base,Bengaluru,1 day ago,0,0
1,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Perfect product!,I am amazed by the sound quality it has. It has a rich crisp sound accompanied by a good bass quality. It is worth recommended,Ghaziabad,1 day ago,0,0
2,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Worth every penny,"I am using it daily, it has a good battery backup. It's just wonderful. I am very happy to get this one at this price.",Ghaziabad,1 day ago,0,0
3,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Fabulous!,"It offers a very good experience at this price point. The build quality of the case is amazing, and also can't ignore its design. It's wonderful.",Ghaziabad,1 day ago,0,0
4,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Simply awesome,I am amazed by the sound quality it offers. It has a very rich sound. Feels so comfortable to my ears. Loved these a lot.,Ghaziabad,1 day ago,0,0


Unnamed: 0,product_id,product_title,rating,summary,review,location,date,upvotes,downvotes
98,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Terrific purchase,I really liked my Boult Airbass X50. The sound quality is amazing with excellent high medium and low. The bass is also good. When you start listening to music or movie you fall in love with your earbuds.It has 4 mics and good noise cancelling which helps to take call even in noisy places. The design is also fancy and battery backup with the use of case just make it amazing. Boult provided two additional ear tips so that we can change it according to our comfort.It's made in India. So feel...,Ghaziabad,2 days ago,0,0
99,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Awesome,Very super Airbass music and sound good,Ghaziabad,2 days ago,0,0
100,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Just wow!,"Best product you can buy in this range. Sound quality is good, design is fabulous.",Gurgaon,2 days ago,0,0
101,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Just wow!,Sound quality is good with effective bass.Both buds are well fitted.Charging is fine.Overall Good purchase,Gurugram,2 days ago,0,0
102,ACCGJHCDF45AFUYY,"Boult Audio Audio Airbass X50 with Quad Mic ENC, 40H Playtime with Gaming Mode On Bluetooth Headset",5,Simply awesome,"Light weight, noise cancellation is good.. bass is ok.. overall it's a good product.. no pain in ear even after using it for hours and you will forget you had ear buds in your years",New Delhi,2 days ago,0,0


In [12]:
df.to_csv("flipkart_reviews_dataset.csv", index=False)