# Web Scraping

Web scraping is one of the most widely used ways to collect data. In simple terms, it means fetching the HTML code of a webpage and scanning it for the data you need.

In [None]:
from time import sleep
from random import random
import pandas as pd
import requests
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.parse as urlparse
from urllib.parse import parse_qs

In [None]:
# Scraper configuration
BASE_URL = "https://www.flipkart.com/"  # site root: the search URL and relative product links are built from it
SEARCH_QUERY = 'headphones'  # search term whose popular products are scraped
TOP_N_PRODUCTS = 10  # how many popular products to collect reviews for
REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT = 100  # review pages fetched per product; each page holds 10 reviews

# Requesting Desired Webpage

In [None]:
# Download a single review page and parse it, so we can look at the raw HTML we will scrape.
SAMPLE_URL = "https://www.flipkart.com/boat-rockerz-400-bluetooth-headset/product-reviews/itm14d0416b87d55?pid=ACCEJZXYKSG2T9GS&lid=LSTACCEJZXYKSG2T9GSVY4ZIC&marketplace=FLIPKART&page=1"

# requests never times out by default; pass an explicit timeout (seconds) so a
# stalled connection raises instead of hanging the kernel indefinitely.
r = requests.get(SAMPLE_URL, timeout=30)
soup = BeautifulSoup(r.content, 'html.parser')

# Only the first 500 characters of the pretty-printed document are shown.
print(soup.prettify()[:500])

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://rukminim1.flixcart.com" rel="preconnect"/>
  <link href="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/css/app.chunk.5a191e.css" rel="stylesheet"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content="102988293558" property="fb:page_id"/>
  <meta content="658873552,624500995,100000233612389" property="fb:admins"/>
  <meta content="n


If you know HTML, this might look familiar.  
Next we'll see how to get our data.

# Test for a single product review page

In [None]:
# Collect every review container found on the sample page.
## As a CSS selector, 'col._2wzgFH.K0kLPL' names three classes: 'col', '_2wzgFH' and 'K0kLPL'.
## In the HTML they are written as the single attribute value 'col _2wzgFH K0kLPL',
## which is the string matched below.

rows = soup.find_all('div', class_='col _2wzgFH K0kLPL')
print(f"Count of rows(reviews):{len(rows)}\n\n\n")

# Only one review is inspected in this demo, so iterate over just the first block.
for row in rows[:1]:
    # Uncomment to dump the raw HTML of the review block:
    # print(f"row:\n{row} \n\n")

    # All <div class="row"> elements nested inside this review block
    sub_row = row.find_all('div', class_='row')

    # First row: star rating (first <div>) and the review title (first <p>)
    heading_row = sub_row[0]
    rating = heading_row.find('div').text
    print(f"rating:{rating} \n\n")

    summary = heading_row.find('p').text
    print(f"summary:{summary} \n\n")

    # Second row: the review body is the text of its third nested <div>
    body_divs = sub_row[1].find_all('div')
    review = body_divs[2].text
    print(f"review:{review} \n\n")

    # Fourth row: reviewer details. The second <span> of the '_2mcZGG' paragraph
    # is comma-separated; everything after the first comma is kept as the location.
    reviewer_spans = sub_row[3].find('p', class_='_2mcZGG').find_all('span')
    location = reviewer_spans[1].text
    location_parts = location.split(",")[1:]
    location = "".join(location_parts).strip()
    print(f"location:{location} \n\n")

    # The second '_2sc7ZR' paragraph of the fourth row carries the review date
    date_paragraphs = sub_row[3].find_all('p', class_='_2sc7ZR')
    date = date_paragraphs[1].text
    print(f"date:{date} \n\n")

    # Vote counters: the '_3c3Px5' spans of the first '_1e9_Zu' container,
    # read as upvotes first, downvotes second
    votes_container = row.find_all('div', class_='_1e9_Zu')[0]
    sub_row_2 = votes_container.find_all('span', class_='_3c3Px5')

    upvotes = sub_row_2[0].text
    print(f"upvotes:{upvotes} \n\n")

    downvotes = sub_row_2[1].text
    print(f"downvotes:{downvotes} \n\n")

Count of rows(reviews):10



rating:5 


summary:Worth every penny 


review:It was nice produt. I like it's design a lot.  It's easy to carry. And.   Looked stylish. 


location:Kadirur 


date:Jan, 2020 


upvotes:1615 


downvotes:118 




# Search based Product URL Discovery

In [None]:
def get_popular_product_s_titles_and_urls(search_query : str, popular_products_count_limit : int = None, request_timeout : float = 30):
    """Search the site for `search_query` and return the popular products' titles and review-page URLs.

    The search results page (sorted by popularity) is downloaded once and every
    result card is inspected. Cards carrying the ad marker are skipped. For the
    remaining cards the product link is resolved against BASE_URL and its third
    path component is replaced with "product-reviews", turning the product URL
    into the URL of that product's review listing.

    Parameters
    ----------
    search_query : str
        Free-text search term. It is URL-encoded before being sent.
    popular_products_count_limit : int, optional
        Stop after this many products have been collected. `None` (or 0)
        means "no limit": every non-ad result on the page is returned.
    request_timeout : float, optional
        Seconds to wait for the search request before `requests` raises a
        timeout error.

    Returns
    -------
    (list of str, list of str)
        Two parallel lists: product titles and the matching review-page URLs.
    """
    # URL-encode the parameters so spaces, '&', '#', ... in the search term
    # cannot corrupt the query string.
    query_string = urlparse.urlencode({"q": search_query, "sort": "popularity"})
    search_url = f"{BASE_URL}search?{query_string}"

    # Without an explicit timeout a stalled connection would block forever.
    search_response = requests.get(search_url, timeout=request_timeout)

    search_html_soup = BeautifulSoup(search_response.content, 'html.parser')
    search_results_products = search_html_soup.find_all('div', attrs={'class': '_4ddWXP'})

    product_titles, product_urls = [], []

    for product in tqdm(search_results_products, desc="Search Results Iteration", position=0, leave=True):

        # Result cards that contain this element are treated as ads and ignored.
        if product.find("div", attrs={"class": "_4HTuuX"}) is not None:
            continue

        # The title anchor holds both the product name and its relative link.
        title_mention_subrow = product.find("a", attrs={"class": "s1Q9rs"})
        if title_mention_subrow is None:
            # Card without the expected anchor: nothing to extract.
            continue

        product_title = title_mention_subrow.get("title")
        product_relative_url = title_mention_subrow.get("href")
        if product_title is None or product_relative_url is None:
            continue

        parsed_url = urlparse.urlparse(urljoin(BASE_URL, product_relative_url))

        # path.split("/") yields ['', <first>, <second>, ...]; index 2 is the
        # second path component, which is swapped for "product-reviews".
        path_segments = parsed_url.path.split("/")
        if len(path_segments) < 3:
            # Path too short to be rewritten into a review URL.
            continue
        path_segments[2] = "product-reviews"
        product_url = parsed_url._replace(path="/".join(path_segments)).geturl()

        product_titles.append(product_title)
        product_urls.append(product_url)

        if popular_products_count_limit and len(product_titles) >= popular_products_count_limit:
            break

    return product_titles, product_urls

## Collect Product Page URLs for Top 10 Popular Products for 'Headphones' search query

### Specify Search Query and Popular Product Count Limit (optional)

In [None]:
# Titles and review-page URLs of the most popular products for the configured search query
product_titles, product_urls = get_popular_product_s_titles_and_urls(
    search_query=SEARCH_QUERY,
    popular_products_count_limit=TOP_N_PRODUCTS,
)

HBox(children=(FloatProgress(value=0.0, description='Search Results Iteration', max=40.0, style=ProgressStyle(…




# Iterating over multiple products and multiple pages

In [None]:
from prettytable import PrettyTable

# Summarise how many reviews the crawl below can collect at most.
reviews_per_page = 10
product_total = len(product_urls)
max_review_total = product_total * reviews_per_page * REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT

summary_table = PrettyTable()
summary_table.field_names = ["# Products", "# Reviews Per Page", "# Pages", "# Total Reviews Count"]
summary_table.add_row([
    product_total,
    reviews_per_page,
    REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT,
    max_review_total,
])
print(summary_table)

+------------+--------------------+---------+-----------------------+
| # Products | # Reviews Per Page | # Pages | # Total Reviews Count |
+------------+--------------------+---------+-----------------------+
|     10     |         10         |   100   |         10000         |
+------------+--------------------+---------+-----------------------+


In [None]:
def _extract_review_fields(row):
    """Return the review fields found in one review container tag as a dict.

    `row` is one of the 'col _2wzgFH K0kLPL' <div> elements of a review page.
    The returned dict has the keys 'rating', 'summary', 'review', 'location',
    'date', 'upvotes' and 'downvotes'; every value is the text taken from the
    page (no numeric conversion is done). 'location' is an empty string when
    the reviewer paragraph or its second <span> is missing.
    """
    # All <div class="row"> elements nested inside the review block
    sub_row = row.find_all('div', attrs={'class': 'row'})

    # 1st row: rating and review title; 2nd row: review body (third nested <div>)
    rating = sub_row[0].find('div').text
    summary = sub_row[0].find('p').text.strip()
    review = sub_row[1].find_all('div')[2].text.strip()

    # 4th row: reviewer details. The second <span> is comma-separated and
    # everything after the first comma is kept as the location.
    location = ""
    location_row = sub_row[3].find('p', attrs={'class': '_2mcZGG'})
    if location_row is not None:
        location_spans = location_row.find_all('span')
        if len(location_spans) >= 2:
            location = "".join(location_spans[1].text.split(",")[1:]).strip()

    # The second '_2sc7ZR' paragraph of the 4th row carries the review date
    date = sub_row[3].find_all('p', attrs={'class': '_2sc7ZR'})[1].text

    # Vote counters: upvotes first, downvotes second
    vote_spans = row.find_all('div', attrs={'class': '_1e9_Zu'})[0].find_all('span', attrs={'class': '_3c3Px5'})

    return {
        'rating': rating,
        'summary': summary,
        'review': review,
        'location': location,
        'date': date,
        'upvotes': vote_spans[0].text,
        'downvotes': vote_spans[1].text,
    }


# Seconds to wait for a review page before requests raises a timeout error;
# without it a stalled connection would block the whole crawl forever.
REVIEW_REQUEST_TIMEOUT = 30

dataset = []

for idx, url in enumerate(tqdm(product_urls, desc='products')):
    # The product id comes from the product URL's query string, so it is the
    # same for every review page: parse it once per product, not once per page.
    pid = parse_qs(urlparse.urlparse(url).query)['pid'][0]

    # iterating over review pages (page numbers start at 1)
    for i in tqdm(range(1, REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT + 1), desc="review pages", position=0, leave=False):
        # url already carries a query string (pid was read from it), hence '&'
        page_url = f"{url}&page={i}"

        r = requests.get(page_url, timeout=REVIEW_REQUEST_TIMEOUT)

        # Pause for 0-1 seconds between requests so the server is not hit
        # with back-to-back requests
        sleep(random())
        soup = BeautifulSoup(r.content, 'html.parser')

        rows = soup.find_all('div', attrs={'class': 'col _2wzgFH K0kLPL'})

        for row in rows:
            # One record per review; key order defines the DataFrame column order
            dataset.append({
                'product_id': pid,
                'product_title': product_titles[idx],
                **_extract_review_fields(row),
            })

HBox(children=(FloatProgress(value=0.0, description='products', max=10.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='review pages', style=ProgressStyle(description_width='ini…




# View Sample set of reviews that we collected

In [None]:
# One row per scraped review
df = pd.DataFrame(dataset)

# Show the first and last five reviews without truncating long text columns.
# None means "no column width limit"; the former -1 spelling has been
# deprecated since pandas 1.0 in favour of None.
with pd.option_context('display.max_colwidth', None):
    display(df.head(5))
    display(df.tail(5))

Unnamed: 0,product_id,product_title,rating,summary,review,location,date,upvotes,downvotes
0,ACCFR3Q77R6RRGAC,OnePlus Bullets Wireless Z Bluetooth Headset,5,Terrific purchase,"first of all with mi 18 watt charger,it got full charged in 20mins.that was amazing and 16-18hrs playtime is best at this price range.I searched for a wireless earphone and bought many but finally settled with this.sound quality is good but build quality is premium.overall i would say it is the best earphone to go with and I guarantee you will not regret.drop like if you find it useful.",Kolkata,9 months ago,1405,169
1,ACCFR3Q77R6RRGAC,OnePlus Bullets Wireless Z Bluetooth Headset,5,Fabulous!,"I am using this product from 2 daysLet me share with you my experienceThis is my genuine and humble review 1st the delivery of flipkart is too good and then awesome packingThen comming to productThe experience of sound is really great we catch every instrumental sounds clearly and be joyful while listening and my rating is 4/5And then about bass, actually I expected too about bass boost but it not reached then that just good actually but I satisfied with that and my rating is 3/5Then ...",Nalgonda District,9 months ago,564,74
2,ACCFR3Q77R6RRGAC,OnePlus Bullets Wireless Z Bluetooth Headset,5,Worth every penny,LONG BUT WORTH READING.Honest review after one week. Just go for it. Dont get affected by negative reviews. Totally worth every penny and far ahead of its competition. The battery backup is splendid. Sound quality is very good nothing to complain here. Comfortable in the ears. Connectivity perfect. I haven't experienced any lag with normal media usage. I am a student and watch video lectures at sometimes 2x speed and never once I faced voice lag while changing video speed. Just go for it. Ju...,Arrah,9 months ago,1407,206
3,ACCFR3Q77R6RRGAC,OnePlus Bullets Wireless Z Bluetooth Headset,5,Terrific,writing review after using it for two days ..first of all its having best battery backup among other wireless headphones.only drained 20% battery aftr usage of some 8hrs.. build quality is above the mark.. bass wise its lil low.. bt its ok .. go for it .. it wont disappoint you .. a great thumbs-up for flipkart delivery teams👍🏻👍🏻,Kollam District,10 months ago,164,24
4,ACCFR3Q77R6RRGAC,OnePlus Bullets Wireless Z Bluetooth Headset,5,Simply awesome,The bass provided is a decent one and the sound clarity is pretty good.And all the other features are exceptional.U can surely go for this one for the price range..will not regeret.,Ernakulam District,10 months ago,941,186


Unnamed: 0,product_id,product_title,rating,summary,review,location,date,upvotes,downvotes
9381,ACCFGYHHKJ94GN6A,BoAt Bassheads 103 Blue Wired Headset,4,Delightful,Okay,Bengaluru,13 days ago,0,0
9382,ACCFGYHHKJ94GN6A,BoAt Bassheads 103 Blue Wired Headset,4,Really Nice,Good🌸🌸,Lucknow,13 days ago,0,0
9383,ACCFGYHHKJ94GN6A,BoAt Bassheads 103 Blue Wired Headset,4,Good choice,Nice,Rayagada District,13 days ago,0,0
9384,ACCFGYHHKJ94GN6A,BoAt Bassheads 103 Blue Wired Headset,5,Must buy!,My favourite product boat thinks you boat,West Godavari District,13 days ago,0,0
9385,ACCFGYHHKJ94GN6A,BoAt Bassheads 103 Blue Wired Headset,5,Classy product,Awesome 👍,Pune,13 days ago,0,0


In [None]:
# The number of rows in the frame is the number of reviews collected
count_reviews = len(df)
print(f"Count of reviews:{count_reviews}")

Count of reviews:9386


# Serialize the dataframe to a csv file

In [None]:
# Write the collected reviews to disk; index=False keeps the row index out of the file
df.to_csv(path_or_buf="./flipkart_reviews_dataset.csv", index=False)