# Ratings Prediction: Data Collection
#### Objective:
Scrape customer reviews and ratings for laptops, phones, headphones, smart watches, professional cameras, printers, monitors, home theaters and routers from different e-commerce websites.

**Required columns:**
1. Review of the product.
2. Rating of the product.

In [1]:
#importing required libraries
import pandas as pd
import re
import time, sys
import tqdm.notebook as tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

# Scraping Ratings & Reviews from `Flipkart.com`

In [2]:
#function for initializing webdriver with chromedriver.exe
def start_driver():
    """Create and return a new Chrome webdriver session.

    'chromedriver.exe' is passed as the first positional argument of
    webdriver.Chrome.
    NOTE(review): presumably this is the driver executable path as accepted by
    Selenium 3.x -- confirm against the installed Selenium version.
    """
    return webdriver.Chrome('chromedriver.exe')
#end of function start_driver



#function for initializing driver with https://flipkart.com and searching a value
def init_flipkart(search_key,driver):
    """Open flipkart.com in `driver`, dismiss the login pop-up and run a search.

    search_key: text typed into the site's search box.
    driver: Selenium WebDriver used for the session.

    Every step waits up to 5 seconds for its element to become clickable;
    nothing is caught here, so a failed wait propagates to the caller.
    """
    driver.get('https://flipkart.com')
    wait = WebDriverWait(driver,5)

    def clickable(xpath):
        #block until the element located by xpath can be clicked, then hand it back
        return wait.until(ec.element_to_be_clickable((By.XPATH,xpath)))

    #close the login pop-up
    clickable('//button[@class="_2KpZ6l _2doB4z"]').click()

    #type the search key into the search box
    clickable('//input[@class="_3704LK"]').send_keys(search_key)

    #press the search button
    clickable('//button[@class="L0Z3Pu"]').click()
    

#end of function init_flipkart


#function for fetching all the product urls for searched product
def get_product_urls(driver,max_pages=42):
    """Collect product-page URLs from the Flipkart search results open in `driver`.

    Reads the product links on the current result page, clicks "Next" and
    repeats until there is no clickable "Next" link or `max_pages` pages have
    been read.

    Parameters:
        driver: Selenium WebDriver already showing a Flipkart search result page.
        max_pages: maximum number of result pages to read. The default (42) is
            the number of pages the former hard-coded check
            (`page_counter > 41`) actually allowed.

    Returns:
        list of href strings in the order they were found (duplicates are kept).
    """
    #two alternative result-page markups; the second is tried only when the first times out
    link_xpaths = (
        '//div[@class="_2kHMtA"]/a[@class="_1fQZEK"]',
        '//div[@class="_4ddWXP"]/a[@class="_2rpwqI"]',
    )

    product_urls = []
    page_counter = 0
    while 1:
        #getting product url containers
        links = []
        for xpath in link_xpaths:
            try:
                links = WebDriverWait(driver,5).until(ec.presence_of_all_elements_located((By.XPATH,xpath)))
                break
            except TimeoutException:
                #this markup is not on the page, try the next one
                continue
            except Exception:
                #any other failure: stop looking for links on this page
                break

        #a page matching neither markup used to raise from inside the except
        #handler and throw away every url collected so far
        if not links:
            print("No more links found")

        #scraping product url from containers
        for link in links:
            try:
                href = link.get_attribute('href')
            except Exception:
                #element is no longer usable; skip the rest of this page
                break

            #an anchor without href would later crash driver.get(url)
            if not href:
                continue

            product_urls.append(href)
            sys.stdout.write(f"\rFoundURL: [{len(product_urls)}]")
            sys.stdout.flush()

        #stop once the page budget is used up
        page_counter += 1
        if page_counter >= max_pages:
            break

        #goto next page if available
        try:
            WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//a[@class="_1LKTO3"]/span[text()="Next"]'))).click()
        except Exception:
            #next page is not available (Exception, not bare except, so Ctrl-C still works)
            break

    #end of while 1
    return product_urls

#end of function get_product_urls


#function to scrape reviews and ratings from given product url
def _flipkart_page_reviews(driver,card_xpath):
    """Read rating, title and description text of the review cards on the current page.

    card_xpath is the XPath of one review card; the three fields are located
    relative to it.

    Returns a (ratings, titles, descriptions) tuple of equally long lists of
    strings, or None when the elements could not be read or their counts
    differ (appending such a page would misalign the three columns).
    """
    try:
        rating_els = WebDriverWait(driver,5).until(ec.presence_of_all_elements_located((By.XPATH,card_xpath + '/div[1]/div')))
        title_els = WebDriverWait(driver,5).until(ec.presence_of_all_elements_located((By.XPATH,card_xpath + '/div[1]/p')))
        description_els = WebDriverWait(driver,5).until(ec.presence_of_all_elements_located((By.XPATH,card_xpath + '/div[2]/div/div')))

        #reading .text inside the try so an unreadable element skips the page instead of aborting the run
        ratings = [element.text for element in rating_els]
        titles = [element.text for element in title_els]
        descriptions = [element.text.replace('\n',' ') for element in description_els]
    except Exception:
        sys.stdout.write("\rScraping Containers: TimeOut!")
        return None

    if not (len(ratings) == len(titles) == len(descriptions)):
        sys.stdout.write("\r Error: Invalid Array Length!")
        return None

    return ratings, titles, descriptions


def get_rating_reviews(urls,driver,limit=False):
    """Scrape ratings, review titles and review descriptions from Flipkart product pages.

    Parameters:
        urls: iterable of product page URLs.
        driver: Selenium WebDriver used for scraping. It is closed before this
            function returns, so callers need a new driver afterwards.
        limit: maximum number of records to return; a falsy value (the
            default) means no limit.

    Returns:
        pandas DataFrame with the columns "Ratings", "Review Titles" and
        "Review Descriptions", truncated to `limit` rows when a limit is given.
    """
    ratings = []
    review_titles = []
    review_descriptions = []

    def limit_reached():
        #True once enough records are collected (never True without a limit)
        return bool(limit) and len(ratings) >= limit

    def store(page):
        #append one page of results; the three lists always grow together
        if page is not None:
            ratings.extend(page[0])
            review_titles.extend(page[1])
            review_descriptions.extend(page[2])

    def report_progress(url):
        if limit:
            sys.stdout.write("\rScrapeLIMIT[%d/%d] :: URL: %s" % (len(ratings),limit,url))
        else:
            sys.stdout.write("\rScrapedDATA[%d] :: URL: %s" % (len(ratings),url))

    #scraping ratings & reviews of products
    for url in tqdm.tqdm(urls,desc="Processing"):
        if limit_reached():
            break

        #initialize driver with url
        driver.get(url)
        try:
            #checking for total number of reviews available for current url
            num_reviews = WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//span[@class="_2_R_DZ"]/span/span[3]')))
            #drop every non-digit first: taking only the first digit run would
            #turn a count written with thousands separators ("1,234") into 1
            total_reviews = int(re.sub(r'[^0-9]','',num_reviews.text))
        except Exception:
            sys.stdout.write("\rNumber of Reviews: TimeOut!")
            continue

        #if total reviews is more than 3 than click on All [XX] reviews link
        if total_reviews > 3:      #reviews IF
            try:
                WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//div[@class="_3UAT2v _16PBlm"]/span[contains(text(),"All")]'))).click()
                time.sleep(2)
            except Exception:
                sys.stdout.write("\rAll [XX] Reviews: TimeOut!")

            #walk the review pages until the limit is hit or there is no next page
            while not limit_reached():
                store(_flipkart_page_reviews(driver,'//div[@class="col _2wzgFH K0kLPL"]'))
                report_progress(url)

                #move to next review page
                try:
                    WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//a[@class="_1LKTO3"]/span[text()="Next"]'))).click()
                    time.sleep(3)
                except Exception:
                    break
            #end of while
        else:
            #few reviews: read them from the page that is already open
            store(_flipkart_page_reviews(driver,'//div[@class="col _2wzgFH"]'))
            report_progress(url)
        #end of reviewsIF
    #end of forLoop

    #creating dataframe of scraped data
    df = pd.DataFrame({
        "Ratings": ratings,
        "Review Titles": review_titles,
        "Review Descriptions": review_descriptions
    })

    #closing driver
    driver.close()

    #returning dataframe object
    if limit:
        return df[:limit]
    return df
#end of function get_rating_reviews()


## Scraping ratings & reviews for `Laptops` from flipkart

In [3]:
#start driver
driver = start_driver()

#initialize search box with laptops
init_flipkart('laptops',driver)

#fetch the all related product urls
urls = get_product_urls(driver)

FoundURL: [495]

In [4]:
#scrape the ratings & reviews from urls and store it as dataframe object
laptop_df_flipkart = get_rating_reviews(urls,driver)

Processing:   0%|          | 0/495 [00:00<?, ?it/s]

ScrapedDATA[30631] :: URL: https://www.flipkart.com/avita-liber-core-i5-8th-gen-8-gb-256-gb-ssd-windows-10-home-ns13a2in199p-thin-light-laptop/p/itme90070f891149?pid=COMFQGZEZYHEGQAX&lid=LSTCOMFQGZEZYHEGQAXRHE7DY&marketplace=FLIPKART&q=laptops&store=6bo%2Fb5g&srno=s_26_578&otracker=search&otracker1=search&fm=SEARCH&iid=7b43357e-3770-4784-86b0-421662122f52.COMFQGZEZYHEGQAX.SEARCH&ppt=sp&ppn=sp&ssid=oadwro8ocw0000001624100894696&qH=c06ea84a1e3dc3c6100894696&qH=c06ea84a1e3dc3c6e3dc3c6100894696&qH=c06ea84a1e3dc3c6c6a84a1e3dc3c63c64a1e3dc3c63c6e3dc3c63c6

In [5]:
#interpreting scraped data
laptop_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Worth every penny,An affordable beast ! Pros: 1. Incredible perf...
1,5,Awesome,Best laptop in this price segment.. battery is...
2,4,Really Nice,To be honest Pro's 1) RGB keyboard 2)144Hzs wi...
3,4,Value-for-money,The Laptop is a masterpiece with stunnig desig...
4,5,Highly recommended,Good laptop but customer care folks are real d...
...,...,...,...
30626,5,Wonderful,Super it looks like Mac book
30627,5,Awesome,It is very slim laptop with light weight..It's...
30628,5,Highly recommended,"Highly recommended.delivery was good,I got the..."
30629,5,Awesome,Product is awesome. Feel like a mac only touch...


In [56]:
#copying scraped ratings & reviews to another dataframe object, so laptop_df_flipkart itself is left unchanged
df1 = laptop_df_flipkart.copy()
print(f"Original Shape: {df1.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df1.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df1.drop_duplicates(inplace=True)
print(f"New Shape: {df1.shape}")

Original Shape: (30631, 3)
Total Number of Duplicated Rows: 17130
New Shape: (13501, 3)


In [6]:
#saving scraped data to csv file
laptop_df_flipkart.to_csv('flipkart-ratings-reviews-laptop.csv')

## Scraping for ratings & reviews of `Phones` from Flipkart for 2000 records

In [7]:
#start driver
driver = start_driver()

#initialize search box with phones
init_flipkart('phones',driver)

#fetch the all related product urls
phone_urls = get_product_urls(driver)

FoundURL: [756]

In [8]:
#scrape the ratings & reviews from urls and store it as dataframe object
phone_df_flipkart = get_rating_reviews(phone_urls,driver,limit=2000)

Processing:   0%|          | 0/756 [00:00<?, ?it/s]

ScrapeLIMIT[2010/2000] :: URL: https://www.flipkart.com/realme-c12-power-silver-32-gb/p/itm4854d77becc77?pid=MOBFUEPQEEDFBHCE&lid=LSTMOBFUEPQEEDFBHCEUX6BAW&marketplace=FLIPKART&q=phones&store=tyy%2F4io&srno=s_1_1&otracker=search&otracker1=search&fm=SEARCH&iid=0332fdb1-0873-450e-a102-5acff127da7b.MOBFUEPQEEDFBHCE.SEARCH&ppt=hp&ppn=homepage&ssid=bpg9tjk6kg0000001624117051082&qH=28388ea49f54c5b8

In [9]:
#interpreting scraped data
phone_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Classy product,Very good mobile phone in this price range. It...
1,4,Pretty good,Uncompetitive battery power. Fine CAMERA but n...
2,5,Fabulous!,"Very good mobile I am very happy, its camera i..."
3,5,Terrific purchase,"Very good budget phone , its good affordable s..."
4,4,Wonderful,Bought it for my mom. She really likes it. The...
...,...,...,...
1995,5,Super!,Very nice
1996,5,Brilliant,Very very very very very good product ☺️☺️☺️☺️
1997,4,Good choice,Nice product
1998,5,Highly recommended,Very nice


In [58]:
#copying scraped ratings & reviews to another dataframe object, so phone_df_flipkart itself is left unchanged
df2 = phone_df_flipkart.copy()
print(f"Original Shape: {df2.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df2.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df2.drop_duplicates(inplace=True)
print(f"New Shape: {df2.shape}")

Original Shape: (2000, 3)
Total Number of Duplicated Rows: 345
New Shape: (1655, 3)


In [10]:
#saving scraped data to CSV file
phone_df_flipkart.to_csv('flipkart-ratings-reviews-phones.csv')

## Scraping for ratings & reviews of `Smart Watches` from Flipkart for 2000 records

In [11]:
#start driver
driver = start_driver()

#initialize search box with smart watches
init_flipkart('smart watches',driver)

#fetch all the related product urls
smart_watches_urls = get_product_urls(driver)

FoundURL: [540]

In [12]:
#scrape the ratings & reviews from urls and store it as dataframe object
smart_watches_df_flipkart = get_rating_reviews(smart_watches_urls,driver,limit=2000)

Processing:   0%|          | 0/540 [00:00<?, ?it/s]

ScrapeLIMIT[2010/2000] :: URL: https://www.flipkart.com/cyxus-4g-mobile-watch-smartwatch/p/itm60624e699bc90?pid=SMWFGA23VTRCYTK5&lid=LSTSMWFGA23VTRCYTK5WDNI2C&marketplace=FLIPKART&q=smart+watches&store=ajy%2Fbuh&srno=s_3_49&otracker=search&otracker1=search&fm=SEARCH&iid=da4af797-667e-4727-bdd3-c2c5d18089b8.SMWFGA23VTRCYTK5.SEARCH&ppt=sp&ppn=sp&ssid=2wrxl0oq1s0000001624118086247&qH=87968ec020b2016bn=sp&ssid=2wrxl0oq1s0000001624118086247&qH=87968ec020b2016b6247&qH=87968ec020b2016b

In [13]:
#interpreting scraped data
smart_watches_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,1,Utterly Disappointed,battery issue...
1,5,The badass watch,"Everything is gud, but battery draining is qui..."
2,1,Useless product,bad
3,1,Worthless,Very bad and nasty product
4,5,Great product,nice watch it is so good all I have to buy thi...
...,...,...,...
1995,1,Useless product,very bad product not buy this product
1996,1,Not recommended at all,very bad ....unsatisfied
1997,5,Best in the market!,watch is very good.
1998,5,Simply awesome,Best smart watch


In [59]:
#copying scraped ratings & reviews to another dataframe object, so smart_watches_df_flipkart itself is left unchanged
df3 = smart_watches_df_flipkart.copy()
print(f"Original Shape: {df3.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df3.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df3.drop_duplicates(inplace=True)
print(f"New Shape: {df3.shape}")

Original Shape: (2000, 3)
Total Number of Duplicated Rows: 565
New Shape: (1435, 3)


In [14]:
#saving scraped to csv file
smart_watches_df_flipkart.to_csv('flipkart-ratings-reviews-smart-watches.csv')

## Scraping for ratings & reviews of `Camera` from Flipkart for 2000 records

In [15]:
#start driver
driver = start_driver()

#initialize search box with camera
init_flipkart('camera',driver)

#fetch all the related product urls
camera_urls = get_product_urls(driver)

FoundURL: [642]

In [16]:
#scrape the ratings & reviews from urls and store it as dataframe object
camera_df_flipkart = get_rating_reviews(camera_urls,driver,limit=2000)

Processing:   0%|          | 0/642 [00:00<?, ?it/s]

ScrapeLIMIT[2004/2000] :: URL: https://www.flipkart.com/sony-cybershot-dsc-w800-bc-in5/p/itme8g8yfydrnwwv?pid=CAMDVGUGHKXNHYXB&lid=LSTCAMDVGUGHKXNHYXBG3GONO&marketplace=FLIPKART&q=camera&store=jek%2Fp31&srno=s_1_9&otracker=search&otracker1=search&fm=SEARCH&iid=cd33bcc5-ebda-412c-9f89-a1477c4fcca9.CAMDVGUGHKXNHYXB.SEARCH&ppt=hp&ppn=homepage&ssid=uudlsc7r400000001624119270093&qH=dd6d2dcc679d12b9ppn=homepage&ssid=uudlsc7r400000001624119270093&qH=dd6d2dcc679d12b90001624119270093&qH=dd6d2dcc679d12b9

In [17]:
#interpreting scraped data
camera_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,3,Does the job,Satisfied with the product.. Its a really good...
1,5,Excellent,product really great . I am totally sctified t...
2,4,Worth the money,The camera simply awesome..and it is a best bu...
3,,,
4,,,
...,...,...,...
1995,5,Awesome,great cam.
1996,5,Must buy!,very nice product
1997,3,Nice,Much Better
1998,5,Wonderful,nice


In [60]:
#copying scraped ratings & reviews to another dataframe object, so camera_df_flipkart itself is left unchanged
df4 = camera_df_flipkart.copy()
print(f"Original Shape: {df4.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df4.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df4.drop_duplicates(inplace=True)
print(f"New Shape: {df4.shape}")

Original Shape: (2000, 3)
Total Number of Duplicated Rows: 709
New Shape: (1291, 3)


In [18]:
#saving scraped to csv file
camera_df_flipkart.to_csv('flipkart-ratings-reviews-cameras.csv')

## Scraping for ratings & reviews of `Monitors` from Flipkart for 2000 records

In [19]:
#start driver
driver = start_driver()

#initialize search box with monitors
init_flipkart('monitors',driver)

#fetch all the related product urls
monitors_urls = get_product_urls(driver)

FoundURL: [273]

In [20]:
#scrape the ratings & reviews from urls and store it as dataframe object
monitors_df_flipkart = get_rating_reviews(monitors_urls,driver,limit=2000)

Processing:   0%|          | 0/273 [00:00<?, ?it/s]

ScrapeLIMIT[2003/2000] :: URL: https://www.flipkart.com/lg-22-inch-full-hd-led-backlit-ips-panel-monitor-22mp68vq/p/itmety6nszghmbp4?pid=MONETY6MQUBYPSZC&lid=LSTMONETY6MQUBYPSZCQE1PSV&marketplace=FLIPKART&q=monitors&store=6bo%2Fg0i%2F9no&srno=s_1_10&otracker=search&otracker1=search&fm=SEARCH&iid=37beea04-5835-4525-b3a2-7a6b7ba255d2.MONETY6MQUBYPSZC.SEARCH&ppt=hp&ppn=homepage&ssid=v2h3zctq9s0000001624120260240&qH=4cbba110d06cbe92=4cbba110d06cbe92d06cbe92

In [21]:
#interpreting scraped data
monitors_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,4,Value-for-money,4.5/5 Wow.. what a nice product I ❤️ it. SCRE...
1,4,Good choice,Very Good Product. Good Packaging. The Dell qu...
2,5,Great product,Great product from Dell . It's budget friendly...
3,5,Must buy!,Good product by Dell and Flipkart. It has both...
4,4,Wonderful,Its a very good product... Good build quality....
...,...,...,...
1995,4,Really Nice,Good Product But on Big Screen Picture quality...
1996,4,Very Good,best paking box
1997,5,Highly recommended,great experience.. works fine
1998,5,Classy product,super display quality


In [61]:
#copying scraped ratings & reviews to another dataframe object, so monitors_df_flipkart itself is left unchanged
df5 = monitors_df_flipkart.copy()
print(f"Original Shape: {df5.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df5.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df5.drop_duplicates(inplace=True)
print(f"New Shape: {df5.shape}")

Original Shape: (2000, 3)
Total Number of Duplicated Rows: 400
New Shape: (1600, 3)


In [22]:
#saving scraped to csv file
monitors_df_flipkart.to_csv('flipkart-ratings-reviews-monitors.csv')

## Scraping for ratings & reviews of `Routers` from Flipkart for 2000 records

In [23]:
#start driver
driver = start_driver()

#initialize search box with routers
init_flipkart('routers',driver)

#fetch all the related product urls
routers_urls = get_product_urls(driver)

FoundURL: [393]

In [24]:
#scrape the ratings & reviews from urls and store it as dataframe object
routers_df_flipkart = get_rating_reviews(routers_urls,driver,limit=2000)

Processing:   0%|          | 0/393 [00:00<?, ?it/s]

ScrapeLIMIT[2004/2000] :: URL: https://www.flipkart.com/d-link-dir-615-wireless-n-300-router/p/itme3xwg9x9jgsyh?pid=RTRE3XW76EHCJUGH&lid=LSTRTRE3XW76EHCJUGHL3JOYL&marketplace=FLIPKART&q=routers&store=6bo%2F2a2&spotlightTagId=BestvalueId_6bo%2F2a2&srno=s_1_12&otracker=search&otracker1=search&fm=SEARCH&iid=14dda5da-bbf5-4ba2-9216-d7b62ff25ac6.RTRE3XW76EHCJUGH.SEARCH&ppt=hp&ppn=homepage&ssid=gxa7ihwuv40000001624121219189&qH=d7d0ec8a06768096ppn=homepage&ssid=gxa7ihwuv40000001624121219189&qH=d7d0ec8a06768096

In [25]:
#interpreting scraped data
routers_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Worth every penny,This is the best WiFi router in this price ran...
1,5,Wonderful,"Just fabulous performance, i am very glad to c..."
2,5,Got a good router after doing a lot of research!,If you want a router for medium sized Flat/hom...
3,,,
4,,,
...,...,...,...
1995,5,Brilliant,Supar
1996,1,Worst experience ever!,Poor
1997,4,Nice product,Working well
1998,1,Absolute rubbish!,Not working


In [62]:
#copying scraped ratings & reviews to another dataframe object, so routers_df_flipkart itself is left unchanged
df6 = routers_df_flipkart.copy()
print(f"Original Shape: {df6.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df6.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df6.drop_duplicates(inplace=True)
print(f"New Shape: {df6.shape}")

Original Shape: (2000, 3)
Total Number of Duplicated Rows: 291
New Shape: (1709, 3)


In [26]:
#saving scraped to csv file
routers_df_flipkart.to_csv('flipkart-ratings-reviews-routers.csv')

## Scraping for ratings & reviews of `Home Theaters` from Flipkart for 2000 records

In [27]:
#start driver
driver = start_driver()

#initialize search box with home theaters
init_flipkart('home theaters',driver)

#fetch all the related product urls
home_theaters_urls = get_product_urls(driver)

FoundURL: [1678]

In [28]:
#scrape the ratings & reviews from urls and store it as dataframe object
home_theaters_df_flipkart = get_rating_reviews(home_theaters_urls,driver,limit=2000)

Processing:   0%|          | 0/1678 [00:00<?, ?it/s]

ScrapeLIMIT[2010/2000] :: URL: https://www.flipkart.com/philips-spa8000b-94-120-w-bluetooth-home-theatre/p/itm86811e098cacd?pid=ACCEK4CAZHWWGJRS&lid=LSTACCEK4CAZHWWGJRSM8RXQC&marketplace=FLIPKART&q=home+theaters&store=0pm%2F0o7&srno=s_1_11&otracker=search&otracker1=search&fm=SEARCH&iid=c1e1015e-9211-426a-945a-9aeb12e77fe4.ACCEK4CAZHWWGJRS.SEARCH&ppt=hp&ppn=homepage&ssid=ruewt9yx740000001624122229141&qH=4c0b754589c1684a1684a

In [29]:
#interpreting scraped data
home_theaters_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Wonderful,Excellent product in budget .i m loving it.i p...
1,5,Terrific purchase,Nice product. Bluetooth connectivity are very ...
2,4,Delightful,Its good in this price range but if you are lo...
3,,,
4,,,
...,...,...,...
1995,4,Value-for-money,good product.....
1996,5,Worth every penny,worth for the money
1997,3,Decent product,spickar plag not working
1998,4,Very Good,Gud


In [63]:
#copying scraped ratings & reviews to another dataframe object, so home_theaters_df_flipkart itself is left unchanged
df7 = home_theaters_df_flipkart.copy()
print(f"Original Shape: {df7.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df7.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df7.drop_duplicates(inplace=True)
print(f"New Shape: {df7.shape}")

Original Shape: (2000, 3)
Total Number of Duplicated Rows: 321
New Shape: (1679, 3)


In [30]:
#saving scraped data to csv file
home_theaters_df_flipkart.to_csv('flipkart-ratings-reviews-home-theaters.csv')

## Scraping for ratings & reviews of `Headphones` from Flipkart for 2000 records

In [31]:
#start driver
driver = start_driver()

#initialize search box with headphones
init_flipkart('headphones',driver)

#fetch all the related product urls
headphones_urls = get_product_urls(driver)

FoundURL: [1680]

In [32]:
#scrape the ratings & reviews from urls and store it as dataframe object
headphones_df_flipkart = get_rating_reviews(headphones_urls,driver,limit=2000)

Processing:   0%|          | 0/1680 [00:00<?, ?it/s]

ScrapeLIMIT[2010/2000] :: URL: https://www.flipkart.com/boat-rockerz-235v2-asap-charging-version-5-0-bluetooth-headset/p/itmffacf2db2cc8f?pid=ACCFZGAQJGYCYDCM&lid=LSTACCFZGAQJGYCYDCMMCMPP5&marketplace=FLIPKART&q=headphones&store=0pm%2Ffcn&spotlightTagId=BestvalueId_0pm%2Ffcn&srno=s_1_4&otracker=search&otracker1=search&fm=SEARCH&iid=d6044aea-120f-4eb9-95eb-da0c64c4a2f9.ACCFZGAQJGYCYDCM.SEARCH&ppt=sp&ppn=sp&ssid=96usgdfa680000001624123460239&qH=edd443896ef5dbfc

In [33]:
#interpreting scraped data
headphones_df_flipkart

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Great product,I've been using this product for the last 3 da...
1,5,Highly recommended,Good battery 🔋 👌 Sound awesome 👌 Bass 🔊 super ...
2,5,Super!,Very nice product with amazing battery back up...
3,5,Awesome,Excellent quality.. Packing is awesome.. Neckb...
4,4,Really Nice,This review i was posted after 1 week using th...
...,...,...,...
1995,4,Pretty good,Ear buds not comfort to ears.last call memory ...
1996,5,Great product,"Nice design, build quality & nice colour. Good..."
1997,5,Terrific,Using this product since 10 days (as on 03.07....
1998,5,Just wow!,Super fast charge.sound was good bass also good


In [64]:
#copying scraped ratings & reviews to another dataframe object, so headphones_df_flipkart itself is left unchanged
df8 = headphones_df_flipkart.copy()
print(f"Original Shape: {df8.shape}")

#checking for duplicate rows
print(f"Total Number of Duplicated Rows: {df8.duplicated().sum()}")

#removing duplicate rows (in place, on the copy only)
df8.drop_duplicates(inplace=True)
print(f"New Shape: {df8.shape}")

Original Shape: (2000, 3)
Total Number of Duplicated Rows: 186
New Shape: (1814, 3)


In [34]:
#saving scraped data to csv file
headphones_df_flipkart.to_csv('flipkart-ratings-reviews-headphones.csv')

# Scraping Ratings & Reviews from `Amazon.in`

In [84]:
#function for initializing driver with https://amazon.in and searching a value
def init_amazon(search_key,driver):
    """Open amazon.in in `driver` and submit a search for `search_key`.

    search_key: text typed into the site's search box.
    driver: Selenium WebDriver used for the session.

    Both steps wait up to 5 seconds for their element to become clickable;
    nothing is caught here, so a failed wait propagates to the caller.
    """
    driver.get('https://amazon.in')
    wait = WebDriverWait(driver,5)

    #type the search key into the search box
    search_box = wait.until(ec.element_to_be_clickable((By.XPATH,'//input[@id="twotabsearchtextbox"]')))
    search_box.send_keys(search_key)

    #press the search button
    submit_button = wait.until(ec.element_to_be_clickable((By.XPATH,'//input[@id="nav-search-submit-button"]')))
    submit_button.click()
    

#end of function init_amazon


#function for fetching all the product urls for searched product
def amazon_product_urls(driver,max_pages=42):
    """Collect product URLs from the Amazon search results open in `driver`.

    Reads the matching links on the current result page, clicks "Next" and
    repeats until there is no clickable "Next" link or `max_pages` pages have
    been read.

    Parameters:
        driver: Selenium WebDriver already showing an Amazon search result page.
        max_pages: maximum number of result pages to read. The default (42) is
            the number of pages the former hard-coded check
            (`page_counter > 41`) actually allowed.

    Returns:
        list of href strings in the order they were found (duplicates are kept).
    """
    product_urls = []
    page_counter = 0
    while 1:
        #pause before looking for links; kept outside the try so Ctrl-C during
        #the pause stops the run instead of being swallowed
        time.sleep(3)

        #getting product url containers
        try:
            links = WebDriverWait(driver,10).until(ec.presence_of_all_elements_located((By.XPATH,'//div[@class="a-row a-size-small"]/span[2]/a[@class="a-link-normal"]')))
        except Exception:
            links = []
            print("No more links found")

        #scraping product url from containers
        for link in links:
            try:
                href = link.get_attribute('href')
            except Exception:
                #element is no longer usable; skip the rest of this page
                break

            #an anchor without href would later crash driver.get(url)
            if not href:
                continue

            product_urls.append(href)
            sys.stdout.write(f"\rFoundURL: [{len(product_urls)}]")
            sys.stdout.flush()

        #stop once the page budget is used up
        page_counter += 1
        if page_counter >= max_pages:
            break

        #goto next page if available
        try:
            WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//ul[@class="a-pagination"]/li[@class="a-last"]/a[contains(text(),"Next")]'))).click()
        except Exception:
            #next page is not available
            break

    #end of while 1
    return product_urls

#end of function amazon_product_urls


#function to scrape reviews and ratings from given product url
def _amazon_page_reviews(driver):
    """Read rating, title and description of the review cards on the current page.

    Returns a (ratings, titles, descriptions) tuple of equally long lists, or
    None when the elements could not be located or their counts differ.
    A field that cannot be read from its element is stored as None.
    """
    card_xpath = '//div[@class="a-section celwidget"]'
    try:
        rating_els = WebDriverWait(driver,5).until(ec.presence_of_all_elements_located((By.XPATH,card_xpath + '/div[2]/a[1]')))
        title_els = WebDriverWait(driver,5).until(ec.presence_of_all_elements_located((By.XPATH,card_xpath + '/div[2]/a[2]/span')))
        description_els = WebDriverWait(driver,5).until(ec.presence_of_all_elements_located((By.XPATH,card_xpath + '/div[4]')))
    except Exception:
        sys.stdout.write("\rScraping Containers: TimeOut!")
        return None

    #reported only for a real count mismatch, no longer after every timeout as well
    if not (len(rating_els) == len(title_els) == len(description_els)):
        sys.stdout.write("\r Error: Invalid Array Length!")
        return None

    def read(element,extract):
        #a single unreadable field becomes None so the three columns stay aligned
        try:
            return extract(element)
        except Exception:
            return None

    #rating is the first run of digits in the element's title attribute
    ratings = [read(element,lambda e: re.search(r'\d+',e.get_attribute('title'))[0]) for element in rating_els]
    titles = [read(element,lambda e: e.text) for element in title_els]
    descriptions = [read(element,lambda e: e.text.replace('\n',' ')) for element in description_els]
    return ratings, titles, descriptions


def amazon_rating_reviews(urls,driver,limit=False):
    """Scrape ratings, review titles and review descriptions from Amazon product pages.

    Parameters:
        urls: iterable of product page URLs.
        driver: Selenium WebDriver used for scraping. It is closed before this
            function returns, so callers need a new driver afterwards.
        limit: maximum number of records to return; a falsy value (the
            default) means no limit.

    Returns:
        pandas DataFrame with the columns "Ratings", "Review Titles" and
        "Review Descriptions", truncated to `limit` rows when a limit is given.
    """
    ratings = []
    review_titles = []
    review_descriptions = []

    def limit_reached():
        #True once enough records are collected (never True without a limit)
        return bool(limit) and len(ratings) >= limit

    #scraping ratings & reviews of products
    for url in tqdm.tqdm(urls,desc="Processing"):
        if limit_reached():
            break

        #initialize driver with url
        driver.get(url)
        try:
            #a product is skipped unless a parsable review count is shown;
            #the number itself is not needed afterwards
            num_reviews = WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//span[@id="acrCustomerReviewText"]')))
            int(re.sub(r'[^0-9]','',num_reviews.text))
        except Exception:
            sys.stdout.write("\rNumber of Reviews: TimeOut!")
            continue

        #clicking on See all review link
        try:
            WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//a[@class="a-link-emphasis a-text-bold"][contains(text(),"See all reviews")]'))).click()
            time.sleep(2)
        except Exception:
            sys.stdout.write("\rAll [XX] Reviews: TimeOut!")

        #scraping reviews for current url, one review page per iteration
        while not limit_reached():
            page = _amazon_page_reviews(driver)
            if page is not None:
                ratings.extend(page[0])
                review_titles.extend(page[1])
                review_descriptions.extend(page[2])

            if limit:
                sys.stdout.write("\rScrapeLIMIT[%d/%d] :: URL: %s" % (len(ratings),limit,url))
            else:
                sys.stdout.write("\rScrapedDATA[%d] :: URL: %s" % (len(ratings),url))

            #move to next review page
            try:
                WebDriverWait(driver,5).until(ec.element_to_be_clickable((By.XPATH,'//ul[@class="a-pagination"]/li[@class="a-last"]/a[contains(text(),"Next")]'))).click()
                time.sleep(3)
            except Exception:
                break

        #end of while
    #end of forLoop

    #creating dataframe of scraped data
    df = pd.DataFrame({
        "Ratings": ratings,
        "Review Titles": review_titles,
        "Review Descriptions": review_descriptions
    })

    #closing driver
    driver.close()

    #returning dataframe object
    if limit:
        return df[:limit]
    return df
#end of function amazon_rating_reviews()


## Scraping for ratings & reviews of `Laptops` from Amazon

In [47]:
#start driver
driver = start_driver()

#initialize search box with laptops
init_amazon('laptops',driver)

In [48]:
#getting all the product urls for searched key
amazon_laptops_urls = amazon_product_urls(driver)

FoundURL: [269]

In [49]:
#scraping ratings & reviews for searched product
laptop_df_amazon = amazon_rating_reviews(amazon_laptops_urls,driver)

Processing:   0%|          | 0/269 [00:00<?, ?it/s]

ScrapedDATA[145] :: URL: https://www.amazon.in/HP-15-6-inch-Laptop-Windows-15s-gy0003AU/dp/B08RFTYKFZ/ref=sr_1_3?dchild=1&keywords=laptops&qid=1624161462&sr=8-3#customerReviews-Graphite-81YM002TIN%2Fdp%2FB087D3VVW3%2Fref%3Dsr_1_2_sspa%3Fdchild%3D1%26keywords%3Dlaptops%26qid%3D1624161462%26sr%3D8-2-spons%26psc%3D1&qualifier=1624161462&id=676039676104181&widgetName=sp_atf#customerReviews Error: Invalid Array Length!
ScrapedDATA[408] :: URL: https://www.amazon.in/HP-15-6-inch-Laptop-Windows-15s-gy0003AU/dp/B08RFTYKFZ/ref=sr_1_3?dchild=1&keywords=laptops&qid=1624161462&sr=8-3#customerReviews Error: Invalid Array Length!
ScrapedDATA[408] :: URL: https://www.amazon.in/HP-Pentium-Processor-15-6-inch-15s-du1052tu/dp/B08HJZHTM1/ref=sr_1_4?dchild=1&keywords=laptops&qid=1624161462&sr=8-4#customerReviews Error: Invalid Array Length!
ScrapedDATA[448] :: URL: https://www.amazon.in/HP-Pentium-Processor-15-6-inch-15s-du1052tu/dp/B08HJZHTM1/ref=sr_1_4?dchild=1&keywords=laptops&qid=1624161462&sr=8-4#cus

Scraping Containers: TimeOut! Error: Invalid Array Length!ron-3501-i5-1135G7-Graphics/dp/B08QCKPLZX/ref=sr_1_306?dchild=1&keywords=laptops&qid=1624161546&sr=8-306#customerReviewsReviews083M1QK32&pd_rd_r=04e68d9c-5737-4cdc-aefa-2921c9985f68&pd_rd_w=xxNax&pd_rd_wg=UFhng&pf_rd_p=face5bc3-2f9a-4704-b92d-89c03325065d&pf_rd_r=DZ6FGEEBN1YZE95VR02J&qid=1624161542&sr=1-289-50cbf8e5-3c98-41b1-983b-e6cf8b4418b0#customerReviews
ScrapedDATA[7555] :: URL: https://www.amazon.in/Microsoft-Touchscreen-RadeonTM-Graphics-Platinum/dp/B0933L1RWD/ref=sxbs_sbv_search_btf?cv_ct_cx=laptops&dchild=1&keywords=laptops&pd_rd_i=B0933L1RWD&pd_rd_r=323f76ce-6d41-47f6-9828-b2b52b516857&pd_rd_w=UEBXR&pd_rd_wg=OpjEX&pf_rd_p=face5bc3-2f9a-4704-b92d-89c03325065d&pf_rd_r=X9TFTXFERZ408JRZ4D5E&qid=1624161546&sr=1-305-50cbf8e5-3c98-41b1-983b-e6cf8b4418b0#customerReviews

In [50]:
#interpreting scraped data
laptop_df_amazon

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Deal of the year!,"I am so glad, I found a great deal on this lap..."
1,4,Best at this price,Bought it this Friday. waited a lot for perfec...
2,5,Best in its Class,"Writing this review after 2 weeks of usage, I'..."
3,3,"Great Purchase, until Zen 3 becomes easily ava...",Make sure you get the Serial number in the Inv...
4,5,A perfect laptop for every task BUT......,Review after 1 week: I purchased this laptop (...
...,...,...,...
7550,4,SUPER,"I REALLY LIKED IT AWESOME,BOOTING TIME IS VERY..."
7551,5,Premium machine for widows lovers,Very impressed with the laptop. Looks extremel...
7552,1,Extremely bad customer service,Horrible customer service . U r on your own on...
7553,1,Price is very high,The price is too high .for 8gb we are getting ...


In [65]:
#working on a copy so the raw scraped laptop dataframe stays untouched
df9 = laptop_df_amazon.copy()
print(f"Original Shape: {df9.shape}")

#counting rows that exactly repeat an earlier row
print(f"Total Number of Duplicated Rows: {df9.duplicated().sum()}")

#keeping only the first occurrence of every row
df9 = df9.drop_duplicates()
print(f"New Shape: {df9.shape}")

Original Shape: (7555, 3)
Total Number of Duplicated Rows: 1059
New Shape: (6496, 3)


In [51]:
#saving scraped data to csv file
laptop_df_amazon.to_csv('amazon-ratings-reviews-laptops.csv')

## Scraping for ratings & reviews of `Phones` from Amazon

In [85]:
#start driver
driver = start_driver()

#initialize search box with phones
init_amazon('phones',driver)

In [86]:
#getting all the product urls for searched key
amazon_phones_urls = amazon_product_urls(driver)

FoundURL: [279]

In [87]:
#scraping ratings & reviews for searched product
phones_df_amazon = amazon_rating_reviews(amazon_phones_urls,driver)

Processing:   0%|          | 0/279 [00:00<?, ?it/s]

ScrapedDATA[40035] :: URL: https://www.amazon.in/Coolpad-Cool-Silver-Storage-Pop-up/dp/B08HRBGB9Z/ref=sr_1_306?dchild=1&keywords=phones&qid=1624172908&sr=8-306#customerReviewserReviewsewsiews1020%2Fdp%2FB0844516JL%2Fref%3Dsr_1_82_sspa%3Fdchild%3D1%26keywords%3Dphones%26qid%3D1624172856%26sr%3D8-82-spons%26psc%3D1&qualifier=1624172856&id=1032098169345152&widgetName=sp_atf_next#customerReviewswsiews

In [88]:
#interpreting scraped data
phones_df_amazon

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Value of Money 💖,Charger and speaker 🤷
1,5,No music player..lots of app,Good..but no music player!!! Lots of app.
2,5,Ram and finishing,Good
3,5,Superb product in prise,Tecno good phone camera processar also all of ...
4,5,Samsung Z Fold 2 is Masterpiece,One of the best Samsung phones so far But you ...
...,...,...,...
40030,5,Beautiful Indian product,Nice product
40031,5,Awesome phone,Great phone. There is no overheating like othe...
40032,5,Software problem,Mobile was software issues. Mobile not working...
40033,1,Waste project worset mobile,This phone is really bad ibwont recumbent anyo...


In [89]:
#working on a copy so the raw scraped phones dataframe stays untouched
df10 = phones_df_amazon.copy()
print(f"Original Shape: {df10.shape}")

#counting rows that exactly repeat an earlier row
print(f"Total Number of Duplicated Rows: {df10.duplicated().sum()}")

#keeping only the first occurrence of every row
df10 = df10.drop_duplicates()
print(f"New Shape: {df10.shape}")

Original Shape: (40035, 3)
Total Number of Duplicated Rows: 13176
New Shape: (26859, 3)


In [90]:
#saving scraped data to csv file
phones_df_amazon.to_csv('amazon-ratings-reviews-phones.csv')

## Scraping for ratings & reviews of `Headphones` from Amazon

In [97]:
#start driver
driver = start_driver()

#initialize search box with headphones
init_amazon('Headphones',driver)

In [98]:
#getting all the product urls for searched key
amazon_headphones_urls = amazon_product_urls(driver)

FoundURL: [369]

In [99]:
#scraping ratings & reviews for searched product
headphones_df_amazon = amazon_rating_reviews(amazon_headphones_urls,driver)

Processing:   0%|          | 0/369 [00:00<?, ?it/s]

ScrapedDATA[17834] :: URL: https://www.amazon.in/gp/slredirect/picassoRedirect.html/ref=pa_sp_btf_aps_sr_pg20_1?ie=UTF8&adId=A093795110ROWSIX0WPIQ&url=%2FSamsung-Hands-Free-Headset-Compatible-Smartphones%2Fdp%2FB07X9HN89S%2Fref%3Dsr_1_312_sspa%3Fdchild%3D1%26keywords%3DHeadphones%26qid%3D1624199883%26sr%3D8-312-spons%26psc%3D1&qualifier=1624199883&id=8977307858816452&widgetName=sp_btf#customerReviewssp_mtf#customerReviewsrReviewssiewst_we=asin&cv_ct_wn=osp-single-source-earns-comm&dchild=1&keywords=Headphones&linkCode=oas&pd_rd_i=B07TF2R9KW&pd_rd_r=53b27863-abb0-4ba5-9242-04c17f8145e9&pd_rd_w=xuQjs&pd_rd_wg=QF3nQ&pf_rd_p=04cb5ab0-b8fd-4cb3-8087-30e7d341d745&pf_rd_r=VM2R1RJMVEMQMFYF93PV&qid=1624199806&sr=1-4-483c64d8-df78-4008-ae20-e69f683e58b1&tag=technologytoday-21#customerReviews#customerReviewssmerReviews

In [100]:
#interpreting scraped data
headphones_df_amazon

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Bang for buck,They are really a beast . Vocals are crisp eve...
1,5,Best product as comparison to other companies ...,According to price product is more good by my ...
2,5,Probass thunder,Super Bass... Sounds quality is good .. Rich f...
3,3,Average Product - Bad Handling of package & fa...,"The sound output is good, it has good bass, th..."
4,1,BADDEST HEADPHONE MIC,"A Truly Pathetic Headphone With Baddest Mic, W..."
...,...,...,...
17829,4,Samsung EHS61 Good Sound Quality. Good Price a...,Samsung EHS61 is a very good ear phone and has...
17830,5,Bad product,It's not real Samsung it got defected just wit...
17831,5,Great product,Best quality at this price and Great sound.. C...
17832,5,100% Original You can buy Without any Doubt,This is Made in china priduct....Not 100% orig...


In [101]:
#working on a copy so the raw scraped headphones dataframe stays untouched
df11 = headphones_df_amazon.copy()
print(f"Original Shape: {df11.shape}")

#counting rows that exactly repeat an earlier row
print(f"Total Number of Duplicated Rows: {df11.duplicated().sum()}")

#keeping only the first occurrence of every row
df11 = df11.drop_duplicates()
print(f"New Shape: {df11.shape}")

Original Shape: (17834, 3)
Total Number of Duplicated Rows: 5278
New Shape: (12556, 3)


In [102]:
#saving scraped data to csv file
headphones_df_amazon.to_csv('amazon-ratings-reviews-headphones.csv')

## Scraping for ratings & reviews of `Printers` from Amazon

In [105]:
#start driver
driver = start_driver()

#initialize search box with printers
init_amazon('printers',driver)

In [106]:
#getting all the product urls for searched key
amazon_printers_urls = amazon_product_urls(driver)

FoundURL: [225]

In [107]:
#scraping ratings & reviews for searched product
printers_df_amazon = amazon_rating_reviews(amazon_printers_urls,driver)

Processing:   0%|          | 0/225 [00:00<?, ?it/s]

ScrapedDATA[47770] :: URL: https://www.amazon.in/Brother-HL-L2366DW-Monochrome-Printer-Printing/dp/B01NBPXE64/ref=sr_1_305?dchild=1&keywords=printers&qid=1624211901&sr=8-305#customerReviewssiewsmerReviewsFB0892XK8HJ%2Fref%3Dsr_1_137_sspa%3Fdchild%3D1%26keywords%3Dprinters%26qid%3D1624211862%26sr%3D8-137-spons%26psc%3D1&qualifier=1624211862&id=6949714785532628&widgetName=sp_mtf#customerReviewsiewsmerReviewswsiewseviewstomerReviewswsasin&cv_ct_wn=osp-single-source-earns-comm&dchild=1&keywords=printers&linkCode=oas&pd_rd_i=B07B4KDTHP&pd_rd_r=0652a59c-c799-4939-a1ff-09aa3bb0c515&pd_rd_w=28shU&pd_rd_wg=KbGCQ&pf_rd_p=04cb5ab0-b8fd-4cb3-8087-30e7d341d745&pf_rd_r=3HZ1X5G46YATVA8E8VTA&qid=1624211829&sr=1-3-483c64d8-df78-4008-ae20-e69f683e58b1&tag=digitin-21#customerReviewseviews

In [108]:
#interpreting scraped data
printers_df_amazon

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,2,So sorrowful,.bad.machine is not working.toner is stopped w...
1,3,Printer,Good for xerox and prining. But need instructi...
2,4,Power cable and data cable are missing,Power cable and data cable are missing
3,5,Five Stars,good machine.. its worth to buy ..
4,4,Four Stars,Good product Cost effective ..
...,...,...,...
47765,5,It is good buy,Overall a good product so far
47766,5,Nice,Good
47767,1,vaste of money,purchase of brother printer is waste of money ...
47768,4,Very good printer,"Perfect with all features in it, like Automati..."


In [109]:
#working on a copy so the raw scraped printers dataframe stays untouched
df12 = printers_df_amazon.copy()
print(f"Original Shape: {df12.shape}")

#counting rows that exactly repeat an earlier row
print(f"Total Number of Duplicated Rows: {df12.duplicated().sum()}")

#keeping only the first occurrence of every row
df12 = df12.drop_duplicates()
print(f"New Shape: {df12.shape}")

Original Shape: (47770, 3)
Total Number of Duplicated Rows: 8705
New Shape: (39065, 3)


In [110]:
#saving scraped printer data to csv file
#(the printers dataframe must be written here; writing headphones_df_amazon
# would fill the printers file with headphone reviews)
printers_df_amazon.to_csv('amazon-ratings-reviews-printers.csv')

## Scraping for ratings & reviews of `Smart Watches` from Amazon

In [132]:
#start driver
driver = start_driver()

#initialize search box with smart watches
init_amazon('smart watches',driver)

In [133]:
#getting all the product urls for searched key
amazon_smart_watches_urls = amazon_product_urls(driver)

FoundURL: [348]

In [134]:
#scraping ratings & reviews for searched product
smart_watches_df_amazon = amazon_rating_reviews(amazon_smart_watches_urls,driver,limit=4000)

Processing:   0%|          | 0/348 [00:00<?, ?it/s]

ScrapeLIMIT[4004/4000] :: URL: https://www.amazon.in/Noise-Colorfit-Pro-Touch-Control/dp/B07YY1BY5B/ref=sr_1_3?dchild=1&keywords=smart+watches&qid=1624239706&sr=8-3#customerReviews-HD-Color-Display-Sports-Modes-Breathing%2Fdp%2FB08HHCDN97%2Fref%3Dsr_1_2_sspa%3Fdchild%3D1%26keywords%3Dsmart%2Bwatches%26qid%3D1624239706%26sr%3D8-2-spons%26psc%3D1&qualifier=1624239706&id=199877266482789&widgetName=sp_atf#customerReviews

In [135]:
#interpreting scraped data
smart_watches_df_amazon

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,4,"Not a disappointment, but can be better",Original 3rd Jan 2021: This is my first smartw...
1,2,Initially impressive,Got the watch today... Display is just amazing...
2,3,"Glitchy watch, hangs at any time","It freezes on random screens, and the whatsapp..."
3,1,Very poor quality and below to expectations. D...,Your browser does not support HTML5 video. It...
4,4,Lite version of GTS 2.,I've been using amazfit watches for the last 2...
...,...,...,...
3995,5,Sync issues,Bought this watch considering noise come up wi...
3996,3,Awesome watch but display gets scratches easily,I liked the battery backup and the design styl...
3997,2,For Accuracy 0,I can give better heart rate by checking my pu...
3998,1,RIP privacy.,"don't allow full access to it, specially sms a..."


In [136]:
#working on a copy so the raw scraped smart-watches dataframe stays untouched
df13 = smart_watches_df_amazon.copy()
print(f"Original Shape: {df13.shape}")

#counting rows that exactly repeat an earlier row
print(f"Total Number of Duplicated Rows: {df13.duplicated().sum()}")

#keeping only the first occurrence of every row
df13 = df13.drop_duplicates()
print(f"New Shape: {df13.shape}")

Original Shape: (4000, 3)
Total Number of Duplicated Rows: 55
New Shape: (3945, 3)


In [137]:
#saving scraped data to csv file
smart_watches_df_amazon.to_csv('amazon-ratings-reviews-smart_watches.csv')

## Scraping for ratings & reviews of `Monitors` from Amazon

In [138]:
#start driver
driver = start_driver()

#initialize search box with monitors
init_amazon('monitors',driver)

In [139]:
#getting all the product urls for searched key
amazon_monitors_urls = amazon_product_urls(driver)

FoundURL: [341]

In [140]:
#scraping ratings & reviews for searched product
monitors_df_amazon = amazon_rating_reviews(amazon_monitors_urls,driver,limit=4000)

Processing:   0%|          | 0/341 [00:00<?, ?it/s]

ScrapeLIMIT[4008/4000] :: URL: https://www.amazon.in/BenQ-Borderless-Brightness-Intelligence-Technology/dp/B07LDH32ZP/ref=sr_1_3?dchild=1&keywords=monitors&qid=1624241496&sr=8-3#customerReviews7AM500NWXXL%2Fdp%2FB08XB1F1RD%2Fref%3Dsr_1_2_sspa%3Fdchild%3D1%26keywords%3Dmonitors%26qid%3D1624241496%26sr%3D8-2-spons%26psc%3D1&qualifier=1624241496&id=3158142503010014&widgetName=sp_atf#customerReviews

In [141]:
#interpreting scraped data
monitors_df_amazon

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,There is Backlight Glow (Not Leaking / Normal ...,I bought this product because I edit videos fo...
1,5,"Best 27"" IPS Monitor @ Rs 15,499","Bought this monitor on 5th Jan 2018 at Rs. 15,..."
2,5,Excellent value for money product.,"I have worked on many monitors (Dell, HP, Acer..."
3,4,Value for money,I bought it mainly for its auto brightness. Bu...
4,5,Brilliant Eyecare protection technology | Brig...,I was skeptical about this BenQ G480 eye care ...
...,...,...,...
3995,5,Five Stars,Beautiful Bezel less display with eye care pro...
3996,5,Excellent product,highly recommended. Budget friendly and does t...
3997,3,Poor Monitor for any purpose,You can not compare it with HP monitor 27 es ....
3998,5,Best full hd monitor,Very good quality. Build quality is good.


In [142]:
#working on a copy so the raw scraped monitors dataframe stays untouched
df14 = monitors_df_amazon.copy()
print(f"Original Shape: {df14.shape}")

#counting rows that exactly repeat an earlier row
print(f"Total Number of Duplicated Rows: {df14.duplicated().sum()}")

#keeping only the first occurrence of every row
df14 = df14.drop_duplicates()
print(f"New Shape: {df14.shape}")

Original Shape: (4000, 3)
Total Number of Duplicated Rows: 1287
New Shape: (2713, 3)


In [143]:
#saving scraped data to csv file
monitors_df_amazon.to_csv('amazon-ratings-reviews-monitors.csv')

# Combining all the dataframe objects into a single object

In [144]:
#stacking every de-duplicated per-category dataframe (df1 ... df14) into one
#dataframe; ignore_index=True renumbers the rows of the result from 0
df_combined = pd.concat(
    [
        df1, df2, df3, df4, df5, df6, df7,
        df8, df9, df10, df11, df12, df13, df14,
    ],
    ignore_index=True,
    sort=False,
)

print(f"Shape of Combined DataFrame: {df_combined.shape}")

#interpreting combined dataframe
df_combined

Shape of Combined DataFrame: (116318, 3)


Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,5,Worth every penny,An affordable beast ! Pros: 1. Incredible perf...
1,5,Awesome,Best laptop in this price segment.. battery is...
2,4,Really Nice,To be honest Pro's 1) RGB keyboard 2)144Hzs wi...
3,4,Value-for-money,The Laptop is a masterpiece with stunnig desig...
4,5,Highly recommended,Good laptop but customer care folks are real d...
...,...,...,...
116313,2,Smart product,A very smart product.But its a VA panel not IP...
116314,2,No webcam?,So this thing costs so much and doesn't have a...
116315,2,Airplay to Samsung M5 (27') from 16' Macbook p...,I bought M5 27' smart monitor. The endpoint of...
116316,5,Working,Try once and get to know then tell of it has G...


# Taking data from the combined dataframe in equal portions according to ratings

In [145]:
#separating records by rating and finding the rating with the fewest records.
#Ratings are compared as strings ('1'..'5').
rating_labels = ['1','2','3','4','5']

#one sub-dataframe per rating, filtered once and reused for the counts below
df_ratings_record = {
    label: df_combined[df_combined['Ratings']==label]
    for label in rating_labels
}
length_of_records = {
    label: len(subset) for label, subset in df_ratings_record.items()
}

min_ratings_record = min(length_of_records, key=length_of_records.get)
print(f"Minimum number of records among ratings are for: {min_ratings_record} :: {length_of_records[min_ratings_record]}")

#size of the smallest rating group; every rating is cut down to this size
lor = length_of_records[min_ratings_record]

#NOTE(review): [:lor] keeps the FIRST lor rows of each rating in df_combined
#order, so for the larger groups rows that appear later are never selected -
#confirm this is intended rather than a random sample.
df_final = pd.concat(
    [df_ratings_record[label][:lor] for label in rating_labels],
    ignore_index=True,
    sort=False,
)


#interpreting final dataframe
df_final

Minimum number of records among ratings are for: 2 :: 6317


Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,1,Did not meet expectations,Windows updates not happening and graphics car...
1,1,Worthless,"Facing battery issue only 10days old laptop, t..."
2,1,Unsatisfactory,"Slow,never ever see such kind of product, bett..."
3,1,Horrible,Working very slow..takes 15-20 minutes to star...
4,1,Not recommended at all,Brost leptop don't by this leptop i claim this...
...,...,...,...
31580,5,Must buy!,good build quality and do its purpose very well.
31581,5,Brilliant,"product is good, but keyboard is missing and s..."
31582,5,Super!,Super perfomance
31583,5,Fabulous!,Excellent item.


In [146]:
#Randomizing the dataframe
#sample(frac=1) returns every row exactly once in random order, and
#reset_index(drop=True) renumbers the shuffled rows 0..n-1. This replaces the
#manual "shuffle a copy of the index, set it, sort by it" approach.
#Pass random_state=<int> to sample() if a reproducible order is needed.
df_final = df_final.sample(frac=1).reset_index(drop=True)
df_final

Unnamed: 0,Ratings,Review Titles,Review Descriptions
0,3,Nice,"By the way, everything is good in a laptop, bu..."
1,5,Wonderful,Best laptop for students
2,1,Did not meet expectations,"Totally worthless, not even real d-link router..."
3,3,Good,As expected ...value for money...little slow b...
4,5,Simply awesome,Great great great product in this price
...,...,...,...
31580,2,HP415_Gr3,Bit disappointed with print quality. The defau...
31581,3,NOTHING,NOTHING
31582,4,Worth the money,This modem is actually pretty good.
31583,5,Worth every penny,Writing the review after 2 days of use... disp...


# Saving the final collected data to a CSV file for further processing & model building

In [147]:
df_final.to_csv('final-data-ratings-reviews.csv')

__EOF__