# Importing libraries for Web Scraping

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import csv
import json

#### modifying the PATH environment variable
* **By appending the path to the ChromeDriver executable to the PATH environment variable, it makes it accessible to any command or program that runs in that environment.**

In [2]:
os.environ['PATH'] += r"C:\Program Files (x86)\chromedriver-win64\chromedriver.exe"

## Web Scraping Code (Amazon)

#### scrape function 'scrape_product_link'

In [3]:
def scrape_product_link(link,csv_writer):
    driver = webdriver.Chrome()
    driver.get(link)
    driver.implicitly_wait(5)
    
    page = 20
    
    while(page != 0):
        item_number=1
        # Find all product links
        print("product links")
        product_links = driver.find_elements(By.CSS_SELECTOR, 'a[class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]')
        
        # Iterate over each product link
        for product_link in product_links:
            try:
                # Scroll element into view
                print('finding product link')
                driver.execute_script("arguments[0].scrollIntoView();", product_link)
                print('waiting it to be clickable')
                # Wait for the element to be clickable
                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a[class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]')))
                print('wait over')
                # Click on the element
                print('fetching link')
                link = product_link.get_attribute('href')
                print('click on product link')
                product_link.click()
                
                # Switch to the newly opened tab
                driver.switch_to.window(driver.window_handles[-1])
                time.sleep(4)  # Adjust as necessary
                print('**************************************')
                try:
                    title = driver.find_element(By.CSS_SELECTOR, 'span[id="productTitle"]').text.strip() #done
                    print('title successful')
                except:
                    print('title failed')
                    continue            
                try:
                    price = driver.find_element(By.CSS_SELECTOR, 'div[id="corePriceDisplay_desktop_feature_div"]').find_element(By.CSS_SELECTOR,'span[class="a-price-whole"]').text.strip() #done
                    print('price successful')
                except:
                    print('price failed')
                    continue  
                try:
                    img_url = driver.find_element(By.CSS_SELECTOR, 'div[id="main-image-container"] ul[class="a-unordered-list a-nostyle a-horizontal list maintain-height"]').find_element(By.CSS_SELECTOR,'div[id="imgTagWrapperId"] img').get_attribute('src') #done
                    print('img_url successful')
                except:
                    print('img_url failed')
                    continue 
                try: 
                    description = []
                    description_rows = driver.find_element(By.CSS_SELECTOR,'div[id="feature-bullets"]').find_elements(By.CSS_SELECTOR, 'ul[class="a-unordered-list a-vertical a-spacing-mini"] li')
                    print('description rows fetched successfully')
                    for row in description_rows:
                        try:
                            desc = row.find_element(By.CSS_SELECTOR, 'span[class="a-list-item"]').text.strip()
                            print('description row successful')
                            description.append(desc)
                        except:
                            print("description row failed")
                            continue
                except:
                    print('description failed')
                    continue 
        
                # Collecting review
                feedback=[]
                try:
                    # first operating on the focal reviews
                    try:
                        reviews = driver.find_elements(By.CSS_SELECTOR,'span[class="cr-widget-FocalReviews"] div[id="cm-cr-dp-review-list"] div[data-hook="review"]')
                        for review in reviews:
                            content = review.find_element(By.CSS_SELECTOR,'div[data-hook="review-collapsed"] span').text.strip()
                            feedback.append(content)
                            print('focal review fetched successfully')
                    except:
                        print('could not fetch the focal reviews')
                        continue

                    # fetching the esktop global reviews
                    try:
                        reviews = driver.find_elements(By.CSS_SELECTOR,'span[class="cr-widget-DesktopGlobalReviews"] div[id="cm-cr-global-review-list"] div[data-hook="review"]')
                        for review in reviews:
                            content = review.find_element(By.CSS_SELECTOR,'div[data-hook="review-collapsed"] span').text.strip()
                            feedback.append(content)
                            print('Desktop global review fetched successfully')
                    except:
                        print('could not fetch the global reviews')
                except:
                    print('unable to fetch reviews')
                    continue
                    
                
                print('performing csv write ')
                # Write to CSV
                csv_writer.writerow([title, price, link, img_url, "; ".join(description), "; ".join(feedback)])          
                print('write success ful')
                print('------------------')
                print(f'item_number: {item_number}')
                print(f'page number: { page}')
                item_number += 1
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
            except Exception as e:
                print(f"Exception occurred ---: {str(e)}")
                driver.switch_to.window(driver.window_handles[0])
                continue  # Skip to the next product link if there's an issue
    
        # Example: Click the "Next" button for pagination (repeat as necessary)                                                     class="s-pagination-item s-pagination-next s-pagination-button s-pagination-separator"
        # Replace with actual pagination handling logic
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, 'span[class="s-pagination-strip"]').find_element(By.CSS_SELECTOR, 'a[class="s-pagination-item s-pagination-next s-pagination-button s-pagination-separator"]')
            next_button.click()
            page-=1
            time.sleep(3)
        except:
            print('pagination didnt occur')
    driver.quit()

In [4]:

csv_file_path = 'product_CameraLenses_page1.csv'

# CSV header
header = ['Title', 'Price', 'Link', 'Image URL', 'Description', 'Reviews']

# Open CSV file
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # Example product page link
    product_page_link = r"https://www.amazon.in/s?k=camera+lenses&crid=OOBID4Q0FLU9&sprefix=camera+lenses%2Caps%2C249&ref=nb_sb_noss_1"
    
    # Scrape product data and write to CSV
    scrape_product_link(product_page_link, writer)



product links
finding product link
waiting it to be clickable
wait over
fetching link
click on product link
**************************************
title successful
price successful
img_url successful
description rows fetched successfully
description row successful
description row successful
description row successful
description row successful
description row successful
Desktop global review fetched successfully
Desktop global review fetched successfully
Desktop global review fetched successfully
performing csv write 
write success ful
------------------
item_number: 1
page number: 20
finding product link
waiting it to be clickable
wait over
fetching link
click on product link
**************************************
title successful
price successful
img_url successful
description rows fetched successfully
description row successful
focal review fetched successfully
focal review fetched successfully
focal review fetched successfully
focal review fetched successfully
focal review fetched 

IndexError: list index out of range