Option I: E-Commerce Web Scraping
Objective: Extract product information from an e-commerce website.
Task:
Website Selection: Choose a popular e-commerce website that lists various products (e.g.,
Amazon, Flipkart). The website should have multiple categories of products.
Data Extraction:
● Select at least two product categories (e.g., Electronics, Clothing).
● For each category, scrape the following details for at least 20 products:
● Product Name
● Price
● Product Description
● User Reviews and ratings
● Product Image URL

Data Formatting:
● Organize the scraped data into a structured format (e.g., CSV, JSON) so that it can
be used to fine-tune a base LLM such as GPT-4.
● Ensure the data is clean and consistent.
Documentation:
● Provide a brief explanation of your scraping approach and of why the data format is
suitable for fine-tuning a model.
● Include any challenges faced and how they were resolved.
● List the tools and libraries used.
Ethical Considerations:
● Discuss how you ensured that your scraping activities adhered to the website’s terms
of service and legal considerations.
Submission: Submit the code along with the extracted data file and documentation.

In [60]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


Scraping TV listings from the Amazon website using Beautiful Soup

In [61]:
# Accumulators for the TV scrape: one independent list per scraped field.
product_name, product_review, product_price, product_image = [], [], [], []

In [62]:
# Scrape nine Amazon search-result requests for "tv" and append the extracted
# fields to the module-level lists product_name, product_review, product_price
# and product_image.
#
# NOTE(review): the loop counter is only appended to the `ref=` tag of the URL.
# Confirm that this really selects a different result page on each request;
# if not, every iteration fetches the same listing.
#
# NOTE(review): names, ratings, prices and images are collected as four
# independent lists, so a product that lacks one field shifts the remaining
# entries of that list relative to the others.

# The original per-field checks were `count <= 20` starting from 0, i.e. up to
# 21 entries per page; the same cap is kept so the row counts do not change.
MAX_ITEMS_PER_PAGE = 21

# Loop-invariant, so built once instead of on every iteration.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

for page in range(1, 10):
    # Construct URL for each page
    url = 'https://www.amazon.in/s?k=tv&crid=L5MRJCSS8S65&sprefix=tv%2Caps%2C231&ref=nb_sb_noss_' + str(page)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the container element for product information
    box = soup.find('span', attrs={'class': 'rush-component s-latency-cf-section'})
    if box is None:
        # Skip this page entirely: there is nothing to extract, and falling
        # through would either fail on undefined names (first page) or
        # re-append the previous page's results.
        print("Box element not found.")
        continue

    name = box.find_all('span', class_='a-size-medium a-color-base a-text-normal')
    review = box.find_all('span', class_='a-icon-alt')
    price = box.find_all('span', class_='a-price-whole')
    div_elements = box.find_all('div', class_='a-section aok-relative s-image-fixed-height')

    product_name.extend(tag.text for tag in name[:MAX_ITEMS_PER_PAGE])
    product_review.extend(tag.text for tag in review[:MAX_ITEMS_PER_PAGE])
    product_price.extend(tag.text for tag in price[:MAX_ITEMS_PER_PAGE])

    # Every <img class="s-image"> inside the image containers, in document
    # order. A missing src becomes ''; anything from the first '1x' onwards
    # is dropped from the value.
    page_images = [
        img_element.get('src', '').split('1x')[0]
        for div_element in div_elements
        for img_element in div_element.find_all('img', class_='s-image')
    ]
    product_image.extend(page_images[:MAX_ITEMS_PER_PAGE])


In [68]:
# Short name for each product: the first two whitespace-separated words of
# the scraped title.
name1 = [' '.join(title.split()[:2]) for title in product_name]
print(len(name1))

189


In [69]:
# Column name -> list of scraped values for the TV table.
# Keys are inserted in the order the columns should appear.
data = {}
data['Name'] = name1
data['Price'] = product_price
data['review'] = product_review
data['description'] = product_name
data['image link'] = product_image

In [70]:
# Build the TV table from the column dictionary; every list in `data` must
# have the same length for the constructor to succeed.
product_df_1 = pd.DataFrame(data)
print(product_df_1)


               Name   Price              review  \
0        Xiaomi 108  11,990  4.2 out of 5 stars   
1            LG 108  22,999  4.2 out of 5 stars   
2          Redmi 80  26,999  4.1 out of 5 stars   
3             MI 80  25,999  4.2 out of 5 stars   
4             VW 80  29,990  3.9 out of 5 stars   
..              ...     ...                 ...   
184          LG 139  13,489  4.2 out of 5 stars   
185           LG 80   7,499  4.3 out of 5 stars   
186  iFFALCON 138.7  12,999  4.1 out of 5 stars   
187     Sony Bravia  42,990  4.7 out of 5 stars   
188          LG 108  14,490  4.2 out of 5 stars   

                                           description  \
0    Xiaomi 108 cm (43 inches) X Series 4K Ultra HD...   
1    LG 108 cm (43 inches) 4K Ultra HD Smart LED TV...   
2    Redmi 80 cm (32 inches) F Series HD Ready Smar...   
3    MI 80 cm (32 inches) A Series HD Ready Smart G...   
4    VW 80 cm (32 inches) Frameless Series HD Ready...   
..                                     

In [71]:
def create_description(description):
    """Return a short display name built from a product title.

    The result is the first whitespace-separated word of ``description``
    followed by ``' tv'`` (for example ``'Xiaomi 108 cm ...'`` gives
    ``'Xiaomi tv'``).

    Args:
        description: The scraped product title as a string.

    Returns:
        ``'<first word> tv'``, or just ``'tv'`` when ``description`` is empty
        or contains only whitespace.
    """
    words = description.split()
    if not words:
        # A blank title has no first word; indexing [0] would raise
        # IndexError and abort the whole column transformation.
        return 'tv'
    return words[0] + ' tv'

# Overwrite the 'Name' column with the short name derived from each row's
# 'description' value.
short_names = product_df_1['description'].apply(create_description)
product_df_1['Name'] = short_names
print(product_df_1)

            Name   Price              review  \
0      Xiaomi tv  11,990  4.2 out of 5 stars   
1          LG tv  22,999  4.2 out of 5 stars   
2       Redmi tv  26,999  4.1 out of 5 stars   
3          MI tv  25,999  4.2 out of 5 stars   
4          VW tv  29,990  3.9 out of 5 stars   
..           ...     ...                 ...   
184        LG tv  13,489  4.2 out of 5 stars   
185        LG tv   7,499  4.3 out of 5 stars   
186  iFFALCON tv  12,999  4.1 out of 5 stars   
187      Sony tv  42,990  4.7 out of 5 stars   
188        LG tv  14,490  4.2 out of 5 stars   

                                           description  \
0    Xiaomi 108 cm (43 inches) X Series 4K Ultra HD...   
1    LG 108 cm (43 inches) 4K Ultra HD Smart LED TV...   
2    Redmi 80 cm (32 inches) F Series HD Ready Smar...   
3    MI 80 cm (32 inches) A Series HD Ready Smart G...   
4    VW 80 cm (32 inches) Frameless Series HD Ready...   
..                                                 ...   
184  LG 139 cm (5

In [72]:
# Write the TV table to disk as CSV, leaving out the row index.
product_df_1.to_csv(path_or_buf='product_data_Tvs_1.csv', index=False)

In [73]:
# Bare expression: as the last statement of a notebook cell it renders the table.
product_df_1

Unnamed: 0,Name,Price,review,description,image link
0,Xiaomi tv,11990,4.2 out of 5 stars,Xiaomi 108 cm (43 inches) X Series 4K Ultra HD...,https://m.media-amazon.com/images/I/71L+JnVXFT...
1,LG tv,22999,4.2 out of 5 stars,LG 108 cm (43 inches) 4K Ultra HD Smart LED TV...,https://m.media-amazon.com/images/I/81P0neh8MS...
2,Redmi tv,26999,4.1 out of 5 stars,Redmi 80 cm (32 inches) F Series HD Ready Smar...,https://m.media-amazon.com/images/I/819Lw2PE8t...
3,MI tv,25999,4.2 out of 5 stars,MI 80 cm (32 inches) A Series HD Ready Smart G...,https://m.media-amazon.com/images/I/713A5VksK6...
4,VW tv,29990,3.9 out of 5 stars,VW 80 cm (32 inches) Frameless Series HD Ready...,https://m.media-amazon.com/images/I/717oSOB4hC...
...,...,...,...,...,...
184,LG tv,13489,4.2 out of 5 stars,LG 139 cm (55 inches) 4K Ultra HD Smart LED TV...,https://m.media-amazon.com/images/I/71kuqRw8L5...
185,LG tv,7499,4.3 out of 5 stars,LG 80 cm (32 inches) HD Ready Smart LED TV 32L...,https://m.media-amazon.com/images/I/71w49hsBwv...
186,iFFALCON tv,12999,4.1 out of 5 stars,iFFALCON 138.7 cm (55 inches) 4K Ultra HD Smar...,https://m.media-amazon.com/images/I/81MRU+3RJL...
187,Sony tv,42990,4.7 out of 5 stars,Sony Bravia 164 cm (65 inches) 4K Ultra HD Sma...,https://m.media-amazon.com/images/I/81P0neh8MS...


Scraping shirt listings from the Amazon website using Beautiful Soup

In [74]:
# Scrape nine Amazon search-result requests for "shirt for men" and collect
# the extracted fields in product_name, product_review, product_price1 and
# product_image. The lists are re-created here so results from an earlier
# scrape are not mixed in.
#
# NOTE(review): the loop counter is only appended to the `ref=` tag of the URL.
# Confirm that this really selects a different result page on each request.
#
# NOTE(review): names, ratings, prices and images are collected as independent
# lists, so a product that lacks one field shifts that list relative to the
# others.
product_name = []
product_review = []
product_price1 = []
product_image = []

# The original per-field checks were `count <= 20` starting from 0, i.e. up to
# 21 entries per page; the same cap is kept so the row counts do not change.
MAX_ITEMS_PER_PAGE = 21

# Loop-invariant, so built once instead of on every iteration.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

for page in range(1, 10):
    url = 'https://www.amazon.in/s?k=shirt+for+men&crid=1QR0K9KBRJQK9&qid=1710239754&sprefix=shirt%2Caps%2C235&ref=sr_pg_' + str(page)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    box = soup.find('span', attrs={'class': 'rush-component s-latency-cf-section'})
    if box is None:
        # Skip this page: falling through would either fail on undefined
        # names (first page) or re-append the previous page's results.
        print("Box element not found.")
        continue

    name = box.find_all('span', class_='a-size-base-plus a-color-base a-text-normal')
    review = box.find_all('span', class_='a-icon-alt')
    price = box.find_all('span', class_='a-price-whole')
    div_elements = box.find_all('div', class_='a-section aok-relative s-image-tall-aspect')

    product_name.extend(tag.text for tag in name[:MAX_ITEMS_PER_PAGE])
    product_review.extend(tag.text for tag in review[:MAX_ITEMS_PER_PAGE])

    # Prices of this page only; also used below to bound the image count.
    product_price = [tag.text for tag in price[:MAX_ITEMS_PER_PAGE]]
    product_price1.extend(product_price)

    # Take at most as many image containers as prices found on this page.
    for div_element in div_elements[:len(product_price)]:
        img_tag = div_element.find('img')
        if img_tag is not None:
            # .get avoids a KeyError when the <img> has no src attribute;
            # '' matches the placeholder the TV scrape uses in that case.
            product_image.append(img_tag.get('src', ''))


In [75]:
# Column name -> list of scraped values for the shirt table, paired in the
# order the columns should appear.
data1 = dict(
    zip(
        ('Name', 'Price', 'review', 'image link'),
        (product_name, product_price1, product_review, product_image),
    )
)


In [76]:
# Build the shirt table; every list in `data1` must have the same length.
product_df = pd.DataFrame.from_dict(data1)
product_df


# Write the shirt table to disk as CSV, leaving out the row index.
product_df.to_csv(path_or_buf='product_data_shirts.csv', index=False)

In [77]:
# Also export the shirt table as an Excel workbook, leaving out the row index.
product_df.to_excel(excel_writer='product_data2.xlsx', index=False)