# Web Scraping with BeautifulSoup
This code snippet demonstrates web scraping using the `requests` and `BeautifulSoup` libraries in Python. The goal is to extract data from a website's HTML content.

In [3]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import numpy as np

# Scraping Data From Amazon

In [6]:
def scrap_data_from_amazon(url: str):
    # Set the header for the request to mimic a web browser
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
        'Accepted-Language': 'en-US, en;q=0.5'
    }
    
    # Send a GET request to the provided URL with the specified header
    response = requests.get(url, headers=header)
    
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all elements with class 'a-size-base-plus a-color-base' to extract brand names
    brand = soup.find_all('span', {'class': 'a-size-base-plus a-color-base'})
    
    # Find all elements with class 'a-size-base-plus a-color-base a-text-normal' to extract descriptions
    description = soup.find_all('span', {'class': 'a-size-base-plus a-color-base a-text-normal'})

    # Find all elements with class 'a-price-whole' to extract prices
    price = soup.find_all('span', {'class': 'a-price-whole'})

    # Find all elements with class 'a-icon a-icon-star-small a-star-small-4 aok-align-bottom' to extract ratings
    ratings_list = []
    rating_tags = soup.find_all('i', class_='a-icon-star-small')
    for rate_tag in rating_tags:
        span_tag = rate_tag.find('span', class_='a-icon-alt')
        if span_tag:
            rating = span_tag.text.replace(' out of 5 stars', '')
            ratings_list.append(rating)
        else:
            ratings_list.append(np.nan)

    # Find all elements with class 'a-size-base s-underline-text' to extract rating counts
    rating_counts_list = []
    count_tags = soup.find_all('a', class_='a-link-normal s-underline-text s-underline-link-text s-link-style')
    for count_tag in count_tags:
        span_tag = count_tag.find('span', class_='a-size-base')
        if span_tag:
            rating_count = span_tag.text
            rating_counts_list.append(rating_count)
        else:
            rating_counts_list.append(np.nan)

    # Find all <a> tags with class 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal' to extract URLs
    urls = soup.find_all('a', {'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})
    
    image_urls = soup.find_all('img',{'class':'s-image'})
    
    # Extract the text content from the 'brand' elements and store them in a list
    brand_names_list = [each.text for each in brand]
    
    # Extract the text content from the 'description' elements and store them in a list
    descriptions_list = [each.text for each in description]

    # Extract the text content from the 'price' elements and store them in a list
    prices_list = [each.text for each in price]
    
    # Create a list of product URLs by appending the href attribute to 'https://www.amazon.in'
    product_link_list = ['https://www.amazon.in' + each['href'] for each in urls]
    
    image_urls_list = [each['src'] for each in image_urls]
    
    # Return the lists of brand names, descriptions, and product URLs
    return brand_names_list, descriptions_list, prices_list, ratings_list, rating_counts_list, product_link_list,image_urls_list

# Initializing Empty Lists for Amazon Data

In [9]:
amazon_product_brand_names = []
amazon_product_descriptions = []
amazon_product_prices = []
amazon_product_ratings = []
amazon_product_rating_counts = []
amazon_product_urls = []
amazon_product_image_urls = []

# Extracting Men's Clothing Data

## Mens T-Shirts

In [12]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1968120031%2Cp_36%3A4595084031&page=2&content-id=amzn1.sym.f5e83e00-a666-492b-b882-5fa6fba3548e&pd_rd_r=21025b0c-ec69-41d2-96e6-ec699afe0fee&pd_rd_w=lDWwe&pd_rd_wg=IQSQx&pf_rd_p=f5e83e00-a666-492b-b882-5fa6fba3548e&pf_rd_r=4SSBTYVCG98DM53DR4PQ&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:54<00:00,  1.81s/page]


## Mens Jeans

In [14]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1968076031&fs=true&page=2&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:50<00:00,  1.69s/page]


## Mens Coat

In [16]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1968088031&fs=true&page=2&qid=1684610738&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:52<00:00,  1.74s/page]


## Mens Sweaters

In [18]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1968077031&fs=true&page=2&qid=1684610897&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:49<00:00,  1.64s/page]


## Mens Shorts

In [20]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1968024031%2Cn%3A1968097031%2Cp_36%3A4595084031&dc&ds=v1%3ACAonSUFlRvplQjMR4po2YHdQTQTqt77VqvlhX3AzbV4&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:50<00:00,  1.69s/page]


## Mens Innerwear

In [22]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1968024031%2Cn%3A1968126031%2Cp_36%3A4595084031&dc&ds=v1%3AamBqkG9tRXdF92oUTAhQ5Qbj%2BFFoD4G2UK0YqVZOtYw&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:56<00:00,  1.89s/page]


## Mens Sweat-Shirt

In [24]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1968024031%2Cn%3A1968062031%2Cn%3A11960414031%2Cp_36%3A4595084031&dc&ds=v1%3AxDSbDq%2B2SUuQKwIW2gtvVo7%2B1qBfiXTqQEc8kLl%2FdqQ&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:51<00:00,  1.73s/page]


## Mens Cap & Hats

In [26]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1968024031%2Cn%3A1968025031%2Cn%3A1968039031%2Cp_36%3A4595084031&dc&ds=v1%3AoJMUPwTypCtbidK%2B751yVOPz0Gni3ze748mEJYLt%2BeY&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:57<00:00,  1.92s/page]


# Extracting Women's Clothing Data

## Womens Kurtis

In [28]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1968255031&fs=true&page=2&qid=1684611025&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:57<00:00,  1.93s/page]


## Womens Western-Wear

In [30]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A11400137031&fs=true&page=2&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:53<00:00,  1.79s/page]


## Womens Salwar Suits

In [32]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A3723380031&fs=true&page=2&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:48<00:00,  1.61s/page]


## Womens Sarees

In [34]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1968256031&fs=true&page=2&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:50<00:00,  1.70s/page]


## Womens Lingerie

In [36]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1953602031%2Cn%3A1968253031%2Cn%3A3723378031%2Cp_36%3A4595084031&dc&ds=v1%3AmyGvtxUXcSm0yh4fabDKCcMhC5uKhyhNZUZim9lnoto&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:55<00:00,  1.84s/page]


## Womens Swimsuits

In [38]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1953602031%2Cn%3A1968457031%2Cp_36%3A4595084031&dc&ds=v1%3AXiCoBLbxoth98UdFRKvaYnh83XVdXHeFJe0g5w%2F6%2Fg0&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:52<00:00,  1.75s/page]


## Womens Lehnga Choli

In [40]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1953602031%2Cn%3A1968533031%2Cp_36%3A4595084031&dc&ds=v1%3AQvOBKs6yyfX0W5zwsecPHG8DUyvgqvkmXv%2FLWCzO%2Fls&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:50<00:00,  1.70s/page]


## Womens Cap & Hats

In [42]:
# Set the total number of pages
total_pages = 30

# Initialize the progress bar
progress_bar = tqdm(total=total_pages, unit='page')

# Iterate over each page
for page in range(1, total_pages + 1):
    # Create the URL for the specific page
    url = f'https://www.amazon.in/s?i=apparel&rh=n%3A1571271031%2Cn%3A1953602031%2Cn%3A1968391031%2Cn%3A1968404031%2Cp_36%3A4595084031&dc&ds=v1%3AtIpvPOnl9ZyOez%2FDe2n%2Fp558PEDOJcU8oUeV9YL309k&rnid=1571271031&ref=sr_pg_{page}'
    
    # Scrape data from the Amazon page using the provided URL
    brand_names, descriptions, prices, ratings, rating_counts, urls, image_urls = scrap_data_from_amazon(url)
    
    # Extend the respective lists with the obtained data from the current page
    amazon_product_brand_names.extend(brand_names)
    amazon_product_descriptions.extend(descriptions)
    amazon_product_prices.extend(prices)
    amazon_product_ratings.extend(ratings)
    amazon_product_rating_counts.extend(rating_counts)
    amazon_product_urls.extend(urls)
    amazon_product_image_urls.extend(image_urls)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

100%|██████████| 30/30 [00:49<00:00,  1.64s/page]


# Converting These Lists into a DataFrame

In [62]:
# Create a dictionary named 'data' to store the data for the DataFrame
# The keys in the dictionary represent the column names in the DataFrame
# The values are lists containing the data for each column

brand_names_list = amazon_product_brand_names
descriptions_list = amazon_product_descriptions
prices_list = amazon_product_prices
ratings_list = amazon_product_ratings
rating_counts_list = amazon_product_rating_counts
product_link_list = amazon_product_urls
image_url_list = amazon_product_image_urls

print("brand_names_list: ", len(brand_names_list))
print("descriptions_list: ", len(descriptions_list))
print("prices_list: ", len(prices_list))
print("ratings_list: ", len(ratings_list))
print("rating_counts_list: ", len(rating_counts_list))
print("product_link_list: ", len(product_link_list))
print("image_url_list: ", len(image_url_list))

min_length = min(len(brand_names_list), len(descriptions_list), len(prices_list), len(ratings_list), len(rating_counts_list), len(product_link_list), len(image_url_list))

brand_names_list = brand_names_list[:min_length]
descriptions_list = descriptions_list[:min_length]
prices_list = prices_list[:min_length]
ratings_list = ratings_list[:min_length]
rating_counts_list = rating_counts_list[:min_length]
product_link_list = product_link_list[:min_length]
image_url_list = image_url_list[:min_length]

data = {'brand': brand_names_list, 'description': descriptions_list, 'price': prices_list, 'rating': ratings_list, 'rating_count': rating_counts_list, 'url': product_link_list,'img':image_url_list}

# Convert the 'data' dictionary to a DataFrame using the pd.DataFrame() function
# Each key in the 'data' dictionary will become a column in the DataFrame
df = pd.DataFrame(data)
df

brand_names_list:  26723
descriptions_list:  26723
prices_list:  26586
ratings_list:  23404
rating_counts_list:  35763
product_link_list:  26723
image_url_list:  26723


Unnamed: 0,brand,description,price,rating,rating_count,url,img
0,Amazon Brand - House & Shields,Men Polo Shirt,229,3.2,,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/71XLDZOOQS...
1,Crafting Club,Mens Round Neck Regular Fit Short Sleeve,299,5.0,13,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/518bD5f9yI...
2,DIXCY SCOTT MAXIMUS,Men's T-Shirt,299,3.5,,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/61yyx18xx9...
3,DIXCY SCOTT MAXIMUS,Men's Regular Fit T-Shirt (PR17539PL_DSM CHARC...,299,3.5,,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/61qAUyHxvm...
4,London Hills,Men Printed Round Neck Full Sleeve Regular Fit...,298,4.1,2,https://www.amazon.in/London-Hills-Printed-Sle...,https://m.media-amazon.com/images/I/61p21cELnE...
...,...,...,...,...,...,...,...
23399,Belanto,Women's & Girls Nipple Cover Strapless Bra Ins...,299,4.4,,https://www.amazon.in/Belanto-Strapless-Backle...,https://m.media-amazon.com/images/I/41I1CM6RMT...
23400,Bureaucrat,Nipple Cover Silicone Pasties Reusable NoShow ...,299,3.8,,https://www.amazon.in/Bureaucrat-Silicone-Reus...,https://m.media-amazon.com/images/I/51w-DZAFRe...
23401,HSR,Women Waist Shapewear with Anti Rolling Strip ...,199,5.0,1044,https://www.amazon.in/HSR-Shapewear-Rolling-Co...,https://m.media-amazon.com/images/I/61QZYAyDwx...
23402,Fearless Tape,Fearless Tape Generic Tape Double Sided Tape f...,249,2.0,,https://www.amazon.in/Fear-less-Generic-Tape-I...,https://m.media-amazon.com/images/I/71ko4Tq+b1...


# Saving DataFrame as .csv

In [67]:
# Save the DataFrame as a CSV file with the filename 'clothing_data.csv'
# Set the parameter 'index' to False to exclude the row index labels from the CSV file
df.to_csv('clothing_data.csv', index=False)