In [23]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Crawl the BuzzFeed India front page, follow each link on it, and inspect the
# images on the linked pages until MAX_IMAGES images larger than
# MIN_IMAGE_BYTES have been found.

# Scrape configuration -- tune these instead of editing the loops below.
START_URL = 'https://www.buzzfeed.com/in'
MAX_IMAGES = 5                # stop after this many large images
MIN_IMAGE_BYTES = 10 * 1024   # an image counts only if its body is larger than this
REQUEST_TIMEOUT = 10          # seconds; without a timeout a stalled host hangs the cell

# Make an HTTP request to the website and retrieve the HTML content
response = requests.get(START_URL, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # do not parse an error page as if it were the front page
html = response.text

# Parse the HTML using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

# Extract all the <a> tags from the HTML
a_tags = soup.find_all('a')

# Initialize a counter for images above 10KB and counter for retrieved images
# (both names are kept; they are always incremented together)
num_images_above_10kb = 0
num_retrieved_images = 0

# URLs already visited, so the same page/image is never fetched or counted twice
seen_page_urls = set()
seen_img_urls = set()

# Loop through each <a> tag and extract the href link
for a_tag in a_tags:
    if num_retrieved_images >= MAX_IMAGES:
        break  # Break out of the loop once desired count is reached

    href = a_tag.get('href')
    if not href:
        # urljoin(base, None) returns the base URL, which would silently
        # re-crawl the front page for every anchor without an href.
        continue

    # Join the href link with the base URL
    page_url = urljoin(response.url, href)
    if not page_url.startswith(('http://', 'https://')):
        continue  # mailto:, javascript:, tel: ... are not fetchable pages
    if page_url in seen_page_urls:
        continue
    seen_page_urls.add(page_url)
    print(f'Page URL: {page_url}')

    try:
        # Make an HTTP request to the page URL and retrieve the HTML content
        page_response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        page_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'Error occurred while making request: {e}')
        print('Skipping this page.')
        print('---')
        continue

    # Parse the HTML using Beautiful Soup for the page
    page_soup = BeautifulSoup(page_response.text, 'html.parser')

    # Loop through each <img> tag and extract the image src link
    for img_tag in page_soup.find_all('img'):
        if num_retrieved_images >= MAX_IMAGES:
            break  # Break out of the loop once desired count is reached

        img_src = img_tag.get('src')
        if not img_src:
            continue  # e.g. lazy-loaded images that only carry data-src/srcset

        # Join the image src link with the base URL
        img_url = urljoin(page_response.url, img_src)
        if not img_url.startswith(('http://', 'https://')):
            continue  # inline data: URIs cannot be requested over HTTP
        if img_url in seen_img_urls:
            continue
        seen_img_urls.add(img_url)
        print(f'Image URL: {img_url}')

        try:
            # Make an HTTP request to the image URL and retrieve the headers
            img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            # An HTML error body must not be measured as if it were an image
            img_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f'Error occurred while making request: {e}')
            print('Skipping this image.')
            print('---')
            continue

        # Convert headers to array format (one single-entry dict per header)
        headers_array = [{key: value} for key, value in img_response.headers.items()]

        print('Headers Data:')
        print(headers_array)

        # Check the size/weight of the image
        img_size = len(img_response.content)
        if img_size > MIN_IMAGE_BYTES:
            num_images_above_10kb += 1
            num_retrieved_images += 1
            print(f'Size/Weight: {img_size} bytes (above {MIN_IMAGE_BYTES // 1024}KB)')

            # Check if the desired count is reached
            if num_retrieved_images >= MAX_IMAGES:
                print(f'{MAX_IMAGES} images above {MIN_IMAGE_BYTES // 1024}KB retrieved. Exiting...')
                break  # Break out of the loop once desired count is reached
        else:
            print(f'Size/Weight: {img_size} bytes')

        print('---')

print(f'Total number of images above {MIN_IMAGE_BYTES // 1024}KB: {num_images_above_10kb}')


Page URL: https://www.buzzfeed.com?origin=tb
Image URL: https://img.buzzfeed.com/buzzfeed-static/static/2023-04/17/10/campaign_images/2eb4f357329a/31-hilarious-ramadan-memes-that-will-have-you-gas-3-1452-1681727536-0_dblwide.jpg?output-format=auto&output-quality=100
Headers Data:
[{'Connection': 'keep-alive'}, {'Content-Length': '17772'}, {'Content-Type': 'image/jpeg'}, {'Etag': '"19v52jdcKUsLM/1lEmTfj1G0UOuail4ts2+RdLpeO7c"'}, {'Fastly-Io-Info': 'ifsz=101823 idim=625x220 ifmt=jpeg ofsz=17772 odim=625x220 ofmt=jpeg'}, {'Fastly-Stats': 'io=1'}, {'X-Amz-Id-2': '1qi1CmYAg3QIZyT5mgBI/Hudq+r2LwSL1jYI84F7pVCVtXXSp2x7ipeCeeIClsuy/gNdE5Teb8ECN3toDgl/iA=='}, {'X-Amz-Replication-Status': 'COMPLETED'}, {'X-Amz-Request-Id': 'PB037MPWAT6Z0DVT'}, {'X-Amz-Server-Side-Encryption': 'AES256'}, {'X-Amz-Version-Id': '1BzSF42PbVuLresIS5eP7huSkpnb4mZe'}, {'Via': '1.1 varnish, 1.1 varnish, 1.1 varnish'}, {'Server': 'FastlyIO'}, {'Cache-Control': 'public, max-age=86400'}, {'Accept-Ranges': 'bytes'}, {'Date': 

Headers Data:
[{'Connection': 'keep-alive'}, {'Content-Length': '12078'}, {'Content-Type': 'image/jpeg'}, {'Etag': '"HbOy26i7BOVkjjQOoxFz47O9vXjPzoBnZa3CkPBIlBg"'}, {'Fastly-Io-Info': 'ifsz=257278 idim=625x415 ifmt=jpeg ofsz=12078 odim=300x199 ofmt=jpeg'}, {'Fastly-Stats': 'io=1'}, {'X-Amz-Id-2': '7MceB9QfAl4F63JXHqI5Rr9LdkYWM6A8JkY9uRWXjbrLvAHLsOT8nm7+H/yp4RWYLb6bXDpbFdI='}, {'X-Amz-Replication-Status': 'COMPLETED'}, {'X-Amz-Request-Id': 'AJ6WKZ59HWTP4E98'}, {'X-Amz-Server-Side-Encryption': 'AES256'}, {'X-Amz-Version-Id': 'bDITEg1mRZARMsYOju1mR4uWpWd9Lx47'}, {'Via': '1.1 varnish, 1.1 varnish, 1.1 varnish'}, {'Server': 'FastlyIO'}, {'Cache-Control': 'public, max-age=86400'}, {'Accept-Ranges': 'bytes'}, {'Date': 'Mon, 24 Apr 2023 07:03:18 GMT'}, {'Age': '1441871'}, {'Timing-Allow-Origin': '*'}, {'Access-Control-Allow-Origin': '*'}, {'X-Served-By': 'cache-iad-kiad7000070-IAD, cache-iad-kiad7000156-IAD, cache-maa10250-MAA'}, {'X-Cache': 'MISS, HIT, HIT'}, {'X-Cache-Hits': '0, 22, 2'}, {'X

In [27]:
pip install scrapy


Note: you may need to restart the kernel to use updated packages.


In [3]:
import scrapy

class BuzzfeedSpider(scrapy.Spider):
    """Follow every link on the BuzzFeed India front page and size the images found.

    Crawl flow: ``parse`` (front page) -> ``parse_page`` (each linked page)
    -> ``parse_image`` (each image on that page). The spider closes itself
    once five images larger than 10 KB have been seen.
    """

    name = 'buzzfeed'
    start_urls = ['https://www.buzzfeed.com/in']
    # Running count of downloaded images whose body is larger than 10 KB.
    num_images_above_10kb = 0

    def parse(self, response):
        """Yield one request per usable link on the start page."""
        # Extract all the <a> tags from the HTML
        for a_tag in response.css('a'):
            # .get() instead of ['href']: anchors without an href would raise KeyError
            href = a_tag.attrib.get('href')
            if not href:
                continue

            page_url = response.urljoin(href)
            # scrapy.Request raises ValueError for URLs such as mailto: or
            # javascript:, which would abort the rest of this callback.
            if not page_url.startswith(('http://', 'https://')):
                continue
            self.logger.info(f'Page URL: {page_url}')

            # Make a request to the page URL
            yield scrapy.Request(page_url, callback=self.parse_page)

    def parse_page(self, response):
        """Yield one request per image with a usable ``src`` on a linked page."""
        # Extract all the <img> tags from the page HTML
        for img_tag in response.css('img'):
            # Lazy-loaded images may have no src attribute at all
            img_src = img_tag.attrib.get('src')
            if not img_src:
                continue

            img_url = response.urljoin(img_src)
            # data: is kept because Scrapy accepts it as a request URL
            if not img_url.startswith(('http://', 'https://', 'data:')):
                continue
            self.logger.info(f'Image URL: {img_url}')

            # Make a request to the image URL and retrieve the headers
            yield scrapy.Request(img_url, callback=self.parse_image)

    def parse_image(self, response):
        """Log the image's headers and size; close the spider at five large images.

        Raises:
            scrapy.exceptions.CloseSpider: once at least five images above
                10 KB have been counted.
        """
        # Imported here so the lookup does not rely on `import scrapy` having
        # loaded the `scrapy.exceptions` submodule as a side effect.
        from scrapy.exceptions import CloseSpider

        img_size = len(response.body)

        # Convert headers to array format (one single-entry dict per header).
        # Scrapy's Headers may yield each value as a list of bytes, on which
        # .decode() does not exist, so both shapes are handled here.
        headers_array = []
        for key, raw_value in response.headers.items():
            if isinstance(raw_value, bytes):
                raw_value = [raw_value]
            joined = b', '.join(raw_value)
            headers_array.append(
                {key.decode(errors='replace'): joined.decode(errors='replace')}
            )

        self.logger.info('Headers Data:')
        self.logger.info(headers_array)

        # Check the size/weight of the image
        if img_size > 10 * 1024:
            self.num_images_above_10kb += 1
            self.logger.info(f'Size/Weight: {img_size} bytes (above 10KB)')

            # Check if the desired count is reached. >= rather than ==:
            # responses already in flight can push the count past 5.
            if self.num_images_above_10kb >= 5:
                self.logger.info('5 images above 10KB retrieved. Exiting...')
                raise CloseSpider('5 images above 10KB retrieved')

        else:
            self.logger.info(f'Size/Weight: {img_size} bytes')

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import requests

def get_image_links(domain_url, output_path='output_images.csv', min_image_size_kb=300):
    """Collect URLs of large images on a page and write them to a CSV file.

    The page is rendered in headless Chrome via Selenium, every ``<img>``
    ``src`` is downloaded with ``requests``, and URLs whose body is at least
    ``min_image_size_kb`` kilobytes are written one per row to ``output_path``.

    Args:
        domain_url: URL of the page to load.
        output_path: Destination CSV path (overwritten if it exists).
        min_image_size_kb: Minimum image size, in KB, for a URL to be kept.

    Returns:
        None. Errors are printed rather than raised (best-effort behavior).
    """
    # Local import: find_elements(By.TAG_NAME, ...) replaces
    # find_elements_by_tag_name, which was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver = None

    try:
        # Set up Chrome options in headless mode
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')

        # Initialize Chrome WebDriver
        driver = webdriver.Chrome(options=chrome_options)

        # Fetch HTML content of the website using Selenium
        driver.get(domain_url)

        # Extract image links
        image_links = []
        for img_tag in driver.find_elements(By.TAG_NAME, 'img'):
            img_url = img_tag.get_attribute('src')

            # Skip images with no src and inline data: URIs; requests.get would
            # raise on them and abort the whole run before the CSV is written.
            if not img_url or not img_url.startswith(('http://', 'https://')):
                continue

            # Fetch image data and check size; one bad image must not stop the rest
            try:
                img_response = requests.get(img_url, timeout=10)
                img_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error: {e}")
                continue
            img_size_kb = len(img_response.content) / 1024

            # If image size is not less than min_image_size_kb, add to the list
            if img_size_kb >= min_image_size_kb:
                image_links.append(img_url)
                print(f'Image URL (Size: {img_size_kb} KB): {img_url}')

        # Save the image URLs to CSV
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerows([[url] for url in image_links])

    except Exception as e:
        # Deliberately broad: driver start-up and page-load failures are
        # reported instead of raised so the notebook keeps running.
        print(f"Error: {e}")

    finally:
        if driver is not None:
            # Close the WebDriver
            driver.quit()

# Example usage
domain_url = "https://flipkart.com"
get_image_links(domain_url)


Unable to obtain driver using Selenium Manager: Selenium Manager failed for: /Users/rentsher/opt/anaconda3/lib/python3.9/site-packages/selenium/webdriver/common/macos/selenium-manager --browser chrome --output json.
The chromedriver version cannot be discovered


Error: Message: Selenium Manager failed for: /Users/rentsher/opt/anaconda3/lib/python3.9/site-packages/selenium/webdriver/common/macos/selenium-manager --browser chrome --output json.
The chromedriver version cannot be discovered

