In [4]:
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

def download_assets(url, output_dir):
    # Create the output directory if it doesn't already exist
    os.makedirs(output_dir, exist_ok=True)

    # Send a GET request to the URL and parse the HTML content
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all of the script, image, and link tags in the HTML content
    script_tags = soup.find_all('script')
    img_tags = soup.find_all('img')
    link_tags = soup.find_all('link')
    total_assets = len(script_tags) + len(img_tags) + len(link_tags)
    assets_downloaded = 0

    # Download each asset and save it to the appropriate subdirectory
    for tag in script_tags + img_tags + link_tags:
        # Determine the asset URL and download it
        asset_url = urljoin(url, tag.get('src') or tag.get('href'))
        asset_response = requests.get(asset_url)

        # Determine the path to the asset's output directory and create it if necessary
        parsed_url = urlparse(asset_url)
        path = parsed_url.path
        if path.startswith('/'):
            path = path[1:]
        output_path = os.path.join(output_dir, os.path.dirname(path))
        os.makedirs(output_path, exist_ok=True)

        # Save the asset to the appropriate file in the output directory
        filename = os.path.basename(path)
        with open(os.path.join(output_path, filename), 'wb') as f:
            f.write(asset_response.content)
        
        # Increment the asset download count and print the status
        assets_downloaded += 1
        print(f"Downloaded {assets_downloaded}/{total_assets} assets", end='\r')
        
    print("\nDone!")

if __name__ == '__main__':
    # Replace this URL with the website you want to scrape
    url = 'http://akkhi-techno-bright.com/'

    # Replace this output directory with the directory you want to save the assets to
    output_dir = '/Users/rentsher/Desktop/badri/scraping_test/'

    download_assets(url, output_dir)


Downloaded 1/72 assets

IsADirectoryError: [Errno 21] Is a directory: '/Users/rentsher/Desktop/badri/scraping_test/'

In [8]:
import csv
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import concurrent.futures

def get_random_user_agent():
    ua = UserAgent()
    return ua.random

def extract_content(url, headers=None):
    try:
        print(f"Accessing URL: {url}")
        response = session.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all content from div elements with class 'sc-gEvEer gWlAOS'
        print("Extracting content...")
        elements = soup.find_all('div', class_='sc-gEvEer gWlAOS')
        extracted_content = [element.get_text(strip=True) for element in elements]

        for i, content in enumerate(extracted_content, 1):
            print(f"Content #{i}: {content}")

        return extracted_content

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            print("Access Denied (403). Try using headers to mimic a legitimate user agent.")
        else:
            print(f"HTTP Error: {e}")
        return []

def save_to_csv(file_path, content):
    with open(file_path, 'a', newline='', encoding='utf-8') as csvfile:  # Use 'a' for append mode
        csv_writer = csv.writer(csvfile)
        csv_writer.writerows([[item] for item in content])

def process_content(output_csv_path):
    base_url = "https://yourstory.com/companies/search?page="
    sector = "&sector=Marketplace&hitsPerPage=300"

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(extract_content, f"{base_url}{i}{sector}", headers={'User-Agent': get_random_user_agent()}) for i in range(1, 6)]  # Adjust the range as needed
        concurrent.futures.wait(futures)
        results = [future.result() for future in futures]

    all_content = [item for sublist in results for item in sublist]

    print(f"Saving content to {output_csv_path}...")
    save_to_csv(output_csv_path, all_content)

if __name__ == "__main__":
    output_csv_path = "/Users/rentsher/Desktop/AllContentOutput.csv"

    # Truncate the existing output file or create a new one
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Content'])

    session = requests.Session()
    session.headers.update({'User-Agent': get_random_user_agent()})

    process_content(output_csv_path)


Accessing URL: https://yourstory.com/companies/search?page=1&sector=Marketplace&hitsPerPage=300Accessing URL: https://yourstory.com/companies/search?page=2&sector=Marketplace&hitsPerPage=300

Accessing URL: https://yourstory.com/companies/search?page=3&sector=Marketplace&hitsPerPage=300
Accessing URL: https://yourstory.com/companies/search?page=4&sector=Marketplace&hitsPerPage=300
Accessing URL: https://yourstory.com/companies/search?page=5&sector=Marketplace&hitsPerPage=300
Access Denied (403). Try using headers to mimic a legitimate user agent.Access Denied (403). Try using headers to mimic a legitimate user agent.
Access Denied (403). Try using headers to mimic a legitimate user agent.
Access Denied (403). Try using headers to mimic a legitimate user agent.Access Denied (403). Try using headers to mimic a legitimate user agent.


Saving content to /Users/rentsher/Desktop/AllContentOutput.csv...


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import requests

def get_image_links(domain_url, output_path='output_images.csv', min_image_size_kb=300):
    driver = None

    try:
        # Set up Chrome options in headless mode
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        
        # Initialize Chrome WebDriver
        driver = webdriver.Chrome(options=chrome_options)

        # Fetch HTML content of the website using Selenium
        driver.get(domain_url)

        # Extract image links
        image_links = []
        for img_tag in driver.find_elements_by_tag_name('img'):
            img_url = img_tag.get_attribute('src')

            # Fetch image data and check size
            img_response = requests.get(img_url)
            img_size_kb = len(img_response.content) / 1024

            # If image size is not less than min_image_size_kb, add to the list
            if img_size_kb >= min_image_size_kb:
                image_links.append(img_url)
                print(f'Image URL (Size: {img_size_kb} KB): {img_url}')

        # Save the image URLs to CSV
        with open(output_path, 'w', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerows([[url] for url in image_links])

    except Exception as e:
        print(f"Error: {e}")

    finally:
        if driver:
            # Close the WebDriver
            driver.quit()

# Example usage
domain_url = "https://flipkart.com"
get_image_links(domain_url)


Unable to obtain driver using Selenium Manager: Selenium Manager failed for: /Users/rentsher/opt/anaconda3/lib/python3.9/site-packages/selenium/webdriver/common/macos/selenium-manager --browser chrome --output json.
The chromedriver version cannot be discovered


Error: Message: Selenium Manager failed for: /Users/rentsher/opt/anaconda3/lib/python3.9/site-packages/selenium/webdriver/common/macos/selenium-manager --browser chrome --output json.
The chromedriver version cannot be discovered

