<a href="https://colab.research.google.com/github/alicepicco333/The-Meme-Archive/blob/main/meme_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests-html

Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl (13 kB)
Collecting pyquery (from requests-html)
  Downloading pyquery-2.0.0-py3-none-any.whl (22 kB)
Collecting fake-useragent (from requests-html)
  Downloading fake_useragent-1.5.0-py3-none-any.whl (17 kB)
Collecting parse (from requests-html)
  Downloading parse-1.20.1-py2.py3-none-any.whl (20 kB)
Collecting bs4 (from requests-html)
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Collecting w3lib (from requests-html)
  Downloading w3lib-2.1.2-py3-none-any.whl (21 kB)
Collecting pyppeteer>=0.0.14 (from requests-html)
  Downloading pyppeteer-2.0.0-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.9/82.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyee<12.0.0,>=11.0.0 (from pyppeteer>=0.0.14->requests-html)
  Downloading pyee-11.1.0-py3-none-any.whl (15 kB)
Collecting urllib3<2.0.0,>=1.25.8 (from pyppeteer>=0.0.14->requests-html)
  Downloading urllib

In [2]:
!pip install httpx

Collecting httpx
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx
Successfully installed h11-0.14.0 httpcore-1.0.4 httpx-0.27.0


In [23]:
import os
import json
import requests
import random
import time
from bs4 import BeautifulSoup

def scrape_additional_metadata(soup):
    # Add your logic to scrape additional metadata here
    return {}

def scrape_knowyourmeme(url, session):
    try:
        response = session.get(url)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(f"Error: {e.response.status_code} Client Error: {e.response.reason} for url: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text content from all <p> tags
    paragraphs = soup.find_all('p')
    content = [p.get_text(strip=True) for p in paragraphs]

    title_tag = soup.find('meta', property='og:title')
    caption_tag = soup.find('div', class_='bodycopy')
    image_tag = soup.find('meta', property='og:image')

    title = title_tag['content'] if title_tag else None
    caption = caption_tag.get_text(strip=True) if caption_tag else None
    image_url = image_tag['content'] if image_tag else None

    additional_metadata = scrape_additional_metadata(soup)

    metadata = {
        'title': title,
        'caption': caption,
        'image_url': image_url,
        'content': content,
        **additional_metadata,
    }

    return metadata

def scrape_and_save_data(url, session, downloaded_image_count, metadata_list, max_errors):
    metadata = scrape_knowyourmeme(url, session)

    if metadata:
        metadata_list.append(metadata)

        # Download and save the image only if the response status is 200
        image_url = metadata['image_url']
        image_response = session.get(image_url, allow_redirects=False)

        if image_response.status_code == 200:
            # Create the 'images' folder if it doesn't exist
            os.makedirs('images', exist_ok=True)

            # Format the image filename
            title_for_filename = metadata['title'] or 'unknown'
            caption_for_filename = metadata['caption'][:20] if metadata['caption'] else 'unknown_caption'
            image_filename = f"{downloaded_image_count:03}_{title_for_filename}_{caption_for_filename}.jpg"
            image_path = os.path.join('images', image_filename)

            try:
                # Save the image to the 'images' folder
                with open(image_path, 'wb') as file:
                    file.write(image_response.content)
            except FileNotFoundError as e:
                print(f"Error: {e} for url: {url}")
        elif image_response.status_code == 410:
            print(f"Error: {image_response.status_code} {image_response.reason} for url: {image_url}")
        else:
            print(f"Error downloading image: {image_response.status_code} {image_response.reason} for url: {image_url}")

def main():
    MAX_IMAGES = 5000
    MAX_ERRORS = 15
    metadata_list = []

    with requests.Session() as session:
        downloaded_image_count = 0
        unique_urls = set()  # To store unique URLs

        while downloaded_image_count < MAX_IMAGES:
            url = f"https://knowyourmeme.com/photos/{random.randint(1, 100000)}"

            # Check if the URL is unique, if not, continue to the next iteration
            if url in unique_urls:
                continue

            scrape_and_save_data(url, session, downloaded_image_count + 1, metadata_list, MAX_ERRORS)
            downloaded_image_count += 1
            unique_urls.add(url)  # Add the URL to the set
            time.sleep(1)  # Add a delay to avoid overloading the server

    # Save metadata to a JSON file
    with open('metadata.json', 'w', encoding='utf-8') as json_file:
        json.dump(metadata_list, json_file, ensure_ascii=False, indent=2)

if __name__ == "__main__":
    main()


Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/49278
Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/42226
Error: [Errno 2] No such file or directory: 'images/016_[Image - 72961] | Joseph Ducreux / Archaic Rap_unknown_caption.jpg' for url: https://knowyourmeme.com/photos/72961
Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/9095
Error: [Errno 2] No such file or directory: 'images/023_[Image - 40013] | On Notice/Dead to Me_unknown_caption.jpg' for url: https://knowyourmeme.com/photos/40013
Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/90547
Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/76059
Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/70671
Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/55763
Error: 410 Client Error: Gone for url: https://knowyourmeme.com/photos/24201
Error: [Errno 2] No such file or directory: 'images

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [22]:
rm -r images
