In [17]:
import hashlib
import json
import os
from io import StringIO
from urllib.parse import quote, urljoin, urlparse

import boto3
import markitdown as md
import pandas as pd
import requests
from bs4 import BeautifulSoup

## S3 Configuration

In [18]:
# Credentials are read from the environment so that they never live in the
# notebook, its outputs, or version control. When the variables are unset the
# values are None, which makes boto3 fall back to its default credential chain
# (shared credentials file, instance/role credentials, ...).
# NOTE: the key pair that used to be hardcoded in this cell must be treated as
# compromised and rotated in IAM.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Bucket and region keep their previous values as defaults but can be
# overridden per environment without editing the notebook.
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME", "scrapedimages")
S3_REGION = os.environ.get("S3_REGION", "us-east-2")

## Function to generate an MD5 hash of a file (used to detect duplicate images)

In [19]:
def generate_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte reads, so it
    is never held in memory in full.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()

## Function to upload files to S3

In [20]:
def upload_to_s3(file_path, bucket_name, s3_key, aws_access_key, aws_secret_key, region):
    """Upload a local file to S3 and return its virtual-hosted-style URL.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        s3_key: Object key to store the file under.
        aws_access_key: AWS access key id passed to the boto3 client.
        aws_secret_key: AWS secret access key passed to the boto3 client.
        region: AWS region used for the client and for the returned URL.

    Returns:
        The ``https://<bucket>.s3.<region>.amazonaws.com/<key>`` URL of the
        uploaded object, or ``None`` if the upload raised (the error is
        printed, not propagated).
    """
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        region_name=region
    )
    try:
        s3.upload_file(file_path, bucket_name, s3_key)
    except Exception as e:
        print(f"Failed to upload {file_path} to S3: {e}")
        return None

    # The key is stored verbatim in S3, but inside a URL characters such as
    # spaces, '?' or '#' must be percent-encoded or the link will not resolve
    # to the object. '/' is kept so the key's folder structure stays readable.
    encoded_key = quote(s3_key, safe="/")
    s3_url = f"https://{bucket_name}.s3.{region}.amazonaws.com/{encoded_key}"
    print(f"Uploaded {file_path} to {s3_url}")
    return s3_url

## Function to scrape images and upload them to S3

In [21]:
def scrape_images(url, save_directory="images"):
    """Download the images referenced by a page and upload the unique ones to S3.

    Every ``<img src=...>`` on the page at ``url`` is resolved against ``url``,
    downloaded, de-duplicated by the MD5 digest of its bytes (the same digest
    ``generate_hash`` would produce for the saved file), written to
    ``save_directory`` and uploaded under the ``scraped_images/`` key prefix.

    Args:
        url: Address of the page to scrape.
        save_directory: Local directory for the downloaded files; created if
            it does not exist.

    Returns:
        List of S3 URLs for the images that were uploaded successfully.

    Raises:
        requests.HTTPError: If the page itself cannot be fetched. Failures on
            individual images are printed and skipped instead.
    """
    os.makedirs(save_directory, exist_ok=True)
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    image_urls = []
    seen_hashes = set()
    used_names = set()
    for img_tag in soup.find_all('img'):
        src = img_tag.get('src')
        if not src:
            continue
        img_url = urljoin(url, src)
        parsed = urlparse(img_url)
        if parsed.scheme not in ("http", "https"):
            # e.g. inline "data:" URIs, which cannot be fetched over HTTP.
            print(f"Skipping non-HTTP image source: {img_url[:80]}")
            continue
        # Use only the URL path for the name so query strings / fragments
        # ("logo.png?v=3") do not leak into local file names and S3 keys.
        img_name = os.path.basename(parsed.path)
        if not img_name:
            print(f"Skipping image without a usable file name: {img_url}")
            continue
        try:
            img_response = requests.get(img_url, headers=headers, timeout=30)
            # Without this, 404/500 error pages would be stored as "images".
            img_response.raise_for_status()
            img_data = img_response.content

            # Hash before writing so duplicates never touch the disk and
            # cannot overwrite the copy that was kept.
            img_hash = hashlib.md5(img_data).hexdigest()
            if img_hash in seen_hashes:
                print(f"Duplicate image skipped: {img_url}")
                continue
            seen_hashes.add(img_hash)

            # Different images may share a basename (/a/icon.png, /b/icon.png);
            # disambiguate instead of silently overwriting the earlier one.
            if img_name in used_names:
                img_name = f"{img_hash[:8]}_{img_name}"
            used_names.add(img_name)

            img_path = os.path.join(save_directory, img_name)
            with open(img_path, 'wb') as img_file:
                img_file.write(img_data)

            s3_key = f"scraped_images/{img_name}"
            s3_url = upload_to_s3(img_path, S3_BUCKET_NAME, s3_key, AWS_ACCESS_KEY, AWS_SECRET_KEY, S3_REGION)
            if s3_url:
                image_urls.append(s3_url)
        except Exception as e:
            print(f"Failed to download {img_url}: {e}")
    return image_urls

## Function to scrape tables

In [22]:
def scrape_tables(url, output_directory="tables"):
    """Save every ``<table>`` found on a page as a CSV file.

    Args:
        url: Address of the page to scrape.
        output_directory: Directory for the ``table_<n>.csv`` files (numbered
            from 1 in document order); created if it does not exist.

    Returns:
        List of paths of the CSV files that were written. Tables that fail to
        parse or save are reported with a printed message and skipped.

    Raises:
        requests.HTTPError: If the page cannot be fetched.
    """
    os.makedirs(output_directory, exist_ok=True)
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table_files = []
    for number, table in enumerate(soup.find_all('table'), start=1):
        try:
            # pandas deprecated passing literal HTML to read_html; wrapping the
            # markup in StringIO is the supported way to parse an in-memory
            # string and avoids the FutureWarning.
            df = pd.read_html(StringIO(str(table)))[0]
            file_path = os.path.join(output_directory, f"table_{number}.csv")
            df.to_csv(file_path, index=False)
        except Exception as e:
            print(f"Failed to parse table {number}: {e}")
            continue
        table_files.append(file_path)
        print(f"Saved table {number} to {file_path}")
    return table_files

## Function to scrape metadata

In [23]:
def scrape_metadata(url, output_file="metadata.json"):
    """Extract the title and ``<meta>`` tags of a page and save them as JSON.

    Args:
        url: Address of the page to scrape.
        output_file: Path of the JSON file to write. Defaults to
            ``metadata.json`` in the current working directory.

    Returns:
        The path of the JSON file that was written. The file holds a ``title``
        string and a ``meta_tags`` list with one ``{"name", "content"}`` entry
        per ``<meta>`` element (either value is null when the attribute is
        absent).

    Raises:
        requests.HTTPError: If the page cannot be fetched.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.title.string is None when <title> is empty or contains nested
    # markup, so check the string itself rather than just the tag.
    title = soup.title.string if soup.title else None
    metadata = {
        "title": str(title) if title else "No Title",
        "meta_tags": [
            {"name": tag.get("name"), "content": tag.get("content")}
            for tag in soup.find_all("meta")
        ]
    }
    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(metadata, file, indent=4)
    return output_file

## Function to save markdown file using markitdown

In [24]:
def save_markdown(metadata_file, table_files, image_urls, output_file="output.md"):
    """Start a Markdown document that contains the scraped page metadata.

    Loads the JSON stored at ``metadata_file`` and adds it, re-serialised with
    4-space indentation, as a ``json`` code block under a level-1 "Metadata"
    heading of a new ``md.Document``.

    NOTE(review): in the body shown here ``table_files``, ``image_urls`` and
    ``output_file`` are never used, nothing is written to disk, and the
    function falls off the end (returns None), so the document it builds is
    discarded. The function looks unfinished -- confirm the intended
    behaviour before relying on it.

    NOTE(review): confirm that the installed ``markitdown`` package actually
    provides ``Document`` / ``add_heading`` / ``add_codeblock``; this API is
    assumed from the calls below and is not verified here.
    """
    doc = md.Document()

    # Add Metadata
    doc.add_heading("Metadata", level=1)
    with open(metadata_file, "r", encoding="utf-8") as file:
        metadata = json.load(file)
        # Round-trip through json.dumps so the embedded block is pretty-printed.
        doc.add_codeblock(json.dumps(metadata, indent=4), language="json")

## Adding Images, Tables and saving Markdown file

In [None]:
# Notebook-level document for the image section. This is a new object; the
# ``doc`` built inside save_markdown is local to that function.
doc = md.Document()

doc.add_heading("Images", level=1)

# NOTE(review): ``image_urls`` is not assigned at notebook level in the cells
# shown; presumably it should hold the list returned by scrape_images(...).
# On a fresh kernel this loop would raise NameError -- confirm.
for img_url in image_urls:
        # Markdown image syntax with the fixed alt text "Image".
        doc.add_paragraph(f"![Image]({img_url})")