# Scraper Toolkit

In [None]:
// scraper_toolkit.ts

// =====================
// Import Deno Standard Libraries
// =====================

import { parse as parseYaml } from "@std/yaml";
import { parse as parseToml } from "@std/toml";
import { parse as parseIni } from "@std/ini";
import { parse as parseJSONC } from "jsr:@std/jsonc";
import { delay } from "jsr:@std/async";
import { TokenBucket } from "jsr:@std/collections";
import { DOMParser } from "jsr:@std/dom";

// =====================
// Interfaces and Types
// =====================

/**
 * Shape of a scraper configuration after it has been parsed from a config file.
 *
 * Exported because `scraper_facade.ts` imports this type by name from this
 * module.
 */
export interface ScraperConfig {
  /** URL that is scraped by default. Must be non-empty to pass validation. */
  targetUrl: string;
  /** Maps an output field name to the CSS selector used to locate its element. */
  selectors: Record<string, string>;
  /** Request budget: at most `requests` requests every `perSeconds` seconds. */
  rateLimit: {
    requests: number;
    perSeconds: number;
  };
  /** Optional overrides handed to the HTTP client. */
  httpClient?: {
    /** Number of retries after the first failed attempt. */
    maxRetries?: number;
    /** Base delay between retries, in milliseconds. */
    backoffFactor?: number;
    /** Headers sent with every request. */
    headers?: Record<string, string>;
  };
}

// =====================
// Config Manager
// =====================

/**
 * Loads a scraper configuration from a local file path or a URL, parses it
 * according to its file extension, and validates the required fields.
 */
export class ConfigManager {
  private config: ScraperConfig = {
    targetUrl: "",
    selectors: {},
    rateLimit: { requests: 1, perSeconds: 1 },
  };

  /**
   * @param configPathOrUrl Local file path or URL of the configuration file.
   */
  constructor(private configPathOrUrl: string) {}

  /**
   * Reads, parses and validates the configuration.
   *
   * @returns The validated configuration.
   * @throws Error if the source cannot be fetched, the extension is not
   *   supported, the parsed content is not an object, or validation fails.
   */
  async loadConfig(): Promise<ScraperConfig> {
    const source = this.configPathOrUrl;
    const fileContent = this.isValidUrl(source)
      ? await this.fetchRemoteConfig(source)
      : await Deno.readTextFile(source);

    const ext = this.getFileExtension(source);

    let parsed: unknown;
    switch (ext) {
      case "yaml":
      case "yml":
        parsed = parseYaml(fileContent);
        break;
      case "toml":
        parsed = parseToml(fileContent);
        break;
      case "ini":
        parsed = parseIni(fileContent);
        break;
      case "jsonc":
      case "json":
        parsed = parseJSONC(fileContent);
        break;
      case "txt":
      case "text":
        // Plain text has no structured fields: the result is the current
        // config plus the raw content. With the built-in defaults
        // (`targetUrl: ""`) this is rejected by validateConfig() below.
        parsed = { ...this.config, content: fileContent };
        break;
      default:
        throw new Error(`Unsupported config file format: .${ext}`);
    }

    // Parsers can yield null, scalars or arrays (e.g. an empty YAML document);
    // reject those up front instead of failing later with a TypeError.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      throw new Error(
        "Configuration Error: config file must contain an object at the top level.",
      );
    }

    this.config = parsed as ScraperConfig;
    this.validateConfig();
    return this.config;
  }

  /**
   * Reports whether `str` should be treated as a URL rather than a local path.
   *
   * A single-letter scheme is treated as a Windows drive prefix: the URL
   * parser accepts `C:\dir\file.json` as a URL with scheme `c:`, which would
   * otherwise send local Windows paths to `fetch`.
   */
  private isValidUrl(str: string): boolean {
    try {
      // `protocol` includes the trailing colon, so "c:" has length 2.
      return new URL(str).protocol.length > 2;
    } catch {
      return false;
    }
  }

  /**
   * Downloads the configuration text from `url`.
   *
   * @throws Error if the response status is not OK.
   */
  private async fetchRemoteConfig(url: string): Promise<string> {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(
        `Failed to fetch remote config from ${url}: ${response.status} ${response.statusText}`,
      );
    }
    return await response.text();
  }

  /**
   * Returns the lower-cased extension of the file named by `path`, or `""`
   * when the file name contains no dot.
   *
   * For URLs only the path component is inspected, so dots in the query
   * string or fragment are ignored; for local paths only the final segment is
   * inspected, so dots in directory names are ignored.
   */
  private getFileExtension(path: string): string {
    const location = this.isValidUrl(path) ? new URL(path).pathname : path;
    const fileName = location.split(/[\\/]/).pop() ?? "";
    const dot = fileName.lastIndexOf(".");
    return dot === -1 ? "" : fileName.slice(dot + 1).toLowerCase();
  }

  /**
   * Checks that the loaded configuration has a target URL, at least one
   * selector, and a usable rate limit.
   *
   * @throws Error describing the first problem found.
   */
  private validateConfig() {
    const { targetUrl, selectors, rateLimit } = this.config;

    if (!targetUrl) {
      throw new Error("Configuration Error: 'targetUrl' is required.");
    }
    if (!selectors || Object.keys(selectors).length === 0) {
      throw new Error("Configuration Error: 'selectors' must have at least one entry.");
    }
    if (
      !rateLimit ||
      typeof rateLimit.requests !== "number" ||
      typeof rateLimit.perSeconds !== "number"
    ) {
      throw new Error(
        "Configuration Error: 'rateLimit' must have 'requests' and 'perSeconds' as numbers.",
      );
    }
    // `typeof x === "number"` also accepts NaN, Infinity, zero and negatives,
    // none of which describe a usable request rate.
    const usable = (value: number): boolean => Number.isFinite(value) && value > 0;
    if (!usable(rateLimit.requests) || !usable(rateLimit.perSeconds)) {
      throw new Error(
        "Configuration Error: 'rateLimit.requests' and 'rateLimit.perSeconds' must be positive, finite numbers.",
      );
    }
  }
}

// =====================
// HTTP Client
// =====================

/** Optional settings accepted by the `HttpClient` constructor. */
export interface HttpClientOptions {
  /** Headers attached to the requests issued by the client. */
  headers?: Record<string, string>;
  /** Retries allowed after the first attempt; the client uses 3 when omitted. */
  maxRetries?: number;
  /** Base retry delay in milliseconds; the client uses 500 when omitted. */
  backoffFactor?: number;
}

/**
 * Thin wrapper around `fetch` that adds default headers and retries
 * transient failures with a linearly growing delay.
 */
export class HttpClient {
  private maxRetries: number;
  private backoffFactor: number;
  private headers: Headers;

  /**
   * @param options Retry count (default 3), base backoff in milliseconds
   *   (default 500) and default headers.
   */
  constructor(options: HttpClientOptions = {}) {
    this.maxRetries = options.maxRetries ?? 3;
    this.backoffFactor = options.backoffFactor ?? 500;
    this.headers = new Headers(options.headers);
  }

  /**
   * Fetches `url`, retrying on network errors and on 429 / 5xx responses.
   *
   * The delay before retry number `k` is `backoffFactor * k` milliseconds.
   * Responses with any other status (including non-retryable error statuses)
   * are returned to the caller as-is.
   *
   * @param url Address to request.
   * @param options Extra `fetch` options. Headers given here are merged with
   *   the client's default headers and win on conflict.
   * @returns The first response that does not call for a retry.
   * @throws Error once all attempts (`maxRetries + 1`) have failed.
   */
  async fetchWithRetry(
    url: string,
    options: RequestInit = {},
  ): Promise<Response> {
    const headers = this.mergeHeaders(options.headers);

    let attempt = 0;
    while (attempt <= this.maxRetries) {
      try {
        const response = await fetch(url, { ...options, headers });

        if (!response.ok && this.shouldRetry(response.status)) {
          // Release the unread body before retrying so the connection is not
          // held open; a failure to cancel must not mask the HTTP error.
          await response.body?.cancel().catch(() => {});
          throw new Error(`HTTP Error: ${response.status}`);
        }

        return response;
      } catch (error) {
        attempt++;
        if (attempt > this.maxRetries) {
          // `attempt` now equals the number of requests actually made.
          throw new Error(
            `Failed to fetch ${url} after ${attempt} attempt(s): ${error}`,
          );
        }
        await delay(this.backoffFactor * attempt);
      }
    }
    // Only reachable when maxRetries is negative, so the loop never runs.
    throw new Error("Unexpected error in fetchWithRetry");
  }

  /** Combines the client's default headers with per-request headers. */
  private mergeHeaders(perRequest: RequestInit["headers"]): Headers {
    const merged = new Headers(this.headers);
    if (perRequest) {
      new Headers(perRequest).forEach((value, name) => merged.set(name, value));
    }
    return merged;
  }

  /** Reports whether `status` is 429 or in the 5xx range. */
  private shouldRetry(status: number): boolean {
    return status === 429 || (status >= 500 && status < 600);
  }
}

// =====================
// Rate Limiter
// =====================

/**
 * Token-bucket rate limiter allowing at most `requests` acquisitions per
 * `perSeconds` seconds on average, with bursts of up to `requests`.
 *
 * The bucket is implemented here directly: it starts full, refills
 * continuously at `requests / perSeconds` tokens per second, and each
 * `acquire()` consumes one token, waiting when none is available.
 */
export class RateLimiter {
  private readonly capacity: number;
  private readonly refillPerMs: number;
  private tokens: number;
  private lastRefill: number;
  // Tail of the waiter chain; keeps concurrent acquire() calls in FIFO order.
  private tail: Promise<void> = Promise.resolve();

  /**
   * @param requests Number of acquisitions allowed per window.
   * @param perSeconds Window length in seconds.
   * @throws RangeError if either argument is not a positive, finite number
   *   (such a bucket would never refill, so `acquire()` would hang forever).
   */
  constructor(requests: number, perSeconds: number) {
    const usable = (value: number): boolean => Number.isFinite(value) && value > 0;
    if (!usable(requests) || !usable(perSeconds)) {
      throw new RangeError(
        "RateLimiter: 'requests' and 'perSeconds' must be positive, finite numbers.",
      );
    }
    // A bucket smaller than one token could never satisfy acquire().
    this.capacity = Math.max(1, requests);
    this.refillPerMs = requests / (perSeconds * 1000);
    this.tokens = this.capacity;
    this.lastRefill = Date.now();
  }

  /**
   * Resolves once a token has been taken from the bucket. Concurrent callers
   * are served in the order in which they called.
   */
  acquire(): Promise<void> {
    const turn = this.tail.then(() => this.takeToken());
    this.tail = turn;
    return turn;
  }

  /** Waits until at least one token is available, then consumes it. */
  private async takeToken(): Promise<void> {
    this.refill();
    while (this.tokens < 1) {
      const waitMs = Math.ceil((1 - this.tokens) / this.refillPerMs);
      await new Promise<void>((resolve) => setTimeout(resolve, waitMs));
      // Timers may fire slightly early; re-check after refilling.
      this.refill();
    }
    this.tokens -= 1;
  }

  /** Adds the tokens earned since the last refill, capped at capacity. */
  private refill(): void {
    const now = Date.now();
    // Guard against the wall clock moving backwards.
    const elapsedMs = Math.max(0, now - this.lastRefill);
    this.tokens = Math.min(this.capacity, this.tokens + elapsedMs * this.refillPerMs);
    this.lastRefill = now;
  }
}

// =====================
// HTML Parser
// =====================

/** Dictionary from an output field name to the CSS selector for that field. */
export interface HtmlSelectors {
  [fieldName: string]: string;
}

/**
 * Turns HTML text into a document and pulls out the text of selected elements.
 */
export class HtmlParser {
  private parser: DOMParser;

  constructor() {
    const domParser = new DOMParser();
    this.parser = domParser;
  }

  /**
   * Parses `html` as a "text/html" document.
   *
   * @returns Whatever the underlying parser produces, which may be `null`.
   */
  parse(html: string): Document | null {
    const parsed = this.parser.parseFromString(html, "text/html");
    return parsed;
  }

  /**
   * Looks up the first element matching each selector and collects its
   * trimmed text content.
   *
   * @param document Document to query.
   * @param selectors Field name to CSS selector mapping.
   * @returns One entry per field; the value is `""` when no element matches
   *   or the matched element has no text content.
   */
  extractData(document: Document, selectors: HtmlSelectors): Record<string, string> {
    const extracted: Record<string, string> = {};
    Object.entries(selectors).forEach(([field, cssSelector]) => {
      const text = document.querySelector(cssSelector)?.textContent?.trim();
      extracted[field] = text ?? "";
    });
    return extracted;
  }
}

// =====================
// Export All Components
// =====================

// ConfigManager, HttpClient, RateLimiter and HtmlParser are already exported
// at their `export class` declarations above. Listing them again in an
// `export { … }` clause would export each name twice, which is a
// duplicate-export error, so no additional export statement is needed here.

In [None]:
// scraper_facade.ts

import {
  ConfigManager,
  HttpClient,
  RateLimiter,
  HtmlParser,
  ScraperConfig,
  HtmlSelectors,
} from "./scraper_toolkit.ts";

/**
 * Single entry point that wires together config loading, rate limiting,
 * HTTP fetching and HTML extraction.
 *
 * Call `initialize()` once before any scraping method.
 */
export class ScraperFacade {
  private readonly configManager: ConfigManager;
  // Collaborators are created in initialize(); null until then so that
  // "not initialized" is an explicit, checkable state.
  private httpClient: HttpClient | null = null;
  private rateLimiter: RateLimiter | null = null;
  private htmlParser: HtmlParser | null = null;
  private config: ScraperConfig | null = null;

  /**
   * @param configPathOrUrl Location of the configuration, passed to ConfigManager.
   */
  constructor(configPathOrUrl: string) {
    this.configManager = new ConfigManager(configPathOrUrl);
  }

  /**
   * Loads the configuration and builds the HTTP client, rate limiter and
   * HTML parser from it.
   *
   * @throws Whatever loading the config or constructing a component throws;
   *   in that case the facade's previous state is left untouched.
   */
  async initialize(): Promise<void> {
    // Build everything first and commit at the end, so a failure part-way
    // through cannot leave a half-initialized facade behind.
    const config = await this.configManager.loadConfig();
    const httpClient = new HttpClient(config.httpClient);
    const rateLimiter = new RateLimiter(
      config.rateLimit.requests,
      config.rateLimit.perSeconds,
    );
    const htmlParser = new HtmlParser();

    this.config = config;
    this.httpClient = httpClient;
    this.rateLimiter = rateLimiter;
    this.htmlParser = htmlParser;
  }

  /**
   * Scrapes one page with the configured selectors.
   *
   * @param url Page to scrape; defaults to the configured `targetUrl`.
   * @returns Extracted text keyed by selector name.
   * @throws Error if the facade is not initialized, the fetch fails, or the
   *   HTML cannot be parsed.
   */
  async scrape(url?: string): Promise<Record<string, string>> {
    const { config, htmlParser } = this.requireInitialized();

    const document = await this.fetchDocument(url ?? config.targetUrl);
    if (!document) {
      throw new Error("Failed to parse HTML.");
    }

    return htmlParser.extractData(document, config.selectors);
  }

  /**
   * Scrapes the given URLs one after another with the configured selectors.
   *
   * A page whose HTML cannot be parsed is skipped with a warning; a fetch
   * failure rejects the whole call.
   *
   * @param urls Pages to scrape, processed in order.
   * @returns Extracted data keyed by URL.
   * @throws Error if the facade is not initialized or a fetch fails.
   */
  async scrapeMultipleUrls(urls: string[]): Promise<Record<string, Record<string, string>>> {
    const { config, htmlParser } = this.requireInitialized();

    const results: Record<string, Record<string, string>> = {};

    for (const url of urls) {
      const document = await this.fetchDocument(url);
      if (!document) {
        console.warn(`Failed to parse HTML for URL: ${url}`);
        continue;
      }

      results[url] = htmlParser.extractData(document, config.selectors);
    }

    return results;
  }

  /**
   * Replaces the selectors used by subsequent scrapes.
   *
   * @throws Error if the facade is not initialized.
   */
  setCustomSelectors(selectors: HtmlSelectors): void {
    if (!this.config) {
      throw new Error("Scraper not initialized. Call initialize() first.");
    }
    this.config.selectors = selectors;
  }

  /** Returns the loaded configuration, or `null` before initialization. */
  getConfig(): ScraperConfig | null {
    return this.config;
  }

  /** Waits for a rate-limit slot, downloads `url`, and parses the body as HTML. */
  private async fetchDocument(url: string): Promise<Document | null> {
    const { httpClient, rateLimiter, htmlParser } = this.requireInitialized();

    await rateLimiter.acquire();
    const response = await httpClient.fetchWithRetry(url);
    const html = await response.text();

    return htmlParser.parse(html);
  }

  /** Returns the initialized components, or throws if initialize() has not completed. */
  private requireInitialized() {
    const { config, httpClient, rateLimiter, htmlParser } = this;
    if (!config || !httpClient || !rateLimiter || !htmlParser) {
      throw new Error("Scraper not initialized. Call initialize() first.");
    }
    return { config, httpClient, rateLimiter, htmlParser };
  }
}

| Feature                     | Description                                                                                     |
|-----------------------------|-------------------------------------------------------------------------------------------------|
| `ScraperFacade`             | Encapsulates all the complexity of the scraping process.                                      |
| `initialize()`              | Sets up all necessary components based on the configuration.                                   |
| `scrape()`                  | Provides a simple interface to perform a scrape operation on the configured URL.              |
| `scrapeMultipleUrls()`      | Scrapes each URL in a list in turn, using the configured selectors and rate limit, and returns the results keyed by URL. |
| `setCustomSelectors()`      | Allows for runtime modification of the selectors, providing flexibility.                       |
| `getConfig()`               | Allows access to the current configuration if needed.                                         |
| Usage                       | Here's how you would use this facade in a Supabase Edge Function or standalone script:        |


In [None]:
import { ScraperFacade } from "./scraper_facade.ts";

/**
 * Convenience wrapper around `ScraperFacade` for use from an entry point
 * such as an edge function or a script.
 */
export class Actor {
  private scraperFacade: ScraperFacade;

  /**
   * @param configPathOrUrl Location of the scraper configuration.
   */
  constructor(configPathOrUrl: string) {
    this.scraperFacade = new ScraperFacade(configPathOrUrl);
  }

  /** Factory equivalent to `new Actor(configPathOrUrl)`. */
  static create(configPathOrUrl: string): Actor {
    return new Actor(configPathOrUrl);
  }

  /** Loads the configuration and prepares the underlying facade. */
  async initialize(): Promise<void> {
    await this.scraperFacade.initialize();
  }

  /**
   * Scrapes the page at `url` with the configured selectors.
   *
   * @param url Page to scrape.
   * @returns Extracted text keyed by selector name.
   * @throws Error if the facade is not initialized, the fetch fails, or the
   *   page's HTML could not be parsed.
   */
  async scrapeSingle(url: string): Promise<Record<string, string>> {
    // The facade's scrape() only targets the configured URL, so route the
    // requested URL through the multi-URL method, which takes explicit URLs.
    const results = await this.scraperFacade.scrapeMultipleUrls([url]);
    const data = results[url];
    if (data === undefined) {
      // scrapeMultipleUrls omits URLs whose HTML could not be parsed.
      throw new Error(`Failed to parse HTML for URL: ${url}`);
    }
    return data;
  }

  /**
   * Scrapes several pages in order.
   *
   * @returns Extracted data keyed by URL.
   */
  async scrapeMultiple(urls: string[]): Promise<Record<string, Record<string, string>>> {
    return this.scraperFacade.scrapeMultipleUrls(urls);
  }

  /**
   * Replaces the selectors used by subsequent scrapes.
   *
   * The parameter type is derived from the facade so that this module does
   * not need a separate import for the selector type.
   */
  setSelectors(selectors: Parameters<ScraperFacade["setCustomSelectors"]>[0]): void {
    this.scraperFacade.setCustomSelectors(selectors);
  }

  /**
   * Returns the facade's current configuration (`null` before
   * initialization). The return type is derived from the facade.
   */
  getConfiguration(): ReturnType<ScraperFacade["getConfig"]> {
    return this.scraperFacade.getConfig();
  }
}

In [None]:
// supabase_edge_function.ts

import puppeteer from 'https://deno.land/x/puppeteer@16.2.0/mod.ts';
import { Actor } from './actor.ts';

const actor = Actor.create(Deno.env.get('CONFIG_PATH_OR_URL') || 'default_config.json');

// Initialization is shared across requests. Re-initializing per request would
// reload the config and rebuild the scraper's components every time, so any
// rate limiting state would be reset on each call.
let initialization: Promise<void> | null = null;

/** Initializes the actor on first use; a failed attempt is retried on the next call. */
function ensureInitialized(): Promise<void> {
  if (!initialization) {
    initialization = actor.initialize().catch((err) => {
      initialization = null;
      throw err;
    });
  }
  return initialization;
}

/** Builds a JSON response with the given status. */
function jsonResponse(body: unknown, status = 200): Response {
  return new Response(JSON.stringify(body), {
    headers: { 'Content-Type': 'application/json' },
    status,
  });
}

// HTTP entry point: scrapes the page named by the `url` query parameter
// (falling back to a placeholder URL) and returns the extracted fields as JSON.
Deno.serve(async (req) => {
  try {
    const url = new URL(req.url).searchParams.get('url') || 'http://www.example.com';

    // The URL comes from the caller, so accept only well-formed http(s) URLs.
    // NOTE(review): this does not block internal or private hosts; add an
    // allow-list if this endpoint is reachable by untrusted clients.
    let target: URL;
    try {
      target = new URL(url);
    } catch {
      return jsonResponse({ error: `Invalid 'url' parameter: ${url}` }, 400);
    }
    if (target.protocol !== 'http:' && target.protocol !== 'https:') {
      return jsonResponse({ error: "'url' must use http or https." }, 400);
    }

    await ensureInitialized();
    const result = await actor.scrapeSingle(url);

    return jsonResponse(result);
  } catch (e) {
    console.error(e);
    // A caught value is not guaranteed to be an Error instance.
    const message = e instanceof Error ? e.message : String(e);
    return jsonResponse({ error: message }, 500);
  }
});