# URL Cache

In [1]:
import datetime as dt
import io
import gzip

import requests
from bs4 import BeautifulSoup

class GzipUtil:
    """Helpers that turn text into gzip-compressed bytes and back again."""

    @staticmethod
    def compress_string(s):
        """Return ``s`` encoded as UTF-8 and wrapped in a gzip stream (bytes)."""
        sink = io.BytesIO()
        payload = s.encode('utf-8')
        with gzip.GzipFile(mode="w", fileobj=sink) as writer:
            writer.write(payload)
        # The gzip trailer is only flushed when the writer closes, so the
        # buffer is read after leaving the ``with`` block.
        return sink.getvalue()

    @staticmethod
    def decompress_string(b):
        """Return the text obtained by gunzipping ``b`` and decoding it as UTF-8."""
        with gzip.GzipFile(mode="r", fileobj=io.BytesIO(b)) as reader:
            raw = reader.read()
        return raw.decode('utf-8')

class URLContentCacheSqlite:
    """Retrieve URL content using a SQLite based cache.

    Response bodies are stored gzip-compressed in the ``url_cache`` table,
    one row per URL (enforced by a unique index on ``url``). A URL is only
    downloaded when no row for it exists yet.
    """

    def __init__(self, db):
        """Store the connection and make sure the cache schema exists.

        Args:
            db: Open database connection; it is used through ``cursor()``
                and ``commit()``.
        """
        self.db = db
        self._init_schema()

    def _init_schema(self):
        """Create the ``url_cache`` table and its unique URL index if missing."""
        cur = self.db.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS url_cache (
                url VARCHAR(500) NOT NULL,
                content BLOB,
                content_type VARCHAR(200),
                created_date TIMESTAMP NOT NULL
            )
        """)

        cur.execute("""
            CREATE UNIQUE INDEX IF NOT EXISTS i_url_cache_pk ON url_cache (url)
        """)
        self.db.commit()

    def get(self, url):
        """Return the cache record for ``url``, downloading it on a miss.

        Args:
            url: Address to look up and, if not cached, fetch with
                ``requests.get``.

        Returns:
            dict with keys ``content`` (decompressed text), ``content_type``
            and ``created_date``.

        Raises:
            Exception: If the download answers with a status other than 200.
        """
        cached = self.get_cache(url)
        if cached is not None:
            return cached

        resp = requests.get(url)
        if resp.status_code != 200:
            # The response attribute is ``status_code``; ``resp.code`` does
            # not exist and used to mask this error with an AttributeError.
            raise Exception(f"Could not download url ({resp.status_code}) - {url}")
        # The Content-Type header is optional and the column is nullable, so
        # a missing header is stored as NULL instead of raising KeyError.
        self.put_cache(url, resp.text, resp.headers.get('Content-Type'))
        # Read the row back so that hits and misses return the same shape.
        return self.get_cache(url)

    def get_cache(self, url):
        """Return the stored record for ``url``, or ``None`` when not cached.

        Args:
            url: Exact URL used as the cache key.

        Returns:
            dict mapping the selected column names (``content``,
            ``content_type``, ``created_date``) to their values, with
            ``content`` decompressed back to text; ``None`` on a miss.
        """
        cur = self.db.cursor()
        cur.execute("""
            SELECT content, content_type, created_date
            FROM url_cache
            WHERE url = ?
        """, [url])
        row = cur.fetchone()
        if row is None:
            return None
        record = {col_info[0]: value for col_info, value in zip(cur.description, row)}
        record['content'] = GzipUtil.decompress_string(record['content'])
        return record

    def put_cache(self, url, content, content_type):
        """Insert a new cache row for ``url`` and commit it.

        Args:
            url: Cache key. This is a plain INSERT, so a URL that is already
                present violates the unique index on ``url``.
            content: Text body; stored gzip-compressed.
            content_type: Content type string to store, or ``None``.
        """
        cur = self.db.cursor()
        cur.execute("""
            INSERT INTO url_cache VALUES(?, ?, ?, CURRENT_TIMESTAMP)
        """, [url, GzipUtil.compress_string(content), content_type])
        # Commit on the connection this instance owns, not on a global ``db``.
        self.db.commit()