# Wikipedia album detail core code

In [None]:
import json
import re
import pandas as pd

from bs4 import BeautifulSoup

In [None]:
class DocumentWordsDB:
    """Persist word lists keyed by ``(category, doc_id)``.

    Word lists are serialized to JSON and kept in the ``document_words``
    table of the DB-API connection handed to the constructor. The SQL uses
    ``?`` placeholders and ``ON CONFLICT ... DO UPDATE``.
    """

    def __init__(self, db):
        """Remember the connection *db* and make sure the schema exists."""
        self.db = db
        self.init_schema()

    def init_schema(self):
        """Create the table and its unique index if missing, then commit."""
        cursor = self.db.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS document_words (
                category VARCHAR(100) NOT NULL,
                doc_id VARCHAR(100) NOT NULL,
                words TEXT
            )
        """)
        # The unique index is what the ON CONFLICT clause in save() targets.
        cursor.execute("""
            CREATE UNIQUE INDEX IF NOT EXISTS i_document_words_pk
            ON document_words (category, doc_id)
        """)
        self.db.commit()

    def save(self, category, doc_id, words):
        """Insert, or overwrite, the word list for ``(category, doc_id)``.

        *words* is stored as its JSON encoding. This method does not commit;
        committing is left to the caller.
        """
        encoded = json.dumps(words)
        cursor = self.db.cursor()
        cursor.execute("""
            INSERT INTO document_words (category, doc_id, words)
            VALUES(?, ?, ?) 
            ON CONFLICT(category, doc_id) 
            DO UPDATE SET words = excluded.words
        """, (category, doc_id, encoded))

    def get(self, category, doc_id):
        """Return the decoded word list for ``(category, doc_id)``.

        Returns ``None`` when no row exists for that key.
        """
        cursor = self.db.cursor()
        cursor.execute("""
            SELECT words
            FROM document_words
            WHERE category = ? AND doc_id = ?
        """, (category, doc_id))
        row = cursor.fetchone()
        return None if row is None else json.loads(row[0])

In [None]:
class AlbumDetailWPDocWords:
    """Logic for extracting words from an album detail document."""

    # One or more of these punctuation/whitespace characters separate words.
    WORD_SPLIT_RE = re.compile(r'[\'\":;,\.\/\!\(\)\[\]{}\s=&\?]+')
    # Applied after lower-casing: a token must contain at least one a-z letter.
    HAS_LETTER_RE = re.compile(r'[a-z]')

    @classmethod
    def get_words(cls, html):
        """Return the lower-cased words found in the text of *html*.

        The markup is stripped with BeautifulSoup, the remaining text is
        split on ``WORD_SPLIT_RE``, and a token is kept only when it is
        longer than one character and contains at least one ASCII letter.
        Order and duplicates are preserved.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed; html.parser ships with
        # Python.
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text(' ')
        lowered = (
            token.lower()
            for token in cls.WORD_SPLIT_RE.split(text)
            if len(token) > 1
        )
        return [word for word in lowered if cls.HAS_LETTER_RE.search(word)]

    @classmethod
    def get_detail_words(cls, url):
        """Return the words of the cached page for *url*, or ``None``.

        Reads from a module-level ``url_cache`` object, which must exist in
        the enclosing namespace and provide ``get(url)``. ``None`` is
        returned when the cache has no entry for *url*.
        """
        # NOTE(review): `url_cache` is not defined in this class; presumably
        # it is a notebook-level global -- confirm.
        page = url_cache.get(url)
        if page is not None:
            return cls.get_words(page['content'])
        return None
        
class AlbumDetailWPDocWordsRetriever:
    """Processor for getting and storing word lists for album details."""

    # Category under which album detail word lists are stored.
    CATEGORY = 'wp_album_year_list_detail'

    def __init__(self, url_cache, db):
        """Set up the retriever.

        *url_cache* must provide ``get(url)`` returning ``None`` or a mapping
        with a ``'content'`` entry. *db* is the database connection; it is
        used for commits and handed to ``DocumentWordsDB``.
        """
        self.db = db
        self.url_cache = url_cache
        self.doc_words_db = DocumentWordsDB(self.db)

    def get_words(self, doc_id, url):
        """Return the word list for *doc_id*, computing and storing it if needed.

        A stored list is returned as-is. Otherwise the page for *url* is
        looked up in the URL cache and its words are extracted. When *url*
        is ``None`` or the cache has no entry, an empty list is used. The
        result, including an empty one, is saved before being returned; the
        commit is left to the caller.
        """
        words = self.doc_words_db.get(self.CATEGORY, doc_id)
        if words is not None:
            return words

        page = self.url_cache.get(url) if url is not None else None
        if page is None:
            words = []
        else:
            words = AlbumDetailWPDocWords.get_words(page['content'])
        self.doc_words_db.save(self.CATEGORY, doc_id, words)
        return words

    def process_albums(self, albums):
        """Ensure a stored word list exists for every row of *albums*.

        *albums* must provide ``iterrows()`` yielding ``(id, row)`` pairs
        whose row has an ``AlbumLink`` attribute. Work is committed every
        1000 albums and once more at the end, with progress printed.
        """
        # Imported here because the progress message needs `dt` and the
        # module-level imports visible in this file do not provide it.
        import datetime as dt

        count = 0
        for count, (_id, album) in enumerate(albums.iterrows(), start=1):
            self.get_words(_id, album.AlbumLink)
            if count % 1000 == 0:
                # Commit in batches so a long run does not hold one huge
                # transaction.
                self.db.commit()
                print(f"[{dt.datetime.now()}] Processed {count} albums.")

        self.db.commit()
        print(f"Processed {count} albums.")