# Wikipedia Album Detail Download

In [1]:
import os
import re
from urllib.parse import urljoin
import datetime as dt

import requests
from bs4 import BeautifulSoup
import sqlite3
import pandas as pd
import numpy as np
import scipy
import scipy.sparse

import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans

from IPython.display import display, HTML, clear_output

In [2]:
%run ./url_cache.ipynb
%run ./wikipedia_access.ipynb
%run ./wikipedia_album_detail_lib.ipynb

## Configure DB and Load Album Extract

In [3]:
# Open the SQLite database that is shared by the album extract and the URL
# cache (the same connection is passed to both helpers below).
# sqlite3.connect() is the documented factory for Connection objects, so use
# it rather than instantiating sqlite3.Connection directly.
# The path is relative: it is resolved against the kernel's working directory.
db = sqlite3.connect("var/data/music.db")

In [4]:
# Content cache built on the shared SQLite connection; the download loop
# further down fetches pages through url_cache.get(...).
# NOTE(review): URLContentCacheSqlite is not defined in this notebook —
# presumably it comes from the %run of ./url_cache.ipynb; confirm.
url_cache = URLContentCacheSqlite(db)

# NOTE(review): WikipediaAlbumYearListsDB is also defined outside this
# notebook (presumably in one of the %run notebooks) — confirm which one.
wayl_db = WikipediaAlbumYearListsDB(db)
# Full album extract. Later cells use it like a pandas DataFrame and read its
# Links, Artist, Album and Year columns.
albums = wayl_db.get_all()

### Get Albums With Detail Links

In [5]:
def get_album_link(links):
    """Return the href of the first 'Album' link, or None if there is none.

    Parameters
    ----------
    links : mapping or None
        Maps a link category (e.g. 'Album') to a sequence of link records,
        each of which is indexed by 'href'.

    Returns
    -------
    The 'href' value of the first record under 'Album', or None when
    ``links`` is None, has no 'Album' key, or the 'Album' sequence is empty.
    """
    if links is None:
        return None
    if 'Album' not in links:
        return None

    album_entries = links['Album']
    if len(album_entries) == 0:
        return None

    # Only the first album link is used; any further entries are ignored.
    return album_entries[0]['href']
    
# Add an AlbumLink column holding each row's album detail href, or None when
# the row's Links value has no usable 'Album' entry.
albums['AlbumLink'] = albums['Links'].apply(get_album_link)

In [6]:
# Keep only the albums that have an album detail link; rows whose AlbumLink
# is null are left out.
has_detail_link = albums['AlbumLink'].notna()
albums_w_detail_links = albums[has_detail_link]

### Download Links

In [8]:
# Find out which URLs are already in the cache so that only the missing album
# detail pages are requested.
cached_urls = pd.read_sql("""
    select url
    from url_cache
""", db)

downloaded_urls = set(cached_urls.url)
# NOTE(review): this counts every row of url_cache; if the cache also stores
# non-album pages the number overstates the album details — confirm.
print(f"Already downloaded {len(downloaded_urls)} album details.")

d_count = len(downloaded_urls)

# Several album rows can point at the same detail page; request each missing
# URL only once so the progress counter is not inflated by repeats.
pending_albums = (
    albums_w_detail_links[~albums_w_detail_links.AlbumLink.isin(downloaded_urls)]
    .drop_duplicates(subset="AlbumLink")
)

for _, album_row in pending_albums.iterrows():
    print(f"[{d_count + 1}] Getting: {album_row.Artist} - {album_row.Album} [{album_row.Year}]")
    # NOTE(review): url_cache.get presumably fetches the page and stores it in
    # url_cache; a None result is reported as "link not found" — confirm.
    r = url_cache.get(album_row.AlbumLink)
    if r is None:
        print("[WARNING] link not found.")
    d_count += 1

    # Periodically wipe the cell output so the log stays short. wait=True
    # defers the clear until the next output arrives, which avoids flicker.
    if (d_count % 25) == 0:
        clear_output(wait=True)

Already downloaded 29504 album details.
[29505] Getting: The Aislers Set - The Last Match (album) [2000]
[29506] Getting: The Business/Dropkick Murphys - Mob Mentality [2000]
[29507] Getting: Happyland - Welcome To... Happyland [1998]
[29508] Getting: Zumpano - Look What The Rookie Did [1995]
[29509] Getting: Björk - The Best Mixes from the Album Debut for All the People Who Don't Buy white Labels [1994]
[29510] Getting: Prince - The black Album [1994]
[29511] Getting: Atkins - Atkins [1982]
[29512] Getting: Robocar Poli - Scott Polin [1973]
[29513] Getting: Ptolomy Pscycon - Loose Capacitor [1971]
[29514] Getting: Booker T & Priscilla Coolidge - Booker T. & Priscilla [1971]
