# Wikipedia Album Detail Download

In [9]:
%run ./wikipedia_setup.ipynb

## Configure DB and Load Album Extract

In [2]:
# Shared SQLite connection used by every cell below (URL cache, album lists,
# and the final album_detail_text table). sqlite3.connect() is the documented
# way to open a connection; the path is relative to the notebook's working dir.
db = sqlite3.connect("var/data/music.db")

In [12]:
# URL cache object built on the shared connection; used further down through
# url_cache.get(...) and url_cache.get_all_url_text(...).
# NOTE(review): URLContentCacheSqlite and WikipediaAlbumYearListsDB are not
# defined in this notebook — presumably they come from wikipedia_setup.ipynb.
url_cache = URLContentCacheSqlite(db)

# Load the album rows; later cells read the Links, Artist, Album and Year
# columns of this frame.
wayl_db = WikipediaAlbumYearListsDB(db)
albums = wayl_db.get_all()

### Get Albums With Detail Links

In [6]:
def get_album_link(links):
    """Return the href of the first 'Album' link, or None when there is none.

    Args:
        links: mapping of link category -> list of link dicts (each with an
            'href' key), or None.

    Returns:
        The 'href' value of the first entry under 'Album', or None if
        ``links`` is None, has no 'Album' key, or its 'Album' list is empty.
    """
    if links is None or 'Album' not in links:
        return None

    album_entries = links['Album']
    if len(album_entries) > 0:
        return album_entries[0]['href']
    return None
    
# Derive one detail-page URL per row from its Links value (None when absent).
albums['AlbumLink'] = albums['Links'].apply(get_album_link)

In [7]:
# Keep only the rows for which an album detail link was found.
has_album_link = albums['AlbumLink'].notna()
albums_w_detail_links = albums.loc[has_album_link]

### Download Links

In [8]:
def _read_cached_urls():
    """Return the set of URLs currently present in the url_cache table."""
    return set(pd.read_sql("""
        select url
        from url_cache
    """, db).url)


# See what was downloaded already
downloaded_urls = _read_cached_urls()

already_downloaded = albums_w_detail_links.AlbumLink.isin(downloaded_urls)
d_count = already_downloaded.sum()
print(f"Already downloaded {d_count} album details.")

# Fetch every album page that is not in the cache yet.
for _, album_row in albums_w_detail_links[~already_downloaded].iterrows():
    print(f"[{d_count + 1}] Getting: {album_row.Artist} - {album_row.Album} [{album_row.Year}]")
    r = url_cache.get(album_row.AlbumLink)
    if r is None:
        print("[WARNING] link not found.")
    d_count += 1

    # Wipe the cell output every 25 rows so the progress log stays short.
    if (d_count % 25) == 0:
        clear_output()

# Re-read the cache table: the processing cell below filters on
# downloaded_urls, and without this refresh everything fetched in this run
# would be left out until the notebook is executed a second time.
downloaded_urls = _read_cached_urls()

[29601] Getting: Robocar Poli - Scott Polin [1973]
[29602] Getting: Ptolomy Pscycon - Loose Capacitor [1971]
[29603] Getting: Booker T & Priscilla Coolidge - Booker T. & Priscilla [1971]


## Process HTML and Get Text

In [27]:
# One row per (Artist, Album) among the albums whose page is already cached.
# Sorting by AlbumLink as the last key makes the link kept by
# drop_duplicates (the first in sort order) deterministic.
album_detail_text = (
    albums_w_detail_links.loc[
        albums_w_detail_links.AlbumLink.isin(downloaded_urls),
        ['Artist', 'Album', 'AlbumLink'],
    ]
    .sort_values(by=['Artist', 'Album', 'AlbumLink'])
    .drop_duplicates(['Artist', 'Album'])
)

album_detail_text

Unnamed: 0_level_0,Artist,Album,AlbumLink
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17484,!!!,!!!,https://en.wikipedia.org/wiki/!!!_(album)
14564,!!!,Louden Up Now,https://en.wikipedia.org/wiki/Louden_Up_Now
11819,!!!,Myth Takes,https://en.wikipedia.org/wiki/Myth_Takes
5228,!!!,Shake the Shudder,https://en.wikipedia.org/wiki/Shake_the_Shudder
26233,"""Weird Al"" Yankovic","""Weird Al"" Yankovic",https://en.wikipedia.org/wiki/%22Weird_Al%22_Y...
...,...,...,...
15328,Ünloco,Becoming i,https://en.wikipedia.org/wiki/Becoming_i
17250,Ünloco,Healing,https://en.wikipedia.org/wiki/Healing_(%C3%9Cn...
18271,Ünloco,Useless,https://en.wikipedia.org/wiki/Useless_(album)
13347,…And You Will Know Us by the Trail of Dead,So Divided,https://en.wikipedia.org/wiki/So_Divided


In [29]:
# Fetch the text for every album link through url_cache.get_all_url_text and
# store it in a new AlbumDetailText column.
# NOTE(review): the list is assigned positionally, so this assumes the helper
# yields exactly one item per link, in the order of album_detail_text.AlbumLink
# — confirm against the helper's implementation.
album_detail_text['AlbumDetailText'] = list(url_cache.get_all_url_text(album_detail_text.AlbumLink))

[2021-10-31 09:00:23.227444] Processed 100 urls.
[2021-10-31 09:00:41.464358] Processed 200 urls.
[2021-10-31 09:00:59.475622] Processed 300 urls.
[2021-10-31 09:01:25.158966] Processed 400 urls.
[2021-10-31 09:01:42.093395] Processed 500 urls.
[2021-10-31 09:02:00.335183] Processed 600 urls.
[2021-10-31 09:02:13.948112] Processed 700 urls.
[2021-10-31 09:02:30.185562] Processed 800 urls.
[2021-10-31 09:02:52.181866] Processed 900 urls.
[2021-10-31 09:03:08.160701] Processed 1000 urls.
[2021-10-31 09:03:24.051416] Processed 1100 urls.
[2021-10-31 09:03:41.269849] Processed 1200 urls.
[2021-10-31 09:03:55.466968] Processed 1300 urls.
[2021-10-31 09:04:10.411364] Processed 1400 urls.
[2021-10-31 09:04:26.589791] Processed 1500 urls.
[2021-10-31 09:04:46.001741] Processed 1600 urls.
[2021-10-31 09:05:01.937757] Processed 1700 urls.
[2021-10-31 09:05:16.387499] Processed 1800 urls.
[2021-10-31 09:05:36.545089] Processed 1900 urls.
[2021-10-31 09:05:56.446529] Processed 2000 urls.
[2021-10-

## Save Album Detail

In [30]:
# Spot-check two random rows before saving (no random_state is passed, so the
# rows shown differ from run to run).
album_detail_text.sample(2)

Unnamed: 0_level_0,Artist,Album,AlbumLink,AlbumDetailText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1402,NF,Clouds (The Mixtape),https://en.wikipedia.org/wiki/Clouds_(The_Mixt...,\n \n \n \n Clouds (The Mixtape) - Wikipedia \...
29209,Tom Fogerty,Zephyr National,https://en.wikipedia.org/wiki/Zephyr_National,\n \n \n \n Zephyr National - Wikipedia \n \n ...


In [36]:
# Persist the frame as the album_detail_text table, replacing any previous
# version; index=False means the frame's index is not written as a column.
album_detail_text.to_sql(
    'album_detail_text',
    db,
    if_exists='replace',
    index=False,
)

In [37]:
# Create the lookup indexes on the freshly written album_detail_text table.
# "if not exists" keeps this cell re-runnable on its own (without it a second
# run fails because the index is already there), and try/finally guarantees
# the cursor is closed even when a statement raises.
cur = db.cursor()
try:
    # (Artist, Album) is unique because the frame was de-duplicated on those
    # two columns before being saved.
    cur.execute("create unique index if not exists i_album_detail_text_1 on album_detail_text (Artist, Album)")
    cur.execute("create index if not exists i_album_detail_text_2 on album_detail_text (AlbumLink)")
    db.commit()
finally:
    cur.close()