# Wikipedia Artist Detail Download

In [1]:
%run ./wikipedia_setup.ipynb

## Configure DB and Load Album Extract

In [2]:
# Location of the SQLite database, relative to the notebook's working directory.
DB_PATH = "var/data/music.db"

# Single shared connection used by every cell below.
db = sqlite3.Connection(DB_PATH)

In [3]:
# URL content cache built on the shared SQLite connection; later cells read its
# `url_cache` table directly and fetch pages through `url_cache.get(url)`.
url_cache = URLContentCacheSqlite(db)

# Load the full album extract from the same database.
# NOTE(review): `albums` is used below as a pandas DataFrame with at least
# `Artist` and `Links` columns — confirm against WikipediaAlbumYearListsDB.get_all().
wayl_db = WikipediaAlbumYearListsDB(db)
albums = wayl_db.get_all()

### Get Artist Links

In [4]:
def get_artist_link(links):
    """Return the href of the first 'Artist' link in ``links``, or None.

    Parameters
    ----------
    links : mapping or None
        Link collection for one album row; the 'Artist' entry, when present,
        is a sequence of link records (assumed to be dict-like with an
        'href' key — TODO confirm against the album extract).

    Returns
    -------
    The 'href' of the first artist link, or None when ``links`` is None, has
    no 'Artist' entry, the entry is empty, or the first record has no 'href'.
    """
    if links is None or 'Artist' not in links:
        return None

    artist_links = links['Artist']
    # An empty (or None) list used to raise IndexError and abort the whole apply().
    if not artist_links:
        return None

    # .get() instead of ['href'] so a record without a target yields None
    # rather than a KeyError.
    return artist_links[0].get('href')
    
# Derive one artist URL (or None) per album row from its link collection.
albums['ArtistLink'] = albums['Links'].apply(get_artist_link)

In [5]:
# One row per artist name that has a link, keeping only the name and link
# columns and ordering the result by artist name.
artists = (
    albums
    .loc[albums['ArtistLink'].notnull()]
    .drop_duplicates('Artist')
    .loc[:, ['Artist', 'ArtistLink']]
    .copy()
    .sort_values(by='Artist')
)

In [13]:
# Collect every URL that already has an entry in the cache table.
downloaded_urls = set(pd.read_sql("""
    select url
    from url_cache
""", db).url)

# Flag artists whose page is already cached and report how many there are.
artists['PrevDownloaded'] = artists.ArtistLink.isin(downloaded_urls)
d_count = artists['PrevDownloaded'].sum()
print(f"Previously downloaded: {d_count}")

# Request the remaining pages through the cache, continuing the running count
# from the number that were already present.
pending = artists.loc[~artists['PrevDownloaded'], ['Artist', 'ArtistLink']]
for artist_name, artist_link in zip(pending['Artist'], pending['ArtistLink']):
    print(f"[{d_count + 1}] Getting: {artist_name}")
    url_cache.get(artist_link)
    d_count += 1
    # Wipe the cell output every 25 artists so the progress log stays short.
    if d_count % 25 == 0:
        clear_output()

Previously downloaded: 11134
[11135] Getting: Ashtrayhead
[11136] Getting: Pulkas
[11137] Getting: Sisters Of Glory
[11138] Getting: The black Crowes
[11139] Getting: Wildside
[11140] Getting: Wizardthrone


## Get A Single Link Per Artist

In [17]:
# Re-read the cache table so pages fetched by the download cell are included.
downloaded_urls = set(pd.read_sql("""
    select url
    from url_cache
""", db).url)

# Keep only artists whose page is in the cache and discard the helper flag.
# errors='ignore' keeps this cell re-runnable: on a second run the
# 'PrevDownloaded' column is already gone and a plain drop() raises KeyError.
artists = (
    artists[artists.ArtistLink.isin(downloaded_urls)]
    .drop(columns='PrevDownloaded', errors='ignore')
)

In [20]:
# Sort by name, then link, and keep the first row per artist so the surviving
# link does not depend on the incoming row order.
# NOTE(review): `artists` was already de-duplicated on 'Artist' when it was
# first built from `albums`, so this drop_duplicates looks like a no-op unless
# that earlier cell changes — confirm which link per artist is intended.
artists = (
    artists
    .sort_values(by=['Artist', 'ArtistLink'])
    .drop_duplicates(subset='Artist')
)

## Process Artist HTML and Extract Text

In [22]:
def get_all_url_text(urls, parser=None, progress_every=100):
    """Return the parsed text of the cached page behind every URL in ``urls``.

    Each URL is looked up with ``url_cache.get``; the ``'content'`` of the
    returned record is parsed with BeautifulSoup and the document's ``.text``
    is used as the result for that URL.

    Parameters
    ----------
    urls : pandas.Series
        URLs to process (any object exposing a compatible ``apply`` works).
    parser : str, optional
        Forwarded to BeautifulSoup as ``features``. The default ``None`` lets
        BeautifulSoup pick a parser, exactly as before; pass an explicit name
        (e.g. ``"html.parser"``) to pin the parser.
    progress_every : int, optional
        Print a timestamped progress line each time this many URLs have been
        processed. ``0`` or ``None`` disables progress output.

    Returns
    -------
    The result of ``urls.apply``: the page text per URL, or None where the
    cache returned nothing or the lookup/parsing raised.
    """
    p_count = 0

    def get_url_text(url):
        """Return the text for one URL, or None; always advances the counter."""
        nonlocal p_count
        try:
            # The cache lookup is inside the try: url_cache.get appears to
            # fetch on a miss, and a failure there should be reported as a
            # warning for this URL instead of aborting the whole apply().
            h = url_cache.get(url)
            if h is None:
                return None
            return BeautifulSoup(h['content'], features=parser).text
        except Exception as e:
            # Best effort: report the failure and leave this URL's text empty.
            print(f"[WARN] Couldn't download or parse {url} - {e}")
            return None
        finally:
            # Runs on every path (success, miss, failure) so the progress
            # count reflects URLs attempted, not URLs that succeeded.
            p_count += 1
            if progress_every and (p_count % progress_every) == 0:
                print(f"[{dt.datetime.now()}] Processed {p_count} urls.")

    return urls.apply(get_url_text)

# Attach the extracted page text for each artist link as a new column.
artists['ArtistDetailText'] = get_all_url_text(artists['ArtistLink'])

[2021-10-30 10:08:48.995274] Processed 100 urls.
[2021-10-30 10:09:16.660085] Processed 200 urls.
[2021-10-30 10:09:43.000200] Processed 300 urls.
[2021-10-30 10:10:10.104714] Processed 400 urls.
[2021-10-30 10:10:35.283309] Processed 500 urls.
[2021-10-30 10:11:01.629205] Processed 600 urls.
[2021-10-30 10:11:28.894114] Processed 700 urls.
[2021-10-30 10:11:58.722298] Processed 800 urls.
[2021-10-30 10:12:27.047044] Processed 900 urls.
[2021-10-30 10:12:54.327798] Processed 1000 urls.
[2021-10-30 10:13:27.379760] Processed 1100 urls.
[2021-10-30 10:13:53.257401] Processed 1200 urls.
[2021-10-30 10:22:20.426897] Processed 1300 urls.
[2021-10-30 10:22:43.765982] Processed 1400 urls.
[2021-10-30 10:23:17.141421] Processed 1500 urls.
[2021-10-30 10:23:47.456642] Processed 1600 urls.
[2021-10-30 10:24:10.944356] Processed 1700 urls.
[2021-10-30 10:24:38.739454] Processed 1800 urls.
[2021-10-30 10:25:08.979454] Processed 1900 urls.
[2021-10-30 10:25:38.198487] Processed 2000 urls.
[2021-10-

## Save Artist Info

In [24]:
# Spot-check five random rows before writing the table out.
artists.sample(n=5)

Unnamed: 0_level_0,Artist,ArtistLink,ArtistDetailText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
149,Dave,https://en.wikipedia.org/wiki/Dave_(rapper),\n\n\n\nDave (rapper) - Wikipedia\n\n\n\n\n\n\...
1593,Bowerbirds,https://en.wikipedia.org/wiki/Bowerbirds_(band),\n\n\n\nBowerbirds (band) - Wikipedia\n\n\n\n\...
2338,Tiësto,https://en.wikipedia.org/wiki/Ti%C3%ABsto,\n\n\n\nTiësto - Wikipedia\n\n\n\n\n\n\n\n\n\n...
4904,X Japan,https://en.wikipedia.org/wiki/X_Japan,\n\n\n\nX Japan - Wikipedia\n\n\n\n\n\n\n\n\n\...
27494,Bootsy Collins,https://en.wikipedia.org/wiki/Bootsy_Collins,\n\n\n\nBootsy Collins - Wikipedia\n\n\n\n\n\n...


In [35]:
# Write the artist table to the database, replacing any previous version and
# leaving the DataFrame index out of the stored columns.
artists.to_sql(name='artist', con=db, if_exists='replace', index=False)

In [36]:
# Enforce one row per artist name in the stored table.
# "if not exists" lets this cell be re-run on its own without failing on an
# index that is already there; try/finally closes the cursor even when the
# statement raises (for example if duplicate names violate the unique index).
cur = db.cursor()
try:
    cur.execute("create unique index if not exists i_artist_artist on artist (Artist)")
    db.commit()
finally:
    cur.close()