# Wikipedia Artist Detail Download

In [2]:
%run ./wikipedia_setup.ipynb

## Configure DB and Load Album Extract

In [3]:
# Open (or create) the SQLite database that holds the album extract, the URL
# cache and the artist table written at the end of this notebook.
# sqlite3.connect() is the documented factory for connections; the path is
# relative to the notebook's working directory.
db = sqlite3.connect("var/data/music.db")

In [4]:
# URL content cache bound to the same SQLite connection as the rest of the data
# (the cells below query a `url_cache` table through `db`).
url_cache = URLContentCacheSqlite(db)

# Load the album-year-list extract. `albums` is used below as a DataFrame with
# (at least) `Artist` and `Links` columns.
wayl_db = WikipediaAlbumYearListsDB(db)
albums = wayl_db.get_all()

### Get Artist Links

In [5]:
def get_artist_link(links):
    """Return the href of the first 'Artist' link, or None if there is none.

    Parameters
    ----------
    links : mapping or None
        Per-album link data, keyed by column name. The 'Artist' entry, when
        present, is a sequence of link records that each carry an 'href' key.

    Returns
    -------
    The 'href' value of the first 'Artist' link, or None when `links` is None,
    has no 'Artist' entry, or the 'Artist' entry is empty.
    """
    if links is None or 'Artist' not in links:
        return None

    artist_links = links['Artist']
    # An 'Artist' key with an empty list means "no link"; indexing [0] here
    # would raise IndexError and abort the whole .apply() pass.
    if not artist_links:
        return None

    return artist_links[0]['href']
    
# Derive one artist URL per album row; rows with no 'Artist' link get None.
albums['ArtistLink'] = albums.Links.apply(get_artist_link)

In [6]:
# Build the artist work list: one row per artist name that has a link,
# ordered by name.
# NOTE(review): de-duplication happens before the sort, so the link kept for an
# artist with several distinct links is whichever appears first in `albums`.
artists = (
    albums.loc[albums.ArtistLink.notnull()]
    .drop_duplicates(subset='Artist')
    .loc[:, ['Artist', 'ArtistLink']]
    .copy()
    .sort_values(by='Artist')
)

In [7]:
# Find out which artist pages are already in the URL cache.
downloaded_urls = set(pd.read_sql("""
    select url
    from url_cache
""", db).url)

artists['PrevDownloaded'] = artists.ArtistLink.isin(downloaded_urls)

# Running total of pages available locally; starts at the already-cached count.
d_count = artists['PrevDownloaded'].sum()
print(f"Previously downloaded: {d_count}")

# Fetch every page that is not cached yet. url_cache.get() is called for each
# missing link; its return value is not used here.
pending = artists.loc[~artists.PrevDownloaded, ['Artist', 'ArtistLink']]
for artist_name, artist_url in pending.itertuples(index=False, name=None):
    print(f"[{d_count + 1}] Getting: {artist_name}")
    url_cache.get(artist_url)
    d_count += 1
    # Wipe the cell output every 25 downloads so the progress log stays short.
    if d_count % 25 == 0:
        clear_output()

Previously downloaded: 11134
[11135] Getting: Ashtrayhead
[11136] Getting: Pulkas
[11137] Getting: Sisters Of Glory
[11138] Getting: The black Crowes
[11139] Getting: Wildside
[11140] Getting: Wizardthrone


## Get A Single Link Per Artist

In [8]:
# Re-read the cache after the download pass so that only artists whose link is
# present in url_cache are kept.
downloaded_urls = set(pd.read_sql("""
    select url
    from url_cache
""", db).url)

# errors='ignore' keeps this cell re-runnable: on a second run `artists` no
# longer has the helper column and a plain drop() would raise KeyError.
artists = artists[artists.ArtistLink.isin(downloaded_urls)].drop(columns='PrevDownloaded', errors='ignore')

In [9]:
# Keep exactly one row per artist: after ordering by (Artist, ArtistLink) the
# first row of each artist, i.e. its lowest-sorting link, is retained.
artists = (
    artists
    .sort_values(by=['Artist', 'ArtistLink'])
    .drop_duplicates(subset='Artist')
)

## Process Artist HTML and Extract Text

In [None]:
# Attach the text extracted from each artist's cached page.
# NOTE(review): this assumes get_all_url_text yields exactly one item per input
# URL, in input order; otherwise texts end up on the wrong rows (or the
# assignment fails on a length mismatch). Confirm against the helper.
artists['ArtistDetailText'] = list(url_cache.get_all_url_text(artists.ArtistLink))

## Save Artist Info

In [11]:
# Spot-check five random rows (unseeded, so the rows shown differ on each run).
artists.sample(5)

Unnamed: 0_level_0,Artist,ArtistLink,ArtistDetailText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9873,Y&T,https://en.wikipedia.org/wiki/Y%26T,\n \n \n \n Y&T - Wikipedia \n \n \n \n \n \n ...
2693,A.C.E,https://en.wikipedia.org/wiki/A.C.E_(South_Kor...,\n \n \n \n A.C.E (South Korean band) - Wikipe...
12954,The Pipettes,https://en.wikipedia.org/wiki/The_Pipettes,\n \n \n \n The Pipettes - Wikipedia \n \n \n ...
20430,The Stereo Bus,https://en.wikipedia.org/wiki/The_Stereo_Bus,\n \n \n \n The Stereo Bus - Wikipedia \n \n \...
15031,Darius,https://en.wikipedia.org/wiki/Darius_Danesh,\n \n \n \n Darius Campbell - Wikipedia \n \n ...


In [12]:
# Persist the artist table, replacing any previous copy; the DataFrame index is
# not written.
artists.to_sql(name='artist', con=db, if_exists='replace', index=False)

In [13]:
# Enforce one row per artist name at the database level.
# "if not exists" lets this cell be re-run on its own without failing with
# "index i_artist_artist already exists"; try/finally guarantees the cursor is
# closed even when index creation fails (e.g. on duplicate Artist values).
cur = db.cursor()
try:
    cur.execute("create unique index if not exists i_artist_artist on artist (Artist)")
    db.commit()
finally:
    cur.close()