# Wikipedia Artist Detail Download

In [2]:
%run ./wikipedia_setup.ipynb

## Configure DB and Load Album Extract

In [3]:
# Open the SQLite database that backs both the URL cache and the album/artist tables.
# sqlite3.connect() is the documented way to obtain a Connection; the path is
# relative to the notebook's working directory.
db = sqlite3.connect("var/data/music.db")

In [4]:
# Cache object used further down via url_cache.get(url).
# NOTE(review): URLContentCacheSqlite is not defined in this notebook - presumably
# it comes from wikipedia_setup.ipynb; confirm.
url_cache = URLContentCacheSqlite(db)

# Load the album extract; later cells read its 'Links' and 'Artist' columns.
# NOTE(review): WikipediaAlbumYearListsDB is also assumed to come from the setup notebook.
wayl_db = WikipediaAlbumYearListsDB(db)
albums = wayl_db.get_all()

### Get Artist Links

In [5]:
def get_artist_link(links):
    """Return the href of the first 'Artist' link, or None when there is none.

    Args:
        links: Mapping of link category to a sequence of link dicts (each with
            an 'href' key), or None.

    Returns:
        The 'href' of the first entry under 'Artist'; None if ``links`` is
        None, has no 'Artist' key, or the 'Artist' entry is empty.
    """
    if links is None or 'Artist' not in links:
        return None

    artist_links = links['Artist']
    # An empty (or missing) list must not abort the whole .apply() with IndexError.
    if not artist_links:
        return None

    return artist_links[0]['href']
    
# Derive one artist URL (or None) per album row from its Links value.
albums['ArtistLink'] = albums['Links'].apply(get_artist_link)

In [6]:
# One row per artist name: keep albums that have an artist link, take the first
# row seen for each artist, and retain only the name and link columns.
artists = (
    albums[albums['ArtistLink'].notnull()]
    .drop_duplicates(subset='Artist')
    [['Artist', 'ArtistLink']]
    .copy()
    .sort_values(by='Artist')
)

In [7]:
# Look up which URLs are already stored in the url_cache table so that only
# the missing artist pages are requested below.
downloaded_urls = set(pd.read_sql("""
    select url
    from url_cache
""", db).url)

artists['PrevDownloaded'] = artists.ArtistLink.isin(downloaded_urls)

# Running total of cached pages; starts at the number found in the cache.
d_count = artists['PrevDownloaded'].sum()
print(f"Previously downloaded: {d_count}")

# Request every page that is not cached yet, one artist at a time.
for artist_row in artists[~artists.PrevDownloaded].itertuples(index=False):
    print(f"[{d_count + 1}] Getting: {artist_row.Artist}")
    url_cache.get(artist_row.ArtistLink)
    d_count += 1
    if d_count % 25 == 0:
        # Wipe the cell output every 25 pages so the progress log stays short.
        clear_output()

Previously downloaded: 11134
[11135] Getting: Ashtrayhead
[11136] Getting: Pulkas
[11137] Getting: Sisters Of Glory
[11138] Getting: The black Crowes
[11139] Getting: Wildside
[11140] Getting: Wizardthrone


## Get A Single Link Per Artist

In [8]:
# Re-read the cached URLs so pages fetched by the download cell are included.
downloaded_urls = set(pd.read_sql("""
    select url
    from url_cache
""", db).url)

# Keep only artists whose page is in the cache and drop the helper column.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
artists = artists[artists.ArtistLink.isin(downloaded_urls)].drop(columns='PrevDownloaded', errors='ignore')

In [9]:
# Order by artist and link, then keep the first row for each artist name.
artists = (
    artists
    .sort_values(by=['Artist', 'ArtistLink'])
    .drop_duplicates(subset='Artist')
)

## Process Artist HTML and Extract Text

In [10]:
def get_all_url_text(urls, parser=None):
    """Return the text content of the page behind each URL in ``urls``.

    Every URL is looked up through the notebook-level ``url_cache`` and its
    'content' is parsed with BeautifulSoup. A progress line is printed after
    every 100 processed URLs.

    Args:
        urls: Object exposing ``.apply`` (a pandas Series in this notebook)
            holding the URLs to process.
        parser: Optional parser name passed to BeautifulSoup as ``features``
            (e.g. 'html.parser'). The default None keeps BeautifulSoup's own
            parser selection, which is the previous behaviour.

    Returns:
        The result of ``urls.apply``: one entry per URL holding the page text,
        or None when the cache returned nothing or fetching/parsing failed.
    """
    p_count = 0

    def get_url_text(url):
        """Fetch and parse one URL; never raises, returns None on failure."""
        nonlocal p_count
        try:
            # The lookup is inside the try so that a failing fetch is reported
            # as a warning for this URL instead of aborting the whole run.
            h = url_cache.get(url)
            if h is None:
                return None
            h_doc = BeautifulSoup(h['content'], features=parser)
            # It is important to use getText with a separator, otherwise the
            # text of adjacent elements is glued together.
            return h_doc.getText(' ')
        except Exception as e:
            print(f"[WARN] Couldn't download {url} - {e}")
            return None
        finally:
            # Count every URL, including failures, so the progress is accurate.
            p_count += 1
            if (p_count % 100) == 0:
                print(f"[{dt.datetime.now()}] Processed {p_count} urls.")

    return urls.apply(get_url_text)

# Attach the extracted page text for each artist link as a new column.
artists['ArtistDetailText'] = get_all_url_text(artists['ArtistLink'])

[2021-10-30 11:51:04.564708] Processed 100 urls.
[2021-10-30 11:51:31.855810] Processed 200 urls.
[2021-10-30 11:51:57.647254] Processed 300 urls.
[2021-10-30 11:52:24.654919] Processed 400 urls.
[2021-10-30 11:52:49.295930] Processed 500 urls.
[2021-10-30 11:53:15.567072] Processed 600 urls.
[2021-10-30 11:53:42.645021] Processed 700 urls.
[2021-10-30 11:54:12.755713] Processed 800 urls.
[2021-10-30 11:54:40.445847] Processed 900 urls.
[2021-10-30 11:55:06.473634] Processed 1000 urls.
[2021-10-30 11:55:39.633697] Processed 1100 urls.
[2021-10-30 11:56:07.211471] Processed 1200 urls.
[2021-10-30 11:56:37.846147] Processed 1300 urls.
[2021-10-30 12:20:35.472537] Processed 1400 urls.
[2021-10-30 12:21:09.554079] Processed 1500 urls.
[2021-10-30 12:21:40.375221] Processed 1600 urls.
[2021-10-30 12:22:03.916732] Processed 1700 urls.
[2021-10-30 12:22:32.081748] Processed 1800 urls.
[2021-10-30 12:23:02.112976] Processed 1900 urls.
[2021-10-30 12:23:31.651148] Processed 2000 urls.
[2021-10-

## Save Artist Info

In [11]:
# Preview five randomly chosen rows as a sanity check before saving.
artists.sample(n=5)

Unnamed: 0_level_0,Artist,ArtistLink,ArtistDetailText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9873,Y&T,https://en.wikipedia.org/wiki/Y%26T,\n \n \n \n Y&T - Wikipedia \n \n \n \n \n \n ...
2693,A.C.E,https://en.wikipedia.org/wiki/A.C.E_(South_Kor...,\n \n \n \n A.C.E (South Korean band) - Wikipe...
12954,The Pipettes,https://en.wikipedia.org/wiki/The_Pipettes,\n \n \n \n The Pipettes - Wikipedia \n \n \n ...
20430,The Stereo Bus,https://en.wikipedia.org/wiki/The_Stereo_Bus,\n \n \n \n The Stereo Bus - Wikipedia \n \n \...
15031,Darius,https://en.wikipedia.org/wiki/Darius_Danesh,\n \n \n \n Darius Campbell - Wikipedia \n \n ...


In [12]:
# Write the frame to the 'artist' table, replacing any existing table and
# leaving the DataFrame index out of the stored columns.
artists.to_sql(name='artist', con=db, if_exists='replace', index=False)

In [13]:
# Add a unique index on the artist name.
# "if not exists" keeps this cell re-runnable without first rewriting the table,
# and try/finally makes sure the cursor is closed even if the statement fails.
cur = db.cursor()
try:
    cur.execute("create unique index if not exists i_artist_artist on artist (Artist)")
    db.commit()
finally:
    cur.close()