Topic: Project 5  
Subject: Scraping Hip-hop artists from Wikipedia  
Date: 12/12/2017  
Name: Zach Heick  

**Summary**: Scraped hip-hop artists from Wikipedia and stored data into database.

In [8]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import os
from pprint import pprint
import re
from collections import defaultdict

# Scrape Hip-Hop Artists from Wikipedia

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_hip_hop_musicians'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page,"html5lib")

In [3]:
hh_artists = []
for item in soup.find_all(class_='div-col columns column-width'):
    for li in item.find_all('li'):
        text = re.sub('\[\w*\]', '', li.text)
        hh_artists.append(text)

In [4]:
client_credentials_manager = SpotifyClientCredentials(client_id=os.environ['SPOTIFY_CLIENT_ID'],
                                                     client_secret=os.environ['SPOTIFY_CLIENT_SECRET'])

spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [5]:
def get_artist_data(name):
    """
    Get artist IDs from Spotify API
    :param name: name of the artist
    :return: artist name, ID, and popularity score
    """
    artist_data = {}

    results = spotify.search(q='artist:' + name, type='artist')
    
    if len(results['artists']['items']) > 0:
        top_result = results['artists']['items'][0]
        artist_data['name'] = name
        artist_data['id'] = top_result['id']
        artist_data['popularity'] = top_result['popularity']
        return artist_data
    else:
        return None  

In [6]:
artist_data = []
for name in hh_artists:
    data = get_artist_data(name)
    artist_data.append(data)

retrying ...1secs


In [7]:
ids = []
for d in artist_data:
    if d == None:
        ids.append('-1')
    else:
        ids.append(d['id'])

# Removing Duplicate Artists

In [9]:
dups = defaultdict(list)
for i, e in enumerate(ids):
    dups[e].append(i)
    
print('Duplicates\n')
    
for k, v in dups.items():
    if len(v) >= 2:
        print(k,v)
        print(hh_artists[v[0]])
        print(hh_artists[v[1]])
        print('---------------')

Duplicates

1URnnhqYAYcrqrcwql10ft [6, 943]
21 Savage
Savage
---------------
6gc6oo3u2f7SqTd4mhe81O [30, 488]
Ahmad
Jamal
---------------
-1 [134, 157, 174, 223, 281, 382, 406, 518, 524, 549, 602, 667, 775, 792, 814, 1005, 1032, 1082, 1090, 1091, 1106, 1118, 1144, 1155]
Boi-1da
Busy Bee Starski
---------------
7pqZVedlsryCmpMGf9L9zV [152, 383]
Bumpy Knuckles
Freddie Foxxx
---------------
3BG9cpTxZQzQChcsp7ss9a [167, 754]
Capone
Mr. Capone-E
---------------
5NjUjtjtkzZqRuWFdspT1K [200, 568]
Chip
King Chip
---------------
7c0XG5cIJTrrAgEC3ULPiq [309, 1114]
Dolla
Ty Dolla Sign
---------------
6f4XkbvYlXMH0QgVRzW0sM [368, 1139]
FLAME
Waka Flocka Flame
---------------
7LnaAXbDVIL75IVPnndf7w [376, 490]
Foxx
Jamie Foxx
---------------
4Q5sPmM8j4SpMqL4UA1DtS [618, 638]
Lil Flip
Lil' Flip
---------------
4LLpKhyESsyAXpc4laK94U [671, 675]
Mac
Mac Miller
---------------
5lHRUCqkQZCIWeX7xG4sYT [858, 899]
Quan
Rich Homie Quan
---------------
0bfX8pF8kuHNCs57Ms4jZb [918, 919]
Roscoe
Roscoe Dash
----

In [10]:
duplicates = [943,488,152,383,167,200,309,368,376,638,671,858,918,1125]
artist_data = np.delete(artist_data, duplicates)

In [11]:
len(artist_data)

1192

# Store Data

In [35]:
from sqlalchemy import create_engine, MetaData, Table
engine_name = 'postgresql://' + str(os.environ['zU']) + ':' + str(os.environ['zP']) + '@13.59.54.149:5432/project_kojak'
engine = create_engine(engine_name)

m = MetaData()
m.reflect(engine)

In [36]:
conn = engine.connect()
conn.execute(m.tables['artists'].insert(), [d for d in artist_data if d is not None])

<sqlalchemy.engine.result.ResultProxy at 0x113bedb38>