In [1]:
'''This script drives the data acquisition and storage'''
import time
import requests
import json
import urllib2
import echonestkeys as secret
from fuzzywuzzy import fuzz, process
import psycopg2
from scraper import artists_wikipedia
from scraper import putumayo_artists
url = 'https://www.putumayo.com/african-blues-9-7-15-9-13-15/'
artists = putumayo_artists(url, 5)
artists.keys()

def artists(y1, y2, pgs):
    #y1 - first year to scrape wikipedia
    #y2 - second year
    #pgs - putumayo pages
    
    artists = []
    
    for i in range(y1, y2+1, 1):
        artists = artists + artists_wikipedia(i)

    url = 'https://www.putumayo.com/african-blues-9-7-15-9-13-15/'
    artists = artists + putumayo_artists(url, pgs).keys()
    return set(artists)

echo_limit = 0
def rate_limiter():
    global echo_limit
    echo_limit += 1
    if echo_limit%355 == 0:
        time.sleep(63)
        
def rate_reset():
    global echo_limit
    echo_limit = 0

api_keys = secret.apikeys()

def api_generator():
    key = api_keys.pop(0)
    api_keys.append(key)
    time.sleep(1)
    return key


def get_json(url):
    response = requests.get(url)
    if response.status_code==200:
        return json.loads(response.text)
#     print 'Response code not 200 for %s' % url
    return None

def fuzzy_match(a, names):
    lower = unicode.lower
    strlower = str.lower
    if a == names[0]:
        return 0
    try:
        if strlower(a) == lower(names[0]):
            return 0
    except:
        if lower(a) == lower(names[0]):
            return 0
    best = max([(fuzz.ratio(a, n),n) for n in names])
    if best[0]>0.7:
        return names.index(best[1])
    else:
        return -1

def get_sp_artist(query):
    q = '+'.join(query.split()).strip()
    url = "https://api.spotify.com/v1/search?q=%s&type=artist" % q
    results = get_json(url)['artists']['items']
    if results != []:
        names = [r['name'] for  r in results]
        index = fuzzy_match(query, names)
        if index != -1:
            return results[index]
    return None


def get_ec_track(artistId, track):
    key = api_generator()
    url = 'http://developer.echonest.com/api/v4/song/search?api_key=%s&format=json&results=1\
&artist_id=spotify:artist:%s&title=%s&bucket=audio_summary&bucket=artist_location' % (key, artistId , track)
    rate_limiter()
    response = get_json(url)
    if response:
        response = response['response']
        status = response['status']['code']
        if status == 0:
            songs = response['songs']
            if songs!=[]:
                return songs[0]
    return None

def get_sp_topTracks(sp_artistId):
    url = 'https://api.spotify.com/v1/artists/%s/top-tracks?country=US' % sp_artistId
    response = get_json(url)
    if response:
        tracks = response['tracks']
        if tracks:
            return tracks
    return None


# create table artists(name text, sp_id text, ec_id text, genres text[], popularity integer, images text, latitude double precision, longitude double precision, location text);
# create table tracks(name text, sp_id text, ec_id text, duration integer, preview text, explicit bool);
# create table audiosummary(sp_trackId text, time_signature integer, energy double precision, liveness double precision, tempo double precision, speechiness double precision, acousticness double precision,
#                           danceability double precision, instrumentalness double precision, key integer, loudness double precision, valence double precision, mode integer);
import cPickle
az = cPickle.load(file('artistrip','rb'))


In [2]:
conn = psycopg2.connect("dbname='worldmusicexplorer' user='home'")
cur = conn.cursor()
rate_reset()

for a in az:
    artist = get_sp_artist(a)
    if artist:
        sp_artistId = artist['id']
        sp_artistName = artist['name']
        sp_artistGenres = artist['genres']
        sp_artistPopularity = artist['popularity']
        try:
            sp_artistImageLinks = artist['images'][0]['url']
        except:
            sp_artistImageLinks = None
        tracks = get_sp_topTracks(sp_artistId)
        if tracks:
            if len(tracks) >= 1:
                track = tracks[0]
                sp_trackId = track['id']
                sp_trackName = track['name']
                sp_trackPreview = track['preview_url']
                sp_trackExplicit = track['explicit']
                sp_trackDuration = track['duration_ms']
                q_trackName = '%20'.join(sp_trackName.split(' '))
                ec_track = get_ec_track(sp_artistId, q_trackName)
                if ec_track:
                    ec_trackId = ec_track['id']
                    
                    audsum = ec_track['audio_summary']
                    
                    timesig = audsum['time_signature']
                    energy = audsum['energy']
                    liveness = audsum['liveness']
                    tempo = audsum['tempo']
                    speechiness = audsum['speechiness']
                    acousticness = audsum['acousticness']
                    danceability = audsum['danceability']
                    instrumentalness = audsum['instrumentalness']
                    key = audsum['key']
                    loudness = audsum['loudness']
                    valence = audsum['valence']
                    mode = audsum['mode']
                    
                    ec_artistId = ec_track['artist_id']
                    
                    ec_artistLocation = ec_track['artist_location']
                    if ec_artistLocation != {}:
                        print '%s has made it' % sp_artistName
                        latitude = ec_artistLocation['latitude']
                        location = ec_artistLocation['location']
                        longitude = ec_artistLocation['longitude']
                        
                        SQL = """insert into artists(name, sp_id, ec_id, genres, popularity, images, 
                        latitude, longitude, location) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"""
                        data = (sp_artistName, sp_artistId, ec_artistId, sp_artistGenres, sp_artistPopularity, sp_artistImageLinks,
                        latitude, longitude, location)
                        cur.execute(SQL, data) 
                        cur.execute("COMMIT")
        
                        SQL = "insert into tracks(name, sp_id, ec_id, duration, preview, explicit) values (%s, %s, %s, %s, %s, %s);"
                        data = (sp_trackName, sp_trackId, ec_trackId, sp_trackDuration, sp_trackPreview, sp_trackExplicit)
                        cur.execute(SQL, data)
                        cur.execute("COMMIT")
                        
                        SQL = """insert into audiosummary(sp_trackId, time_signature, energy, liveness, tempo, speechiness, acousticness,
                        danceability, instrumentalness, key, loudness, valence, mode) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 
                        %s, %s)"""
                        data = (sp_trackId, timesig,energy,liveness,tempo,speechiness,acousticness,danceability,
                        instrumentalness,key,loudness,valence,mode)
                        cur.execute(SQL, data)
                        cur.execute("COMMIT")
  
                

Alexandra Burke has made it
Rocket Juice & The Moon has made it
Eliza Doolittle has made it
Queens of the Stone Age has made it
Dierks Bentley has made it
Deerhunter has made it
Prong has made it
Dan Croll has made it
Architects has made it
K'NAAN has made it
Iamamiwhoami has made it
Mystery has made it
Goldfrapp has made it
Bullet For My Valentine has made it
Big Time Rush has made it
Jukebox The Ghost has made it
Chris Tomlin has made it
Sleaford Mods has made it
Swans has made it
a-ha has made it
Burzum has made it
Gary Allan has made it
The Crystal Method has made it
Spawn of possession has made it
Delorean has made it
Red Hot Chili Peppers has made it
Mala Rodríguez has made it
Zomboy has made it
MercyMe has made it
Sons Of Magdalene has made it
deadmau5 has made it
The Joy Formidable has made it
For The Fallen Dreams has made it
Forever The Sickest Kids has made it
D.R.I. has made it
Finger Eleven has made it
Airbourne has made it
PUP has made it
Brody Dalle has made it
Emilíana 

KeyError: 'audio_summary'

In [9]:
z = pd.Series.from_csv('world-country-names.tsv', sep='\t')
set_countries = list(set([c for c in countries if c != 'United States']))
new_countries = []
new_ids = []
for c in set_countries:
    if c == 'South Korea':
        c = 'Korea, Republic of'
    if c == 'Russia':
        c = 'Russian Federation'
    if c in z.values:
        new_countries.append(c)
        new_ids.append(z[z==c].index[0])

pd.Series(new_countries, index=new_ids)


In [10]:
cur.execute('Select latitude, longitude from artists')
rows = cur.fetchall()
# import googlemapsapi as goog
# mapkey = goog.apikey()

def get_country(lat, lon):
    url = 'https://maps.googleapis.com/maps/api/geocode/json?latlng=%s,%s&key=%s' % (lat, lon, mapkey)
    obj = json.loads(requests.get(url).text)
    if obj['status'] == 'OK':
        result = obj['results'][0]
        components = result['address_components']
        country = [c['long_name'] for c in components if c['types'][0] == 'country'][0]
        return country
    else:
        return None

    
countries = []
for r in rows:
    lat = r[0]
    lon = r[1]
    countries.append(get_country(lat, lon))
set_countries = set([c for c in countries if c != 'United States'])

(51.5072648, -0.1278328)
(51.5063, -0.12714)
(51.5063, -0.12714)
(33.76437, -116.339766)
(33.401395, -111.931298)
(33.7483, -84.3911)
(40.7146, -74.0071)
(53.0162014, -2.1812607)
(50.8289, -0.13414)
(2.04117, 45.3441)
(59.3333333, 18.05)
(35.2196, -80.0195)
(51.5072648, -0.1278328)
(51.505833, -3.577222)
(33.973951, -118.248405)
(38.8991, -77.029)
(32.661269, -95.77607)
(52.9533834, -1.1487384)
(40.7146, -74.0071)
(59.9123, 10.75)
(60.391111, 5.324722)
(33.9071, -118.012)
(33.973951, -118.248405)
(56.6661, 16.3637)
(43.2848, -2.17764)
(33.973951, -118.248405)
(27.75, -18.0)
(50.1213173, -5.5329266)
(33.189011, -96.10887)
(41.8842, -87.6324)
(43.6486, -79.3853)
(53.3418732, -1.2789459)
(42.732535, -84.555535)
(32.492683, -97.264359)
(53.7974185, -1.5437941)
(29.7605, -95.3698)
(43.3262, -79.7985)
(-38.3843, 142.483)
(41.5524, -81.4645)
(-37.7984, 144.979)
(64.1101, -21.8929)
(40.694481, -73.989319)
(35.187133, -117.885359)
(33.7483, -84.3911)
(41.850033, -87.650052)
(45.507856, -122.690

In [252]:
# import numpy as np
cur.execute('Select * from audiosummary')
audsum = cur.fetchall()
audiomatrix = list(audsum[0][1:])

for i in range(len(audsum)):
    if any(iter(a is None for a in audsum[i])) == False:
        audiomatrix = np.c_[audiomatrix, list(audsum[i][1:])]
    else:
        print i

audiomatrix = audiomatrix.T[1:]

676
684


In [294]:
from sklearn.cluster import KMeans
# clusters = KMeans.fit_predict(k = 10,audiomatrix)
model = KMeans(n_clusters=24).fit(audiomatrix)
labels = model.labels_

values = model.cluster_centers_

In [265]:
countries_order = countries[0:676]+countries[677:684]+countries[685:]

In [267]:
def cluster_number(n):
    cluster = []
    for i in range(len(labels)):
        if labels[i] == n:
            cluster.append(countries_order[i])
    return cluster

In [296]:
for i in range(24):
    print len(cluster_number(i))

32
50
30
54
43
33
55
31
9
19
47
38
20
27
2
23
29
33
26
36
48
7
34
10


In [303]:
cur.execute('Select genres from artists')
genres = cur.fetchall()
genres_order = genres[0:676]+genres[677:684]+genres[685:]

In [301]:
def cluster_names(n):
    cluster = []
    for i in range(len(labels)):
        if labels[i] == n:
            tup = (artists_order[i], genres[i], countries_order[i])
            cluster.append(tup)
    return cluster

In [311]:
cluster_names(3)

[(('Jukebox The Ghost',), ([],), u'United States'),
 (('Burzum',), (['black metal', 'pagan black metal'],), u'Norway'),
 (('Sons Of Magdalene',), ([],), u'United States'),
 (('Sirenia',),
  (['gothic metal', 'gothic symphonic metal', 'symphonic metal'],),
  u'Norway'),
 (('The Band Perry',), ([],), u'United States'),
 (('The Birthday Massacre',), ([],), u'Canada'),
 (('Ashley Monroe',), (['country dawn'],), u'United States'),
 (('LP',), ([],), u'United States'),
 (('Hildegunn \xc3\x98iseth',), ([],), u'Norway'),
 (("Jamie's Elsewhere",), (['screamo'],), u'United States'),
 (('Oceans Ate Alaska',), ([],), u'United Kingdom'),
 (('Dionne Warwick',), ([],), u'United States'),
 (('Mayer Hawthorne',), ([],), u'United States'),
 (('Suicidal Tendencies',), (['crossover thrash'],), u'United States'),
 (('Anti-Flag',),
  (['melodic hardcore', 'punk', 'skate punk'],),
  u'United States'),
 (('The Neighbourhood',), (['shimmer pop'],), u'United States'),
 (('French Montana',), ([],), u'United State

In [340]:
ages = [98,34,12]
addresses = ['main','finderne','rocky']
ppl = ['jamie', 'rodger', 'camila']

In [341]:
dic = {}
for i in range(len(ppl)):
    name = ppl[i]
    age = ages[i]
    add = addresses[i]
    agedict = {"age", age}
    adddict = {"add", add}
    dic[name]=(agedict,adddict)
    