In [1]:

import psycopg2
import echonestkeys as secret
import json
import urllib2
import time 
import requests

import pandas as pd
import numpy as np
from collections import Counter

from fuzzywuzzy import fuzz, process
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from scraper import country_adjectives
from scraper import putumayo_artists
from scraper import artists_wikipedia


In [371]:
# US state / territory identifiers that are later aliased to the
# 'United States' entry of country_names.  The two lists are index-aligned
# (state_abbreviations[i] is the code for state_full_names[i]).
# NOTE(review): u'NA' / u'National' are kept exactly as in the original data;
# confirm they are really meant to resolve to the United States.
state_abbreviations = [
    u'WA', u'WI', u'WV', u'FL', u'WY', u'NH', u'NJ', u'NM', u'NA', u'NC',
    u'ND', u'NE', u'NY', u'RI', u'NV', u'GU', u'CO', u'CA', u'GA', u'CT',
    u'OK', u'OH', u'KS', u'SC', u'KY', u'OR', u'SD', u'DE', u'DC', u'HI',
    u'PR', u'TX', u'LA', u'TN', u'PA', u'VA', u'VI', u'AK', u'AL', u'AS',
    u'AR', u'VT', u'IL', u'IN', u'IA', u'AZ', u'ID', u'ME', u'MD', u'MA',
    u'UT', u'MO', u'MN', u'MI', u'MT', u'MP', u'MS',
]

state_full_names = [
    u'Washington', u'Wisconsin', u'West Virginia', u'Florida', u'Wyoming',
    u'New Hampshire', u'New Jersey', u'New Mexico', u'National',
    u'North Carolina', u'North Dakota', u'Nebraska', u'New York',
    u'Rhode Island', u'Nevada', u'Guam', u'Colorado', u'California',
    u'Georgia', u'Connecticut', u'Oklahoma', u'Ohio', u'Kansas',
    u'South Carolina', u'Kentucky', u'Oregon', u'South Dakota', u'Delaware',
    u'District of Columbia', u'Hawaii', u'Puerto Rico', u'Texas',
    u'Louisiana', u'Tennessee', u'Pennsylvania', u'Virginia',
    u'Virgin Islands', u'Alaska', u'Alabama', u'American Samoa', u'Arkansas',
    u'Vermont', u'Illinois', u'Indiana', u'Iowa', u'Arizona', u'Idaho',
    u'Maine', u'Maryland', u'Massachusetts', u'Utah', u'Missouri',
    u'Minnesota', u'Michigan', u'Montana', u'Northern Mariana Islands',
    u'Mississippi',
]

# Codes first, then full names -- same contents and order as before.
states = state_abbreviations + state_full_names

# Download the country catalogue and build `country_names`, a lookup from
# every known spelling of a country (official name, alternative spellings,
# native name, demonym) to a `(name, latlng)` tuple.
countries_response = requests.get("https://restcountries.eu/rest/v1/all")
# Fail loudly on an HTTP error instead of trying to parse an error page.
countries_response.raise_for_status()
all_countries = json.loads(countries_response.text)

country_names = {}
for country in all_countries:
    name = country['name']
    latlng = country['latlng']
    demonym = country['demonym']
    # Entries without coordinates or without a demonym are skipped entirely.
    if latlng == [] or demonym == '':
        continue

    # Drop empty alternative spellings, then add the remaining identifiers.
    names = [spelling for spelling in country['altSpellings'] if spelling]
    names.extend([name, country['nativeName'], demonym])
    for n in names:
        # First writer wins: a spelling already claimed by an earlier country
        # is left untouched.  Membership is tested on the dict itself (hash
        # lookup) rather than on a freshly built key list.
        if n not in country_names:
            country_names[n] = (name, latlng)

# Every US state / territory identifier resolves to the United States entry.
# NOTE(review): unlike the loop above, this assignment is unconditional, so a
# state identifier that is also a country spelling replaces that country's
# entry -- confirm this override is intended.
for s in states:
    country_names[s] = country_names['United States']


# Merge the scraped adjectives into country_names.
# Each key of `adjectives` is a country label and each value is iterated as a
# collection of adjective strings (see the inner loop).
adjectives = country_adjectives()

for adj in adjectives:
    pieces = adj.split(', ')
    # A label of the form "X, Y" is rearranged to "Y X" before the lookup.
    offname = pieces[1] + ' ' + pieces[0] if len(pieces) == 2 else adj
    if offname not in country_names:
        continue
    resolved = country_names[offname]
    for val in adjectives[adj]:
        adjective = val.strip()
        # Existing spellings are never overwritten.
        if adjective not in country_names:
            country_names[adjective] = resolved

# Hand-maintained aliases: (new spelling, existing country_names key).
# Applied in order; each alias overwrites any existing entry for that spelling.
manual_aliases = [
    ('England', 'UK'),
    ('English', 'UK'),
    ('Scotland', 'UK'),
    ('Northern Ireland', 'UK'),
    ('Wales', 'UK'),
    ('Ireland', 'Republic of Ireland'),
    ('Svalbard', 'Norway'),
    ('Jan Mayen', 'Norway'),
    ('Montreal', 'Canada'),
]
for alias, existing_key in manual_aliases:
    country_names[alias] = country_names[existing_key]
# Derive shortened spellings for multi-word names and record, in `likelyNext`,
# which word sequences are prefixes of a longer known name (prefix -> words
# that may follow).
likelyNext = {}
# Iterate over a snapshot: the loop body adds new keys to country_names.
for c in list(country_names):
    words = word_tokenize(c)
    if len(words) <= 1:
        continue

    # Keep only tokens tagged as nouns, proper nouns or adjectives.
    content_words = [word for (word, tag) in pos_tag(words)
                     if tag in ('NNP', 'NNPS', 'NN', 'JJ')]
    kept = len(content_words)

    if kept == 1:
        # Look the word up in the dict itself.  The previous code consulted
        # `country_keys`, which is only assigned after this loop and therefore
        # raised NameError on a fresh kernel.
        if content_words[0] not in country_names:
            country_names[content_words[0]] = country_names[c]
    elif kept > 1:
        phrase = ' '.join(content_words)
        country_names[phrase] = country_names[c]
        likelyNext[phrase] = []
        for i in range(1, kept):
            lkey = ' '.join(content_words[:i])
            # Extend (never mutate in place) the continuation list for lkey.
            likelyNext[lkey] = likelyNext.get(lkey, []) + content_words[i:]

# Key snapshots kept for code that refers to these names.
country_keys = country_names.keys()
likely_keys = likelyNext.keys()


In [454]:
def get_json(url):
    """Issue a GET request for ``url`` and decode the body as JSON.

    Returns the decoded object when the status code is 200.  For any other
    status a one-line message naming the URL is printed and None is returned.
    """
    reply = requests.get(url)
    if reply.status_code != 200:
        print('Response code not 200 for %s' % url)
        return None
    return json.loads(reply.text)

def fuzzy_match(a, names, threshold=70):
    """Return the index in ``names`` of the entry that best matches ``a``.

    Parameters
    ----------
    a : str or unicode
        The text being searched for.
    names : list
        Candidate strings, in the caller's preferred order.
    threshold : int, optional
        Minimum ``fuzz.ratio`` score (exclusive) for a fuzzy hit.
        ``fuzz.ratio`` scores are on a 0-100 scale, so the default of 70
        corresponds to "more than 70% similar".  The previous comparison
        against 0.7 accepted virtually any candidate.

    Returns
    -------
    int
        0 when the first candidate equals ``a`` (exactly or ignoring case),
        otherwise the index of the best-scoring candidate if its score
        exceeds ``threshold``, otherwise -1.  An empty ``names`` gives -1.
    """
    if not names:
        return -1

    first = names[0]
    # Use the bound .lower() methods so both str and unicode inputs work;
    # the unbound str.lower / unicode.lower calls raised TypeError whenever
    # the argument was of the other string type.
    if a == first or a.lower() == first.lower():
        return 0

    best_score, best_name = max((fuzz.ratio(a, n), n) for n in names)
    if best_score > threshold:
        return names.index(best_name)
    return -1
    
    
def spotify_artist(query):
    """Search Spotify for an artist and return the best-matching result.

    The query words are joined with '+' and sent to the artist search
    endpoint; the returned artist names are then compared with ``query`` via
    ``fuzzy_match``.

    Returns the matching item from the search results, or None when the
    request fails, the payload has no artist items, or no name is close
    enough.
    """
    # NOTE(review): the query is not percent-encoded, so characters such as
    # '&' or '#' would change the meaning of the URL.
    q = '+'.join(query.split())
    url = "https://api.spotify.com/v1/search?q=%s&type=artist" % q

    payload = get_json(url)
    # get_json returns None on a non-200 response; do not subscript it.
    if not payload:
        return None

    results = payload.get('artists', {}).get('items', [])
    if not results:
        return None

    names = [r['name'] for r in results]
    index = fuzzy_match(query, names)
    if index == -1:
        return None
    return results[index]

    

def extract_country(text):
    """Count the countries mentioned in ``text``.

    The text is tokenized and POS-tagged; tokens tagged NNP, NNPS, NN or JJ
    are scanned left to right, greedily building multi-word queries while the
    words read so far are a known prefix in ``likelyNext``.  Every query found
    in ``country_names`` records one mention of that country.

    Returns a list of ``(country name, count)`` pairs, most frequent first
    (the result of ``Counter.most_common()``).
    """
    try:
        words = word_tokenize(text)
    except UnicodeError:
        # Only encoding problems are retried with an explicit UTF-8 decode;
        # any other failure propagates instead of being masked by a bare
        # except.
        words = word_tokenize(text.decode('utf8'))

    wanted_tags = ('NNP', 'NNPS', 'NN', 'JJ')
    pending = [word for (word, tag) in pos_tag(words) if tag in wanted_tags]
    # Reversed so that pop() yields the words in reading order.
    pending.reverse()

    countries_seen = []
    query = ''
    while pending:
        query = ' '.join([query, pending.pop()]).strip()

        # Test membership on the dicts themselves: constant-time, and always
        # in sync with the data that is indexed on the next line.
        if query in country_names:
            countries_seen.append(country_names[query])
        elif query not in likelyNext:
            parts = query.split(' ')
            if len(parts) > 1:
                # The multi-word attempt failed: give the last word another
                # chance as the start of a new query.
                pending.append(parts[-1])
            query = ''

    return Counter([name for (name, latlng) in countries_seen]).most_common()

def echo_artist(spotify_artist_id):
    """Fetch the Echo Nest artist profile (with biographies) for a Spotify id.

    Returns the ``artist`` object of the response when the request succeeds
    and the response status code is 0; otherwise returns None.
    """
    url = ('http://developer.echonest.com/api/v4/artist/profile?'
           'api_key=%s&id=spotify:artist:%s&bucket=biographies'
           % (secret.apikey(), spotify_artist_id))

    payload = get_json(url)
    # get_json returns None on a non-200 response; do not subscript it.
    if not payload:
        return None

    response = payload['response']
    if response['status']['code'] == 0:
        return response['artist']
    return None


        
        
def echo_country(biographies):
    """Guess an artist's country from the opening of each biography.

    ``biographies`` is a list of dicts, each with a ``'text'`` entry.  The
    first three sentences of every biography are concatenated and handed to
    ``extract_country``.

    Returns the result of ``extract_country``, or an empty list when
    ``biographies`` is empty or None.
    """
    if not biographies:
        return []

    openings = []
    for bio in biographies:
        body = bio['text']
        # Decode byte strings one biography at a time, so that a single
        # undecodable entry no longer switches every biography back to raw
        # bytes (the previous bare except did exactly that).
        if isinstance(body, bytes):
            try:
                body = body.decode('utf8')
            except UnicodeError:
                pass  # best effort: keep the raw text for this biography
        openings.append(' '.join(sent_tokenize(body)[:3]))

    return extract_country(' '.join(openings))

            

In [450]:
def spotify_top_tracks(spotify_id):
    """Return the Spotify top tracks (US market) for an artist id.

    Returns the non-empty ``tracks`` list from the response, or None when the
    request fails, the body is not valid JSON, or no tracks are present.
    """
    url = 'https://api.spotify.com/v1/artists/%s/top-tracks?country=US' % spotify_id
    try:
        payload = get_json(url)
    except (requests.RequestException, ValueError):
        # Network failure or undecodable body: keep the best-effort contract
        # without also swallowing KeyboardInterrupt or programming errors.
        return None

    # get_json returns None on a non-200 response.
    if not isinstance(payload, dict):
        return None
    return payload.get('tracks') or None


def echo_summary(spotify_track_id):
    """Fetch the Echo Nest track profile (audio summary) for a Spotify track.

    ``rate_limiter()`` is called before the request.  Returns the ``track``
    object of the response when the request succeeds and the response status
    code is 0; otherwise returns None.
    """
    url = ('http://developer.echonest.com/api/v4/track/profile?api_key=%s'
           '&id=spotify:track:%s&bucket=audio_summary'
           % (secret.apikey(), spotify_track_id))
    rate_limiter()

    payload = get_json(url)
    # get_json returns None on a non-200 response; do not subscript it.
    if not payload:
        return None

    response = payload['response']
    if response['status']['code'] == 0:
        return response['track']
    return None

    
def de(spotify_id):
    """Print the audio summary of the first top track Echo Nest can profile.

    The artist's Spotify top tracks are tried in order.  For the first track
    whose ``echo_summary`` lookup succeeds, the ``audio_summary`` dict is
    printed (without its ``analysis_url`` entry) and the function returns.
    Nothing is printed when there are no top tracks or no track can be
    profiled.  Always returns None.
    """
    tracks = spotify_top_tracks(spotify_id)
    if not tracks:
        return

    for track in tracks:
        profile = echo_summary(track['id'])
        if not profile:
            # The previous fallback here referenced an undefined `echo_id`
            # (NameError), joined the title character by character, and built
            # a song/search URL that was never requested.
            # TODO: implement a song/search fallback by artist and title.
            continue

        aud_sum = profile['audio_summary']
        print({s: aud_sum[s] for s in aud_sum if s != 'analysis_url'})
        return

# Demo call.  spotify_artist returns None when nothing matches, so guard
# before subscripting the result.
demo_artist = spotify_artist('Miley Cyrus')
if demo_artist is not None:
    de(demo_artist['id'])


# [u'status',
#  u'song_id',
#  u'title',
#  u'foreign_ids',
#  u'artist',
#  u'foreign_release_ids',
#  u'foreign_release_id',
#  u'catalog',
#  u'release',
#  u'foreign_id',
#  u'id',
#  u'audio_summary']


{u'time_signature': 4, u'energy': 0.6220087746234382, u'liveness': 0.3700262537298555, u'tempo': 80.003, u'speechiness': 0.03344592280777209, u'acousticness': 0.008816917600021911, u'danceability': 0.6125644326649705, u'instrumentalness': 8.22981909495782e-09, u'key': 1, u'duration': 231.24, u'loudness': -5.794, u'valence': 0.47220227365181217, u'mode': 0}


In [431]:
# Number of rate-limited calls made since the last reset.
echo_limit = 0

# Pause for ECHO_PAUSE_SECONDS on every ECHO_CALLS_PER_PAUSE-th call.
ECHO_CALLS_PER_PAUSE = 119
ECHO_PAUSE_SECONDS = 63


def rate_limiter():
    """Count one call and sleep whenever the counter reaches a multiple of 119."""
    global echo_limit
    echo_limit = echo_limit + 1
    quota_reached = (echo_limit % ECHO_CALLS_PER_PAUSE == 0)
    if quota_reached:
        time.sleep(ECHO_PAUSE_SECONDS)


def rate_reset():
    """Set the call counter back to zero."""
    global echo_limit
    echo_limit = 0



# For the first 100 artists of the 2013 list: resolve the artist on Spotify,
# fetch the Echo Nest profile, and guess the country from the biographies.
# The previous version of this cell did not parse (an `if` with no body),
# used an undefined `spotify_artist_id`, passed the Spotify id to
# echo_country (which expects biographies), and indexed echo['artist'] even
# though echo_artist already returns the artist record.
artists2013 = artists_wikipedia(2013)
for artist in artists2013[:100]:
    a = spotify_artist(artist)
    if a is None:
        continue

    spotify_id = a['id']
    spotify_name = a['name']
    spotify_genres = a['genres']
    spotify_popularity = a['popularity']

    # One Echo Nest request follows; count it against the rate limit.
    rate_limiter()
    echo = echo_artist(spotify_id)
    if not echo:
        continue

    biographies = echo.get('biographies', [])
    echo_artist_id = echo['id']
    echo_artist_name = echo['name']

    country = echo_country(biographies)
    # TODO(review): the original cell stopped at `if country != []:` with no
    # body; decide what to do with a non-empty `country` result.
    

In [457]:
# echo_country(spotify_artist('Bitter : Sweet')['id'])

# url = 'https://www.putumayo.com/african-blues-9-7-15-9-13-15/'
# artistsP = putumayo_artists(url, 5)

In [456]:
# Scratch check: look up the Spotify id for a sample artist.  As the last
# expression of the cell, its value is what the notebook displays.
# Raises TypeError if spotify_artist returns None.
spotify_artist('Miley Cyrus')['id']


u'5YGY8feqx7naU7z4HrwZM6'

In [None]:
# Scratch: Echo Nest track-profile URL requesting both the audio_summary and
# artist_location buckets.
# NOTE(review): spotify_track_id is not assigned at module level in the
# visible cells -- confirm it is defined before running this cell.
url = 'http://developer.echonest.com/api/v4/track/profile?api_key=%s\
&id=spotify:track:%s&bucket=audio_summary&bucket=artist_location' % (secret.apikey(), spotify_track_id)