In [5]:
import matplotlib as plt
%matplotlib inline
import pandas as pd
import matplotlib.patches as mpatches
import plotly.express as px
from geotext import GeoText
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import requests
from bs4 import BeautifulSoup
import os
import re

In [6]:
#create a file for the output of gathered lyrics
open('output.txt', 'w')

<_io.TextIOWrapper name='output2.txt' mode='w' encoding='cp1252'>

In [8]:
#follow below steps from tutorial at 
# https://medium.com/analytics-vidhya/how-to-scrape-song-lyrics-a-gentle-python-tutorial-5b1d4ab351d2
GENIUS_API_TOKEN = (open("genius_token.txt").read())


def request_artist_info(artist_name, page):
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + GENIUS_API_TOKEN}
    search_url = base_url + '/search?per_page=10&page=' + str(page)
    data = {'q': artist_name}
    response = requests.get(search_url, data=data, headers=headers)
    return response

def request_song_url(artist_name, song_cap):
    page = 1
    songs = []
    
    while True:
        response = request_artist_info(artist_name, page)
        json = response.json()
        song_info = []
        for hit in json['response']['hits']:
            if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
                song_info.append(hit)
    
        for song in song_info:
            if (len(songs) < song_cap):
                url = song['result']['url']
                songs.append(url)
            
        if (len(songs) == song_cap):
            break
        else:
            page += 1
        
    print('Found {} songs by {}'.format(len(songs), artist_name))
    return songs
    
request_song_url('Lana Del Rey', 300)

def scrape_song_lyrics(url):
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    lyrics = html.find('div', class_='lyrics').get_text()
    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
    lyrics = os.linesep.join([s for s in lyrics.splitlines() if s])         
    return lyrics

def write_lyrics_to_file(artist_name, song_count):
    f = open('output.txt', 'wb')
    urls = request_song_url(artist_name, song_count)
    for url in urls:
        lyrics = scrape_song_lyrics(url)
        f.write(lyrics.encode("utf8"))
    f.close()
    num_lines = sum(1 for line in open('output.txt', 'rb'))
    print('Wrote {} lines to file from {} songs'.format(num_lines, song_count))
  
write_lyrics_to_file('Lana Del Rey', 300)

Found 300 songs by Lana Del Rey
Found 300 songs by Lana Del Rey
Wrote 13446 lines to file from 300 songs


In [9]:
#open the file and read its contents
f = open('output.txt', 'r', encoding='utf-8')

content = f.read()

In [10]:
#use GeoText on the file and put together a list of cities
places = GeoText(content)
cities_from_text = places.cities

In [11]:
#create a dataframe of the cities for all countries
city_mentions = pd.DataFrame(cities_from_text, columns=['city'])

In [12]:
#notice that some of the city names don't sound legit
city_mentions['city'].value_counts()

New York         50
Hollywood        33
Brooklyn         20
Of               16
Santa Monica     13
Paris            12
Buy              11
Mustang          10
Coney Island     10
Palm             10
Roses             9
Young             8
Venice            6
Man               5
Mary              5
Sunset            5
Pontiac           5
Bradley           5
Green             4
Born              4
Las Vegas         4
Florida           4
Long Beach        4
Mon               4
Mercedes          4
Paradise          3
Miami             3
Nancy             3
Bentley           3
Boo               3
                 ..
Tijuana           1
Medellín          1
Carolina          1
Pensacola         1
York              1
Detroit           1
Kansas            1
Columbia          1
Staten Island     1
Adrian            1
Monterrey         1
London            1
Oklahoma          1
Riverside         1
Fresno            1
Mascara           1
Golden            1
Joplin            1
Laredo            1


In [13]:
#create a column for country origins
f = lambda x: GeoText(x).country_mentions

origin = city_mentions['city'].apply(f)

city_mentions['country_raw'] = origin

In [15]:
city_mentions.head()

Unnamed: 0,city,country_raw
0,Hollywood,{'US': 1}
1,Hollywood,{'US': 1}
2,Hollywood,{'US': 1}
3,Born,{'NL': 1}
4,New York,{'US': 1}


In [16]:
#create a fxn to clean the output in the country column
fn = lambda x: list(x)[0]

city_mentions['country'] = city_mentions['country_raw'].apply(fn)

In [17]:
city_mentions.head()

Unnamed: 0,city,country_raw,country
0,Hollywood,{'US': 1},US
1,Hollywood,{'US': 1},US
2,Hollywood,{'US': 1},US
3,Born,{'NL': 1},NL
4,New York,{'US': 1},US


In [18]:
# select only cities mentioned in the United States
city_mentions = city_mentions[city_mentions['country'] == 'US']

In [19]:
city_mentions.head()

Unnamed: 0,city,country_raw,country
0,Hollywood,{'US': 1},US
1,Hollywood,{'US': 1},US
2,Hollywood,{'US': 1},US
4,New York,{'US': 1},US
5,Woodstock,{'US': 1},US


In [20]:
city_mentions = city_mentions.drop(columns=['country_raw', 'country'])

In [21]:
city_mentions.head()

Unnamed: 0,city
0,Hollywood
1,Hollywood
2,Hollywood
4,New York
5,Woodstock


In [22]:
#check value counts and do some manual cleaning - I'm sure there's a better way to search the lyrics for surrounding
# words and figure this out - but given the size of the dataset, it was easier for me to do it manually
city_mentions['city'].value_counts()

New York         50
Hollywood        33
Brooklyn         20
Santa Monica     13
Paris            12
Palm             10
Coney Island     10
Mustang          10
Venice            6
Bradley           5
Pontiac           5
Sunset            5
Long Beach        4
Las Vegas         4
Green             4
Paradise          3
Miami             3
San Francisco     3
Newport           2
Eden              2
Coachella         2
Crystal           2
Woodstock         2
Mansfield         2
Kokomo            2
Fresno            1
Riverside         1
Bellevue          1
Monroe            1
Los Angeles       1
Kansas            1
Newark            1
Joplin            1
Adrian            1
York              1
Sarasota          1
Pensacola         1
Laredo            1
Golden            1
Staten Island     1
Oklahoma          1
Columbia          1
Detroit           1
Tallahassee       1
Name: city, dtype: int64

In [23]:
cities_to_remove = ['Paris', 'Mustang', 'Palm', 'Bradley', 'Sunset', 'Pontiac', 'Green', 'Paradise', 'Mansfield', 'Eden',
                   'Crystal', 'Monroe', 'Columbia', 'Laredo', 'Joplin', 'Adrian', 'York', 'Golden', 'Oklahoma', 
                    'Kansas', 'Coachella','Kokomo','Woodstock']
city_mentions = city_mentions[~city_mentions['city'].isin(cities_to_remove)]

In [24]:
city_mentions['city'].value_counts()

New York         50
Hollywood        33
Brooklyn         20
Santa Monica     13
Coney Island     10
Venice            6
Long Beach        4
Las Vegas         4
Miami             3
San Francisco     3
Newport           2
Staten Island     1
Bellevue          1
Pensacola         1
Tallahassee       1
Newark            1
Los Angeles       1
Fresno            1
Sarasota          1
Riverside         1
Detroit           1
Name: city, dtype: int64

In [25]:
#manually change data for Newport - since LDR means Newport Beach in CA and not Newport, Rhode Island and this will mess
#up the geocoder
city_mentions = city_mentions.replace(to_replace ='Newport', value ='Newport Beach')
city_mentions = city_mentions.replace(to_replace ='Venice', value ='Venice Beach')

In [26]:
city_mentions['city'].value_counts()

New York         50
Hollywood        33
Brooklyn         20
Santa Monica     13
Coney Island     10
Venice Beach      6
Long Beach        4
Las Vegas         4
San Francisco     3
Miami             3
Newport Beach     2
Newark            1
Staten Island     1
Bellevue          1
Pensacola         1
Tallahassee       1
Los Angeles       1
Fresno            1
Sarasota          1
Riverside         1
Detroit           1
Name: city, dtype: int64

In [27]:
#check value counts again
city_val_counts = city_mentions['city'].value_counts()

In [28]:
#create a list of city names to find the coordinates of
unique_list = (city_mentions['city'].unique().tolist())

In [29]:
#use geopy's geolocator to find information on all the cities in the list
chrome_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"
geolocator = Nominatim(timeout=10,user_agent=chrome_user_agent)

lat_lon = []
for city in unique_list: 
    try:
        location = geolocator.geocode(city)
        if location:
            lat_lon.append(location)
    except GeocoderTimedOut as e:
        print("Error: geocode failed on input %s with message %s"%
             (city, e))

In [30]:
#create a df for raw output from the geolocator, maybe put cleaning directly after this; or append to original df
city_data = pd.DataFrame(lat_lon, columns=['raw_data','raw_data2'])

#re-order the cols
city_data = city_data[['raw_data2', 'raw_data']]

In [31]:
city_data.head()

Unnamed: 0,raw_data2,raw_data
0,"(34.0980031, -118.3295232)","Hollywood, Los Angeles, Los Angeles County, Ca..."
1,"(40.7127281, -74.0060152)","New York, United States of America"
2,"(36.1672559, -115.1485163)","Las Vegas, Clark County, Nevada, United States..."
3,"(40.57592745, -73.99219674341339)","Coney Island, Brooklyn, Kings County, New York..."
4,"(33.97996005, -118.46877706707772)","Venice Beach, Pacific Avenue, Venice, Los Ange..."


In [32]:
#exapand entries in raw_data - we only really care about 0 so will drop the rest afterwards
city_data = city_data.join(city_data['raw_data'].str.split(',', expand=True))

In [33]:
city_data.head()

Unnamed: 0,raw_data2,raw_data,0,1,2,3,4,5,6,7
0,"(34.0980031, -118.3295232)","Hollywood, Los Angeles, Los Angeles County, Ca...",Hollywood,Los Angeles,Los Angeles County,California,90028-8127,United States of America,,
1,"(40.7127281, -74.0060152)","New York, United States of America",New York,United States of America,,,,,,
2,"(36.1672559, -115.1485163)","Las Vegas, Clark County, Nevada, United States...",Las Vegas,Clark County,Nevada,United States of America,,,,
3,"(40.57592745, -73.99219674341339)","Coney Island, Brooklyn, Kings County, New York...",Coney Island,Brooklyn,Kings County,New York,11224,United States of America,,
4,"(33.97996005, -118.46877706707772)","Venice Beach, Pacific Avenue, Venice, Los Ange...",Venice Beach,Pacific Avenue,Venice,Los Angeles,Los Angeles County,California,90292.0,United States of America


In [34]:
#change the coordinates to a string to remove the parentheses
city_data['raw_data2'] = city_data['raw_data2'].astype(str)
city_data['in_progress'] = city_data['raw_data2'].map(lambda x:x.lstrip('()'))
city_data['in_progress2'] = city_data['in_progress'].map(lambda x:x.rstrip('()'))

In [35]:
#split the coordinates using the comma as the delimiter
city_data[['lat','lon']] = city_data.in_progress2.str.split(",",expand=True,)

In [36]:
#check the output
city_data.head()

Unnamed: 0,raw_data2,raw_data,0,1,2,3,4,5,6,7,in_progress,in_progress2,lat,lon
0,"(34.0980031, -118.3295232)","Hollywood, Los Angeles, Los Angeles County, Ca...",Hollywood,Los Angeles,Los Angeles County,California,90028-8127,United States of America,,,"34.0980031, -118.3295232)","34.0980031, -118.3295232",34.0980031,-118.3295232
1,"(40.7127281, -74.0060152)","New York, United States of America",New York,United States of America,,,,,,,"40.7127281, -74.0060152)","40.7127281, -74.0060152",40.7127281,-74.0060152
2,"(36.1672559, -115.1485163)","Las Vegas, Clark County, Nevada, United States...",Las Vegas,Clark County,Nevada,United States of America,,,,,"36.1672559, -115.1485163)","36.1672559, -115.1485163",36.1672559,-115.1485163
3,"(40.57592745, -73.99219674341339)","Coney Island, Brooklyn, Kings County, New York...",Coney Island,Brooklyn,Kings County,New York,11224,United States of America,,,"40.57592745, -73.99219674341339)","40.57592745, -73.99219674341339",40.57592745,-73.99219674341339
4,"(33.97996005, -118.46877706707772)","Venice Beach, Pacific Avenue, Venice, Los Ange...",Venice Beach,Pacific Avenue,Venice,Los Angeles,Los Angeles County,California,90292.0,United States of America,"33.97996005, -118.46877706707772)","33.97996005, -118.46877706707772",33.97996005,-118.46877706707772


In [37]:
#drop unnecessary columns
cols_to_drop = [0,1,3,4,5,6,7,8,9,10,11]

city_data.drop(city_data.columns[cols_to_drop],axis=1,inplace=True)

In [38]:
city_data.head()

Unnamed: 0,0,lat,lon
0,Hollywood,34.0980031,-118.3295232
1,New York,40.7127281,-74.0060152
2,Las Vegas,36.1672559,-115.1485163
3,Coney Island,40.57592745,-73.99219674341339
4,Venice Beach,33.97996005,-118.46877706707772


In [39]:
city_data.columns = ['city','lat','lon']

In [40]:
city_data.head()

Unnamed: 0,city,lat,lon
0,Hollywood,34.0980031,-118.3295232
1,New York,40.7127281,-74.0060152
2,Las Vegas,36.1672559,-115.1485163
3,Coney Island,40.57592745,-73.99219674341339
4,Venice Beach,33.97996005,-118.46877706707772


In [41]:
#change the latitudes and longitudes to float types so they can be used later in mapping
city_data['lat'] = city_data['lat'].astype(float)
city_data['lon'] = city_data['lon'].astype(float)

In [42]:
# merge with the value counts for the city mentions df
city_counts = pd.DataFrame(city_val_counts)

In [43]:
city_counts.head()

Unnamed: 0,city
New York,50
Hollywood,33
Brooklyn,20
Santa Monica,13
Coney Island,10


In [44]:
#reset index and name columns to match
city_counts = city_counts.reset_index()
city_counts.columns = ['city', 'mentions']

In [45]:
city_counts.head()

Unnamed: 0,city,mentions
0,New York,50
1,Hollywood,33
2,Brooklyn,20
3,Santa Monica,13
4,Coney Island,10


In [46]:
#drop duplicates from geolocator dataframe
city_data.drop_duplicates(keep=False,inplace=True)

In [47]:
city_data.head()

Unnamed: 0,city,lat,lon
0,Hollywood,34.098003,-118.329523
1,New York,40.712728,-74.006015
2,Las Vegas,36.167256,-115.148516
3,Coney Island,40.575927,-73.992197
4,Venice Beach,33.97996,-118.468777


In [48]:
city_data = city_data[['city', 'lat','lon']]

In [49]:
#merge the two dataframes
merged = pd.merge(city_data, city_counts, on='city', how='left')

In [50]:
merged.head()

Unnamed: 0,city,lat,lon,mentions
0,Hollywood,34.098003,-118.329523,33
1,New York,40.712728,-74.006015,50
2,Las Vegas,36.167256,-115.148516,4
3,Coney Island,40.575927,-73.992197,10
4,Venice Beach,33.97996,-118.468777,6


In [51]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21 entries, 0 to 20
Data columns (total 4 columns):
city        21 non-null object
lat         21 non-null float64
lon         21 non-null float64
mentions    21 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 840.0+ bytes


In [59]:
# map the locations using plotly and mapbox and create an interactive map

px.set_mapbox_access_token(open("mapbox_token.txt").read())
df = px.data.carshare()
fig = px.scatter_mapbox(merged, lat='lat', lon='lon', color='mentions', size='mentions',
                  color_continuous_scale=px.colors.sequential.Agsunset, size_max=40, zoom=3, 
                        hover_data=['city'])
#fig.update_layout(title_text = 'US Cities Mentioned in Lana Del Rey Songs')

fig.update_layout(
    title={
        'text': 'US Cities Mentioned in Lana Del Rey Songs',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})


fig.show()

#save graph as html

with open('plotly_graph.html', 'w') as f:
    f.write(fig.to_html(include_plotlyjs='cdn'))