### This Jupyter Notebook is used to gather location details from the geopy API
This can be considered the second version of geopy-test.ipynb 

In [1]:
import os
import glob
import json

import boto3
import configparser
import matplotlib.pyplot as plt
import pandas as pd
from time import time
import numpy as np
import html

In [2]:
import sys
!{sys.executable} -m pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl (100kB)
[K    100% |████████████████████████████████| 102kB 3.4MB/s a 0:00:011
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.20.0


In [3]:
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# Nominatim client used for all lookups in this notebook.
# NOTE: the variable is named `geopy`, the same as the package.
geopy = Nominatim(user_agent="geopy-Testing")

# Rate-limited wrappers: at least one second between consecutive calls.
geocode = RateLimiter(geopy.geocode, min_delay_seconds=1)
reverse = RateLimiter(geopy.reverse, min_delay_seconds=1)

In [4]:
def recur_geocode(address, max_retries=5):
    """
    Summary line.
    Forward-geocodes an address through the rate-limited `geocode` callable.
    The call sometimes fails due to timeouts and other service issues and succeeds when rerun,
    so failed calls are retried. The retries are bounded by `max_retries` (a loop instead of
    unbounded recursion), so a persistent failure cannot retry forever.

    Parameters:
    address (artist_location): location text to look up
    max_retries (int): number of extra attempts made after the first failed one (default 5)

    Returns:
    Location details as returned by `geocode`, or None when every attempt raised
    GeocoderTimedOut / GeocoderServiceError
    """
    total_attempts = max(0, max_retries) + 1
    for _ in range(total_attempts):
        try:
            return geocode(address, language='en')
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            # Transient failure: report it and fall through to the next attempt.
            print('Error : recur_geocode : {} : {}'.format(address, e))
    # Retry budget exhausted; report "no result" instead of retrying indefinitely.
    return None

def recur_reverse(latslong, max_retries=5):
    """
    Summary line.
    Reverse-geocodes a coordinate pair through the rate-limited `reverse` callable.
    The call sometimes fails due to timeouts and other service issues and succeeds when rerun,
    so failed calls are retried. A failed reverse lookup is retried as a reverse lookup
    (not as a forward geocode), and the retries are bounded by `max_retries` so a
    persistent failure cannot retry forever.

    Parameters:
    latslong: coordinates to look up, e.g. the string "latitude, longitude"
    max_retries (int): number of extra attempts made after the first failed one (default 5)

    Returns:
    Location details as returned by `reverse`, or None when every attempt raised
    GeocoderTimedOut / GeocoderServiceError
    """
    total_attempts = max(0, max_retries) + 1
    for _ in range(total_attempts):
        try:
            return reverse(latslong, language='en')
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            # Transient failure: report it and fall through to the next attempt.
            print('Error : recur_reverse : {} : {}'.format(latslong, e))
    # Retry budget exhausted; report "no result" instead of retrying indefinitely.
    return None

def get_location_details(*argv):
    """
    Summary line.
    Gets the location details for a place name or for a latitude/longitude pair.

    A place name is first forward-geocoded with recur_geocode() to obtain coordinates;
    the coordinates are then reverse-geocoded with recur_reverse() and the address
    fields of the result are copied into a flat dictionary.

    Parameters:
    Allow variable number of arguments
    #if 1 argument, input is location name
    #if 2 argument, input is lats & long

    Returns:
    Formatted location details: a dict with the keys 'county', 'city', 'state',
    'country', 'country_code' (np.nan when the address has no such field), 'lat' and
    'lon' (coordinates of the reverse-geocoded result) and 'latslong' (the
    "lat, lon" string that was sent to the reverse lookup).
    None when either lookup returns nothing or the result carries no 'address'.

    Raises:
    ValueError when called with anything other than 1 or 2 arguments.
    """
    if len(argv) == 1:
        location = recur_geocode(argv[0])
        if location is None:
            return None
        lats, long = location.latitude, location.longitude
    elif len(argv) == 2:
        lats, long = argv
    else:
        # Fail with a clear message instead of an unbound-variable error further down.
        raise ValueError(
            'get_location_details expects 1 argument (location name) or '
            '2 arguments (latitude, longitude), got {}'.format(len(argv))
        )

    latslong = "{}, {}".format(lats, long)

    location = recur_reverse(latslong)
    if location is None:
        return None

    address = location.raw.get('address')
    if address is None:
        return None

    loc_details = {}
    for field in ('county', 'city', 'state', 'country', 'country_code'):
        value = address.get(field)
        # np.nan marks a field that is missing from the address.
        loc_details[field] = np.nan if value is None else value
    # Coordinates come from the reverse-geocoded result, not from the input.
    loc_details['lat'] = location.latitude
    loc_details['lon'] = location.longitude
    loc_details['latslong'] = latslong
    return loc_details

In [5]:
#Initial Tests to see if the geocode works
# The query is stored in `query` rather than `input`, so the built-in input()
# is not shadowed for the rest of the kernel session.
query = 'St. Joseph, MO'
geolocator = Nominatim(user_agent="geopy-Testing")
location = geolocator.geocode(query,  language='en')
print(location.address)
print(location.latitude)
print(location.longitude)

St. Joseph, Buchanan County, Missouri, USA
39.7686055
-94.8466322


In [6]:
#Inital tests to see if the reverse_geocode works
lats = location.latitude
long = location.longitude
print("Before : Latitude:{}, Longitude:{}".format(lats, long))
latslong = "{}, {}".format(lats, long)
location = geolocator.reverse(latslong,  language='en')
print('Raw : {}'.format(location.raw))
geoData = location.raw
city = geoData['address']['city']
state = geoData['address']['state']
country = geoData['address']['country']
print("City:{}".format(city))
print("State:{}".format(state))
print("Country:{}".format(country))
print("After : Latitude:{}, Longitude:{}".format(lats, long))

Before : Latitude:39.7686055, Longitude:-94.8466322
Raw : {'place_id': 184596674, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'way', 'osm_id': 526006570, 'lat': '39.7686102', 'lon': '-94.8467038258343', 'display_name': 'Saint Joseph City Hall, 1100, Frederick Avenue, St. Joseph, Buchanan County, Missouri, 64501, USA', 'address': {'townhall': 'Saint Joseph City Hall', 'house_number': '1100', 'road': 'Frederick Avenue', 'city': 'St. Joseph', 'county': 'Buchanan County', 'state': 'Missouri', 'postcode': '64501', 'country': 'USA', 'country_code': 'us'}, 'boundingbox': ['39.7685008', '39.7687214', '-94.847066', '-94.8463415']}
City:St. Joseph
State:Missouri
Country:USA
After : Latitude:39.7686055, Longitude:-94.8466322


In [7]:
# Initial test of get_location_details() using the two-argument (lat, lon) form
lat, lon = 39.7686055, -94.8466322
location_details = get_location_details(lat, lon)
print(location_details)

{'county': 'Buchanan County', 'city': 'St. Joseph', 'state': 'Missouri', 'country': 'USA', 'country_code': 'us', 'lat': 39.7686102, 'lon': -94.8467038258343, 'latslong': '39.7686055, -94.8466322'}


### Loading song_df_latlon with latitude & longitude locations for geocode processing

In [8]:
# Load the rows to be reverse-geocoded from latitude/longitude; the path is
# relative to the notebook's working directory. Print the shape as a sanity check.
song_df_latlon = pd.read_csv("song_df_latlon.csv")
print('song_df_latlon.csv = ',song_df_latlon.shape)

song_df_latlon.csv =  (5277, 18)


In [9]:
# Preview the first rows of the loaded frame
song_df_latlon.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
0,ARPPJJO1187B99D171,25.03512,,121.5152,Michael Stearns,427,1,SOEBWIE12A8C14380F,Sacred Site Soundtrack,1993,,Y,,,,Taipei,TW,tw
1,ARBZIN01187FB362CC,1.32026,,103.78871,Paris Hilton,192,1,SOERIDA12A6D4F8506,I Want You (Album Version),2006,,Y,,Northwest,,,Singapore,sg
2,ARZLVVQ11F4C8421BA,39.49974,,-111.54732,T.O.K,188,1,SOTQSHS12A8AE46A4A,Money Maker,0,,Y,,Sanpete County,,Utah,USA,us
3,ARAQMES1187FB4D46A,28.64334,,-81.23258,Benzino / The Outlawz,228,1,SOHJBCO12AB01889CD,Feel Your Pain,0,,Y,,Seminole County,,Florida,USA,us
4,AR4T2IF1187B9ADBB7,63.96027,,10.22442,Billy Idol,287,1,SOVYXYL12AF72A3373,Rebel Yell (1999 Digital Remaster),1983,,Y,,Trøndelag,,,Norway,no


In [21]:
# Column dtypes and non-null counts
song_df_latlon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5277 entries, 0 to 5276
Data columns (total 18 columns):
artist_id             5277 non-null object
artist_latitude       5277 non-null float64
artist_location       5259 non-null object
artist_longitude      5277 non-null float64
artist_name           5277 non-null object
duration              5277 non-null int64
num_songs             5277 non-null int64
song_id               5277 non-null object
title                 5277 non-null object
year                  5277 non-null int64
_merge                0 non-null float64
process               5277 non-null object
corrected_location    5277 non-null object
county                5277 non-null object
city                  5277 non-null object
state                 5277 non-null object
country               5277 non-null object
country_code          5277 non-null object
dtypes: float64(3), int64(3), object(12)
memory usage: 742.2+ KB


In [20]:
# Cast the bookkeeping and address columns to str in a single astype call.
song_df_latlon = song_df_latlon.astype({
    column_name: str
    for column_name in (
        'process',
        'corrected_location',
        'county',
        'city',
        'state',
        'country',
        'country_code',
    )
})

### Note:-  
The step below was not run in one shot, because the geocode API is rate limited and only about 500 location details could be retrieved per round. Before the next round you have to wait at least 1 hr 30 min or so. A "process" column was added to mark the processed rows, and the row index was used to break out of the loop.   

The dataframe is updated directly using .at[], which lets you stop the loop when you get a "Too many requests" error while keeping the rows that were already processed. 

This is for **latitude & longitude** processing

In [25]:
%%time
#song_df_latlon
song_df_array = []
for index, row in song_df_latlon.iterrows():
    #print(index)
    
    if(row['process'] == 'Y'):
        continue
        
    print('Processing row {}'.format(index))
    #if(index > 5300):
    #    break
 
    #print('A = ',location, len(location))
#    row['city']=None
    lat = row['artist_latitude']
    lon = row['artist_longitude']
        
    if(lat != None or lon != None or lat != np.nan or lon != np.nan):
        if(np.isnan(lat) or np.isnan(lon)):
            print('Found NAN @ index ',index)
            continue

        location_details = get_location_details(lat, lon)

        if(location_details != None):
            #print('C = ',index, location_details)
            song_df_latlon.at[index,'process']='Y'
            song_df_latlon.at[index,'county']=location_details.get('county')
            song_df_latlon.at[index,'city']=location_details.get('city')
            song_df_latlon.at[index,'state']=location_details.get('state')
            song_df_latlon.at[index,'country']=location_details.get('country')
            song_df_latlon.at[index,'country_code']=location_details.get('country_code')            



Processing row 761
Processing row 3770
Processing row 5167
Processing row 5252
CPU times: user 623 ms, sys: 12.4 ms, total: 635 ms
Wall time: 5.02 s


In [26]:
#song_df_latlon.to_csv('song_df_latlon.csv', encoding='utf-8', index=False)

In [None]:
#song_df_latlon.to_csv('song_df_latlon-backup.csv', encoding='utf-8', index=False)

In [27]:
# Shape check after the processing loop
song_df_latlon.shape

(5277, 18)

In [14]:
#Using tail to see to which point geocode was processed
#(shows the last 10 rows whose "process" column is 'Y')
song_df_latlon[song_df_latlon.process == 'Y'].tail(10)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
2991,ARRXNFW1187B9A0E6B,42.50382,ITALY,12.57347,Arcadia,329,1,SOUSJZQ12A6D4F69CB,Election Day,0,,Y,,TR,,Umbria,Italy,it
2992,AR8FAQM1187B9B98C0,38.8991,"Washington, D.C.",-77.029,The Dismemberment Plan,182,1,SONLZIF12A58A7DB4E,The Small Stuff,1995,,Y,,,Washington,District of Columbia,USA,us
2993,ARXJFJ51187B98A366,31.3893,ISRAEL,35.36124,Kukan Dub Lagan,365,1,SOSMQGH12AB0185195,In the End of my Day,2007,,Y,,,Tamar Regional Council,South District,Israel,il
2994,ARV15CM1187B990EEA,35.472,"Oklahoma City, OK",-97.52033,Traindodge,290,1,SOAGVJV12AF729DACF,Brushing Of The Wings,0,,Y,,Oklahoma County,Oklahoma City,Oklahoma,USA,us
2995,ARVML4B1187FB52324,40.73197,"Newark, NJ",-74.17418,Leroy Hutson,181,1,SOINLKW12A6D4F8F83,Ella Weez,1974,,Y,,Essex County,Newark,New Jersey,USA,us
2996,AR2BG0R1187FB398C3,29.59733,"Houma, LA",-90.71913,Dax Riggs,158,1,SOUURWE12AB0182EA3,Night is the Notion,2007,,Y,,Terrebonne Parish,,Louisiana,USA,us
2997,ARJFOC01187FB413A5,33.59233,"Lubbock, TX",-101.85587,Delbert McClinton,251,1,SOGPXXQ12AB0180CC6,He Will Break Your Heart,0,,Y,,Lubbock County,Lubbock,Texas,United States of America,us
2998,ARYR2F11187B9976FA,36.15398,"Tulsa, OK",-95.99277,AM,229,1,SOGGZPR12AF729F74F,New Road,0,,Y,,Tulsa County,Tulsa,Oklahoma,USA,us
2999,ARVKQ0C1187FB3FF78,44.94382,"Saint Paul, MN",-93.09332,Hockey Night,210,1,SOTQYCQ12A58A81624,Cooperation,2005,,Y,,Ramsey County,Saint Paul,Minnesota,United States of America,us
3000,AROMVBH123E29C285B,28.59457,"Winter Park, FL",-81.35071,Mike Dunn & The Kings Of New England,222,1,SOTMYJW12AB0181839,The Sunshine State,2009,,Y,,Orange County,,Florida,USA,us


### Loading song_df_wlocations with corrected_location & artist_location info. for geocode processing

In [24]:
# Load the rows to be geocoded from their location text; the path is relative
# to the notebook's working directory. Print the shape as a sanity check.
song_df_wlocations = pd.read_csv("song_df_wlocations.csv")
print('song_df_wlocations.csv = ',song_df_wlocations.shape)

song_df_wlocations.csv =  (2974, 18)


In [25]:
# Cast the bookkeeping and address columns to str in a single astype call.
song_df_wlocations = song_df_wlocations.astype({
    column_name: str
    for column_name in (
        'process',
        'corrected_location',
        'county',
        'city',
        'state',
        'country',
        'country_code',
    )
})

In [26]:
#Interesting to see nan came as a string
#(the astype(str) cast in the previous cell is presumably what turns missing values
# into the text 'nan', so the filter compares against 'nan'; the None / np.nan
# variants are kept below, commented out, for reference)
song_df_wlocations[song_df_wlocations.corrected_location == 'nan'].tail(15)
#song_df_wlocations[song_df_wlocations.corrected_location == None].tail(15)
#song_df_wlocations[song_df_wlocations.corrected_location == np.nan].tail(15)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code


In [27]:
# Preview the first rows of the loaded frame
song_df_wlocations.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
0,ARTH9041187FB43E1F,39.76861,"St. Joseph, MO",-94.846704,Eminem,312,1,SOLXDDC12A6701FBFD,I'm Back,2000,,Y,"St. Joseph, MO",Buchanan County,St. Joseph,Missouri,USA,us
1,ARQFUGM1187FB3E24E,34.053696,"Los Angeles, California, USA",-118.242921,Black Label Society,258,1,SOHVHKM12A8C13F716,Counterfeit God,2000,,Y,"Los Angeles, USA",Los Angeles County,Los Angeles,California,United States of America,us
2,ARJ0AL61187B9A3F27,41.710404,Georgia,44.031081,Katie Melua,193,1,SOFHFLK12AF72A4FB9,Mary Pickford (Used To Eat Roses),2008,,Y,Georgia,Tsalka Municipality,,Lower Kartli,Georgia,ge
3,ARJ2PMY1187FB5B563,40.851723,NY - Long Island,-73.099233,Burning Spear,334,1,SOVTFUO12A6310D813,The Invasion (Aka Black Wa Da Da),0,,Y,"NY, Long Island",Suffolk County,,New York,USA,us
4,ARWR6RK1187FB3AB52,40.851723,"Long Island, NY",-73.099233,From Autumn To Ashes,147,1,SOHKXAC12A58A7F6E5,IV,2002,,Y,"Long Island, NY",Suffolk County,,New York,USA,us


### Note:-  
The step below was not run in one shot, because the geocode API is rate limited and only about 500 location details could be retrieved per round. Before the next round you have to wait at least 1 hr 30 min or so. A "process" column was added to mark the processed rows, and the row index was used to break out of the loop.   

The dataframe is updated directly using .at[], which lets you stop the loop when you get a "Too many requests" error while keeping the rows that were already processed. 

This is for **location** processing

In [28]:
%%time
#Get locations details for places with locations using song_df_wlocations
for index, row in song_df_wlocations.iterrows():
    #print(index)
    
    if(row['process'] == 'Y'):
        continue
        
    print('Processing row {}'.format(index))
    if(index > 3000):
        break
 
    #print('A = ',location, len(location))
    #row['city']=None
    artist_location = row['artist_location']
    corrected_location = row['corrected_location']
    
    if(corrected_location != None or corrected_location != np.nan or corrected_location != 'nan'):
        location_details = get_location_details(corrected_location)
        if(location_details == None):
            location_details = get_location_details(artist_location)
            
        if(location_details != None):
            #print('C = ',index, location_details)
            song_df_wlocations.at[index,'process']='Y'
            song_df_wlocations.at[index,'county']=location_details.get('county')
            song_df_wlocations.at[index,'city']=location_details.get('city')
            song_df_wlocations.at[index,'state']=location_details.get('state')
            song_df_wlocations.at[index,'country']=location_details.get('country')
            song_df_wlocations.at[index,'country_code']=location_details.get('country_code')            
            song_df_wlocations.at[index,'artist_latitude']=location_details.get('lat')            
            song_df_wlocations.at[index,'artist_longitude']=location_details.get('lon')            


Processing row 2952
Processing row 2953
Processing row 2954
Processing row 2955
Processing row 2956
Processing row 2957
Processing row 2958
Processing row 2959
Processing row 2960
Processing row 2961
Processing row 2962
Processing row 2963
Processing row 2964
Processing row 2965
Processing row 2966
Processing row 2967
Processing row 2968
Processing row 2969
Processing row 2970
Processing row 2971
Processing row 2972
Processing row 2973
CPU times: user 840 ms, sys: 75.5 ms, total: 915 ms
Wall time: 40.8 s


In [29]:
# Shape check after the processing loop
song_df_wlocations.shape

(2974, 18)

In [31]:
#song_df_wlocations[song_df_wlocations.process == 'nan'].tail(15)
# Last 5 rows whose "process" column is 'Y', to see how far processing got
song_df_wlocations[song_df_wlocations.process == 'Y'].tail(5)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
2969,ART39ZB1187B9B1E3A,38.104364,"Hillside of Vallejo, CA",-122.256914,E-40,226,1,SOGGNFQ12AB017F83C,Get Em Up,1993,,Y,"Vallejo, CA",Solano County,Vallejo,California,USA,us
2970,AR40YBH1187FB38A1A,36.2507,"Maynardsville, TN",-83.796784,Roy Acuff,138,1,SOBGYAV12A58A7E45D,Back In The Country,2001,,Y,"Maynardville, TN",Union County,Maynardville,Tennessee,USA,us
2971,ART39ZB1187B9B1E3A,38.104364,"Hillside of Vallejo, CA",-122.256914,E-40_ Goldie Gold,254,1,SOEXHJE12AB0184322,Off That Vodka (feat. Goldie Gold) (Clean Vers...,0,,Y,"Vallejo, CA",Solano County,Vallejo,California,USA,us
2972,ARKDO731187B98E21B,43.371493,"Baimorto, La Coruna, Spain",-8.395835,Luz Casal,216,1,SOPVGTJ12A8C13EACD,Bajo Tu Abrazo,0,left_only,Y,"La Coruna, Spain",A Coruña,A Coruña,Galicia,Spain,es
2973,ARD2GXE1187B9A2E26,5.565521,"Guyana, West Indies",-58.153124,Mad Professor,266,1,SOUIPHX12A6D4F8AB0,Schizophrenic Dub,1986,,Y,Guyana,,,Upper Demerara-Berbice,Guyana,gy


In [32]:
#process column has nan as string
#(rows selected here are the ones whose "process" value is the text 'nan',
# i.e. not marked 'Y')
song_df_wlocations[song_df_wlocations.process == 'nan'].head(15)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code


In [33]:
#song_df_wlocations.to_csv('song_df_wlocations.csv', encoding='utf-8', index=False)

In [34]:
#song_df_wlocations.to_csv('song_df_wlocations-backup.csv', encoding='utf-8', index=False)