In [59]:
import re
import time
import numpy as np
import pandas as pd
import googlemaps as gmaps

gmaps = gmaps.Client(key='AIzaSyAiiG2SnCVLWlg0ZA6GKXS-beHPIidYVeM')

In [60]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [61]:
df = pd.read_csv('data.csv')
print(df.shape)
df.head(5)

(2309, 14)


Unnamed: 0,price,title,furnishing,tenants preferred,bathroom,availability,longitude,latitude,balcony,floor,facing,overlooking,car parking,owner resides
0,"₹ 18,000",3 BHK House for rent in Gudimalkapur 1400 sqft,Unfurnished,Bachelors/Family,2,Immediately,,,,,,,,
1,"₹ 20,000",3 BHK Villa for rent in Shamshabad 1500 sqft,Unfurnished,Bachelors,3,From Mar '20,17.270166,78.393593,3.0,,,,,
2,"₹ 5,800",1 BHK Apartment for rent in Vinayak Nagar II-H...,Unfurnished,Family,1,Immediately,,,1.0,1 out of 1 Floor,East,Main Road,,
3,"₹ 14,000","2 BHK Apartment for rent in Kukatpally, NH 9 1...",Semi-Furnished,Bachelors/Family,2,Immediately,,,,2 out of 5 Floors,,,,
4,"₹ 20,000",3 BHK Apartment for rent in Dellapur 1560 sqft,Semi-Furnished,Bachelors/Family,2,Immediately,,,,5 out of 5 Floors,,,,


In [62]:
def parse_address(x):
    x = re.split(' \d\d+', x)[0]
    x = re.split(' in', x)[-1]
    return x+', Hyderabad'

In [63]:
df['address'] = df['title'].apply(parse_address)

In [52]:
print(len(df), 'total rows')
unique_locs = list(set(df['address']))
print(len(unique_locs), 'unique locations')

2309 total rows
863 unique locations


# Geocoding

In [53]:
def get_gmap_geos(locations, max_errors=10):
    geocoded = dict()
    start_time = time.perf_counter()
    errors = 0
    for iteration, location in enumerate(locations):
        try:
            geocoded[location] = gmaps.geocode(location)
            time.sleep(.05) # <-- throttle
        except:
            time.sleep(2)
            try:
                geocoded[location] = gmaps.geocode(location)
            except:
                time.sleep(2)
                try:
                    geocoded[location] = gmaps.geocode(location)
                except:
                    geocoded[location] = 'error'
                    errors+=1
        if errors>max_errors:
            print('stopped at', iteration, '(max errors reached)')
            break
            return geocoded
        if iteration>0 and iteration%200==0:
            now = time.strftime('%H:%M:%S', time.localtime(time.time()))
            print(f'{now} | {iteration} locations geocoded ({round(iteration/len(unique_locs)*100, 2)}%) | {errors} total errors')
    print(f'job completed in {time.perf_counter() - start_time}s\n{errors} requests timed out')
    print(len([v for v in geocoded.values() if v=='error']), 'locations not found')
    return geocoded

In [54]:
unique_geocodes = get_gmap_geos(unique_locs)

21:37:59 | 200 locations geocoded (23.17%) | 0 total errors
21:39:43 | 400 locations geocoded (46.35%) | 0 total errors
21:41:24 | 600 locations geocoded (69.52%) | 0 total errors
21:43:18 | 800 locations geocoded (92.7%) | 0 total errors
job completed in 454.48432708099995s
0 requests timed out
0 locations not found


### Map back onto df

In [58]:
unique_geocodes

{' RV Citin Loft, Manikonda, Outer Ring Road, Hyderabad': [{'address_components': [{'long_name': 'Road Number 25',
     'short_name': 'Rd Number 25',
     'types': ['route']},
    {'long_name': 'Huda',
     'short_name': 'Huda',
     'types': ['political', 'sublocality', 'sublocality_level_3']},
    {'long_name': 'Alkapur Township',
     'short_name': 'Alkapur Twp',
     'types': ['political', 'sublocality', 'sublocality_level_2']},
    {'long_name': 'Manikonda',
     'short_name': 'Manikonda',
     'types': ['political', 'sublocality', 'sublocality_level_1']},
    {'long_name': 'Hyderabad',
     'short_name': 'Hyderabad',
     'types': ['locality', 'political']},
    {'long_name': 'Ranga Reddy',
     'short_name': 'R.R. District',
     'types': ['administrative_area_level_2', 'political']},
    {'long_name': 'Telangana',
     'short_name': 'Telangana',
     'types': ['administrative_area_level_1', 'political']},
    {'long_name': 'India',
     'short_name': 'IN',
     'types': ['count

In [64]:
df['gmaps_info'] = df['address'].apply(lambda x: unique_geocodes[x])

In [65]:
def extract_coords(geo):
    try:
        geo = geo[-1] # drop generalized extra results
        return geo['geometry']['location']['lat'], geo['geometry']['location']['lng']
    except:
        return np.nan, np.nan

In [67]:
df['coord'] = df['gmaps_info'].apply(extract_coords)
df['latitude'].fillna(df['coord'].apply(lambda x: x[0]), inplace=True)
df['longitude'].fillna(df['coord'].apply(lambda x: x[1]), inplace=True)
df.head()

Unnamed: 0,price,title,furnishing,tenants preferred,bathroom,availability,longitude,latitude,balcony,floor,facing,overlooking,car parking,owner resides,address,gmaps_info,coord
0,"₹ 18,000",3 BHK House for rent in Gudimalkapur 1400 sqft,Unfurnished,Bachelors/Family,2,Immediately,78.437402,17.378782,,,,,,,"Gudimalkapur, Hyderabad",[{'address_components': [{'long_name': 'Gudima...,"(17.3787821, 78.4374024)"
1,"₹ 20,000",3 BHK Villa for rent in Shamshabad 1500 sqft,Unfurnished,Bachelors,3,From Mar '20,17.270166,78.393593,3.0,,,,,,"Shamshabad, Hyderabad",[{'address_components': [{'long_name': 'Shamsh...,"(17.2619301, 78.387971)"
2,"₹ 5,800",1 BHK Apartment for rent in Vinayak Nagar II-H...,Unfurnished,Family,1,Immediately,78.598729,17.331128,1.0,1 out of 1 Floor,East,Main Road,,,"Vinayak Nagar II-Hayathnagar, Hyderabad",[{'address_components': [{'long_name': 'Vinaya...,"(17.3311276, 78.5987291)"
3,"₹ 14,000","2 BHK Apartment for rent in Kukatpally, NH 9 1...",Semi-Furnished,Bachelors/Family,2,Immediately,78.399146,17.494868,,2 out of 5 Floors,,,,,"Kukatpally, NH 9, Hyderabad",[{'address_components': [{'long_name': 'Nation...,"(17.4948682, 78.3991456)"
4,"₹ 20,000",3 BHK Apartment for rent in Dellapur 1560 sqft,Semi-Furnished,Bachelors/Family,2,Immediately,78.286481,17.463172,,5 out of 5 Floors,,,,,"Dellapur, Hyderabad",[{'address_components': [{'long_name': 'Tellap...,"(17.463172, 78.2864815)"


### Export

In [68]:
df.to_csv('geocoded.csv', index=False)