# Capstone Project - IBM Data Science

This notebook is part of the Capstone Project of the IBM Data Science Professional Certificate

In [1]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


In [3]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import wget
import os.path
from os import path
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import json
from pandas.io.json import json_normalize
import folium

In [4]:
# Download the file containing geographical coordinates of Australian postcodes if it doesn't exist
if path.exists('australian_postcodes.csv') == False:
    url = 'https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv'
    wget.download(url)
geo = pd.read_csv("australian_postcodes.csv")[['postcode', 'locality', 'lat', 'long']] # Read in relevant columns of file
geo = geo.dropna() # Remove rows with NAs
geo = geo[(geo.lat > -34.13) & (geo.lat < -33.57) & (geo.long > 150.60) & (geo.long < 151.35)] # Select only those rows that are in Sydney area
geoleft = geo.groupby(['postcode'])['locality'].apply(', '.join).reset_index() # Combine localities within same postcode
georight = geo.groupby(['postcode'])['lat', 'long'].mean().reset_index() # Average latitudes and longitudes for multiple localities in same postcode
geo = pd.merge(left = geoleft, right = georight, left_on = 'postcode', right_on = 'postcode') # Merge back into single dataframe
print('Shape of dataframe:', geo.shape) # Find shape of resulting dataframe
geo.head() # Preview resulting dataframe

Shape of dataframe: (476, 4)


  if __name__ == '__main__':


Unnamed: 0,postcode,locality,lat,long
0,1001,SYDNEY,-33.794883,151.268071
1,1002,SYDNEY,-33.794883,151.268071
2,1003,SYDNEY,-33.794883,151.268071
3,1004,SYDNEY,-33.794883,151.268071
4,1005,SYDNEY,-33.794883,151.268071


In [5]:
# Download the file containing solar installation data by postcode in Australia if it doesn't exist
if path.exists('postcodes_1618.csv') == False:
    url = 'https://d284f79vx7w9nf.cloudfront.net/attachments/analysis/5/postcodes_1618.csv'
    wget.download(url)
pv = pd.read_csv("postcodes_1618.csv")[['postcode', 'capacity', 'pot_kw']] # Read in relevant columns of file
pv = pv.dropna().astype({'postcode' : int}) # Remove rows with NAs and convert postcode to integer format
print('Shape of dataframe:', pv.shape) # Find shape of resulting dataframe
pv.head() # Preview resulting dataframe

Shape of dataframe: (2605, 3)


Unnamed: 0,postcode,capacity,pot_kw
0,800,1943.0,32376.0
1,810,15648.0,198075.0
2,812,7638.0,101247.0
3,820,13470.0,188975.0
4,822,12360.0,140603.0


In [6]:
df = pd.merge(left = geo, right = pv, left_on = "postcode", right_on = "postcode") # Merge the two dataframes (via inner join)
print('Shape of dataframe:', df.shape) # Find shape of resulting dataframe
df.head() # Preview resulting dataframe

Shape of dataframe: (220, 6)


Unnamed: 0,postcode,locality,lat,long,capacity,pot_kw
0,2000,"BARANGAROO, DARLING HARBOUR, DAWES POINT, HAYM...",-33.860016,151.25053,2097.0,56265.0
1,2007,"BROADWAY, ULTIMO",-33.883189,151.19665,144.0,16683.0
2,2008,"CHIPPENDALE, DARLINGTON, GOLDEN GROVE",-33.891146,151.193858,632.0,19225.0
3,2009,"DARLING ISLAND, PYRMONT",-33.871222,151.193055,907.0,18489.0
4,2010,"DARLINGHURST, SURRY HILLS, TAYLOR SQUARE",-33.884119,151.212262,919.0,55246.0


In [11]:
# Find geographical coordinates of Sydney, Australia
address = 'Sydney'
geolocator = Nominatim(user_agent="syd_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Sydney are {}, {}.'.format(latitude, longitude))

The geographical coordinates of Sydney are -33.8548157, 151.2164539.


In [12]:
# Create map of Sydney using latitude and longitude values
map_syd = folium.Map(location = [latitude, longitude], zoom_start = 10)

# Add markers to map
for postcode, locality, lat, long in zip(df['postcode'], df['locality'], df['lat'], df['long']):
    label = 'Postcode: {} (Contains suburbs: {})'.format(postcode, locality)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, long],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_syd)  

# View the map
map_syd

In [13]:
LIMIT = 20
radius = 1000

def gettipcounts(postcode, lat, long, radius = radius):
    
    tipcounts_list = []
    for postcode, lat, long in zip(postcode, lat, long):
            
        # create the API request URL
        url1 = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            ACCESS_TOKEN, 
            VERSION, 
            lat, 
            long, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results1 = requests.get(url1).json()["response"]['groups'][0]['items']
        except:
            continue
        
        # return only relevant information for each nearby venue
        for v in results1:
            venue_id = v['venue']['id']
            url2 = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&oauth_token={}&v={}'.format(
                venue_id,
                CLIENT_ID,
                CLIENT_SECRET,
                ACCESS_TOKEN,
                VERSION)
            try:
                results2 = requests.get(url2).json()['response']['venue']
            except:
                continue
            tipcounts_list.append([
                postcode,
                results2['name'],
                results2['tips']['count']
            ])
        
    tip_counts = pd.DataFrame([item for tipcount_list in tipcounts_list for item in tipcount_list])
    tip_counts.columns = ['postcode', 
                  'venue', 
                  'tipcounts']
    
    return(tip_counts)

In [15]:
LIMIT = 20
radius = 1000

def gettipcounts(postcode, lat, long, radius = radius):
    
    # Initialise empty dataframe
    tip_counts = pd.DataFrame(columns = ['postcode', 'venue', 'tipcounts'])
    
    for postcode, lat, long in zip(postcode, lat, long):
            
        # create the API request URL
        url1 = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            ACCESS_TOKEN, 
            VERSION, 
            lat, 
            long, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results1 = requests.get(url1).json()["response"]['groups'][0]['items']
        except:
            continue
        
        # return only relevant information for each nearby venue
        for v in results1:
            venue_id = v['venue']['id']
            url2 = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&oauth_token={}&v={}'.format(
                venue_id,
                CLIENT_ID,
                CLIENT_SECRET,
                ACCESS_TOKEN,
                VERSION)
            try:
                results2 = requests.get(url2).json()['response']['venue']
            except:
                continue
            venue = results2['name']
            tipcounts = results2['tips']['count']
            tip_counts = tip_counts.append({'postcode' : postcode, 'venue' : venue, 'tipcounts' : tipcounts}, ignore_index = True)
    
    return(tip_counts)

In [18]:
LIMIT = 20
radius = 1000

def gettipcounts(postcode, lat, long, radius = radius):
    
    # Initialise empty dataframe
    tip_counts = pd.DataFrame(columns = ['postcode', 'venue', 'tipcounts'])
    
    for postcode, lat, long in zip(postcode, lat, long):
            
        # create the API request URL
        url1 = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            ACCESS_TOKEN, 
            VERSION, 
            lat, 
            long, 
            radius, 
            LIMIT)
            
        # make the GET request
        results1 = requests.get(url1).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        for v in results1:
            venue_id = v['venue']['id']
            url2 = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&oauth_token={}&v={}'.format(
                venue_id,
                CLIENT_ID,
                CLIENT_SECRET,
                ACCESS_TOKEN,
                VERSION)
            results2 = requests.get(url2).json()['response']['venue']
            venue = results2['name']
            tipcounts = results2['tips']['count']
            tip_counts = tip_counts.append({'postcode' : postcode, 'venue' : venue, 'tipcounts' : tipcounts}, ignore_index = True)
    
    return(tip_counts)

In [21]:
tip_counts = pd.DataFrame(columns = ['postcode', 'venue', 'tipcounts'])
tip_counts

Unnamed: 0,postcode,venue,tipcounts


In [19]:
sydney_tipcounts = gettipcounts(postcode = df['postcode'],
                                   lat = df['lat'],
                                   long = df['long']
                                  )

KeyError: 'groups'

In [17]:
print('Shape of dataframe:', sydney_tipcounts.shape)
sydney_tipcounts.head()

Shape of dataframe: (0, 3)


Unnamed: 0,postcode,venue,tipcounts


In [None]:
tip = sydney_tipcounts.groupby(['postcode'])['tipcounts'].sum().reset_index()
df_final = pd.merge(left = df, right = tip, left_on = 'postcode', right_on = 'postcode') # Merge the two dataframes (via inner join)
print('Shape of dataframe:', df_final.shape) # Find shape of resulting dataframe
df_final.head() # Preview resulting dataframe

In [31]:
venue_id

'4bdbd2353904a593e1364c9e'

In [35]:
x = results['response']['groups'][0]['items']

In [36]:
for result in x:
    venue_id = result['venue']['id']
    print(venue_id)

4bdbd2353904a593e1364c9e
4b3c5fc9f964a520178425e3
4baf10cbf964a5208de93be3
4b089cc6f964a5206c0f23e3
4b218facf964a520f83d24e3
55230c83498ecee05b0427ed
4b80b5caf964a5209c8730e3
52b7d78b498ea1b65e7354ad
4c34fb5b452620a1f30c260f


In [None]:
lat = -33.860016
long = 151.250530
LIMIT = 20
radius = 1000

In [None]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            ACCESS_TOKEN, 
            VERSION, 
            lat, 
            long, 
            radius, 
            LIMIT)
url

In [None]:
results = requests.get(url).json()

In [None]:
results