# Coursera Capstone Final Project

## Project Description

The goal of the project is to find the best place in Budapest for a new pub. Finding the right location is crucial for a business; most of the time it is the deciding factor in the early days of the business.

I will use the Foursquare API to solve the problem. To find the best place, it's important to know which places already exist, which of them are popular, and whether there is any room for a new business.

There are also regulations on where you can open a pub: in our case it has to be at least 150 meters from any public school.

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import folium # map rendering library
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import re
import geopy.distance
import pickle

In [None]:
def parse(url):
    """Download *url* and return the response's ``content``.

    A browser-like User-Agent header is sent because some websites check the
    browser before answering GET queries.
    """
    browser_headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=browser_headers)
    return response.content

This page contains the GPS coordinates of all the districts: http://nepesseg.com/budapest/
The function below parses that data:

In [None]:
def parseDistricts(num):
    """Scrape the coordinate pair of Budapest district *num* from nepesseg.com.

    The page is searched for the first ``<p>`` whose text contains
    "GPS koordinátái:"; the last two numbers found in that paragraph are
    returned as a 2-tuple of strings.
    """
    page = parse("http://nepesseg.com/budapest/budapest-{:02d}-kerulet".format(num))
    document = BeautifulSoup(page, "lxml")

    def is_gps_paragraph(node):
        return node.name == "p" and "GPS koordinátái:" in node.text

    gps_text = document.find(is_gps_paragraph).text
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", gps_text)
    return (numbers[-2], numbers[-1])

Get the coordinates for all 23 districts in Budapest

In [None]:
# Build one (label, latitude, longitude) tuple per district, 1 through 23.
budapest_coord = []
for district_no in range(1, 24):
    lat_text, lng_text = parseDistricts(district_no)
    district_label = "Budapest " + str(district_no)
    budapest_coord.append((district_label, float(lat_text), float(lng_text)))

Checking the coordinates

In [None]:
# Display the scraped (label, latitude, longitude) tuples for a visual check.
budapest_coord

In [None]:
# Tabulate the district tuples: one row per district.
budapest_data = pd.DataFrame(
    data=list(budapest_coord),
    columns=['District', 'Latitude', 'Longitude'],
)

In [None]:
# Centre point passed as `location` to the folium maps in this notebook.
latitude, longitude = 47.50, 19.05

In [None]:
# Create a map of Budapest centred on the latitude and longitude values.
map_budapest = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one labelled circle marker per district.
district_points = zip(
    budapest_data['Latitude'],
    budapest_data['Longitude'],
    budapest_data['District'],
)
for district_lat, district_lng, district_name in district_points:
    marker = folium.CircleMarker(
        [district_lat, district_lng],
        radius=5,
        popup=folium.Popup(district_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_budapest)

map_budapest

In [None]:

# Try out the distance computation on two fixed coordinate pairs.
coords_1 = (47.4968, 19.0375)
coords_2 = (47.5393, 18.9869)

# `geopy.distance.vincenty` was deprecated in geopy 1.13 and removed in
# geopy 2.0; `geodesic` is its documented replacement.
print(geopy.distance.geodesic(coords_1, coords_2).km)

In [None]:
import os

# Foursquare credentials. The FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET
# environment variables take precedence so the keys need not live in the
# notebook; the literals remain only as a fallback so existing runs keep working.
# NOTE(review): these keys are committed in plain text -- rotate them and
# remove the fallbacks once the environment variables are in place.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'N3WFOT4ZNN400G3S3MD23HOSCMOLSY4IGXLEMDX4O0K5NEWC') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'WJYGOWE2B5KPY30LFQ20IYVJ522BLYLAZRCPNMBJQZLRDFZT') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Only the first four characters of the secret are echoed, because notebook
# output is stored together with the code.
print('CLIENT_SECRET:' + CLIENT_SECRET[:4] + '*' * max(len(CLIENT_SECRET) - 4, 0))

In [None]:
# Coordinates stored in the row labelled 0 of budapest_data.
neighborhood_latitude, neighborhood_longitude = (
    budapest_data.loc[0, column] for column in ('Latitude', 'Longitude')
)

In [None]:
# Single explore request around the coordinates selected above.
LIMIT = 100
radius = 1000
explore_args = (
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT,
)
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(*explore_args)
explore_response = requests.get(url)
results = explore_response.json()
results

In [None]:
def get_category_type(row):
    """Return the name of the first category of a venue row, or None.

    ``row`` is indexed with ``'categories'`` first; when that key is missing
    it is indexed with ``'venue.categories'`` instead. An empty category list
    yields ``None``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes
        Iterables consumed in lockstep; one request is made per triple.
    radius
        Value sent as the ``radius`` query parameter (default 1000).

    Returns
    -------
    pandas.DataFrame
        Columns: Id, Neighborhood, Neighborhood Latitude, Neighborhood
        Longitude, Venue, Venue Latitude, Venue Longitude, Venue Category.
        A location whose reply carries no venue groups contributes one
        placeholder row (empty strings and zero venue coordinates).
    """
    LIMIT = 500
    columns = ['Id',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # Make the GET request once and reuse the parsed payload, instead of
        # sending the same request a second time to read the items.
        res_json = requests.get(url).json()
        groups = (res_json.get("response") or {}).get("groups")
        if not groups:
            # No usable result for this location: keep it visible in the
            # output with a placeholder row.
            records.append(('', name, lat, lng, '', 0, 0, ''))
            continue

        # keep only the relevant information for each nearby venue
        for item in groups[0]['items']:
            venue = item['venue']
            categories = venue['categories']
            records.append((
                venue['id'],
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category gets an empty category name
                # rather than aborting the whole run.
                categories[0]['name'] if categories else ''))

    # Passing the column names to the constructor also works when no rows
    # were collected at all.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Collect the venues around every row of budapest_data (default radius).
budapest_venues = getNearbyVenues(
    budapest_data['District'],
    budapest_data['Latitude'],
    budapest_data['Longitude'],
)

In [None]:
# Per neighborhood, count the non-missing entries in each column.
budapest_venues.groupby('Neighborhood').count()

In [None]:
# Keep only the venues whose category is one of the pub-like categories.
pub_categories = [
    'Beer Bar',
    'Bistro',
    'Bar',
    'Karaoke Bar',
    'Gastropub',
    'Cocktail Bar',
    'Beer Garden',
    'Brewery',
    'Pub',
]
budapest_pubs = budapest_venues[budapest_venues['Venue Category'].isin(pub_categories)]

In [None]:
# Display the filtered pub table.
budapest_pubs

In [None]:
# Display only the pubs whose neighborhood is 'Budapest 7'.
in_district_7 = budapest_pubs['Neighborhood'] == 'Budapest 7'
budapest_pubs[in_district_7]

In [None]:
def get_venue_details(venue_id):
    """Request the Foursquare venue record for *venue_id* and return the decoded JSON."""
    endpoint = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'
    response = requests.get(endpoint.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION))
    return response.json()

In [None]:
# Accumulator for the raw venue-detail responses.
pub_details = []

In [None]:
# Fetch the detail record of every pub listed under 'Budapest 7'.
district_7_pubs = budapest_pubs[budapest_pubs['Neighborhood'] == 'Budapest 7']
for pub_id in district_7_pubs['Id']:
    pub_details.append(get_venue_details(pub_id))

In [None]:
# Save the fetched details to disk. The `with` block closes the file handle
# even if pickling fails; the original left it open.
with open("pub_details.p", "wb") as details_file:
    pickle.dump(pub_details, details_file)

In [None]:
def getDetail(detail, key):
    """Return ``detail[key]``, or the string 'Unknown' when *key* is not in *detail*."""
    return detail[key] if key in detail else 'Unknown'
    

In [None]:
# Print name, rating and price message of every fetched pub.
for detail in pub_details:
    venue = detail['response']['venue']

    # getDetail returns the plain string 'Unknown' for a missing key, and
    # indexing that string with ['message'] raises TypeError -- so only look
    # inside 'price' when the venue actually carries a price mapping.
    price = getDetail(venue, 'price')
    price_message = price.get('message', 'Unknown') if isinstance(price, dict) else price

    print(getDetail(venue, 'name') + ' ' + str(getDetail(venue, 'rating')) + ' '+ price_message)

In [None]:
# Stack the pubs on top of the full venue table: every pub row then occurs
# more than once, so dropping all duplicated rows (keep=False) leaves the
# venues that are not pubs.
pubs_then_all = pd.concat([budapest_pubs, budapest_venues])
budapest_other_venues = pubs_then_all.drop_duplicates(keep=False)

In [None]:
# Create a map of Budapest centred on the latitude and longitude values.
pub_budapest = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one small labelled circle per pub.
pub_points = zip(
    budapest_pubs['Venue Latitude'],
    budapest_pubs['Venue Longitude'],
    budapest_pubs['Venue'],
)
for pub_lat, pub_lng, pub_name in pub_points:
    circle = folium.Circle(
        [pub_lat, pub_lng],
        radius=5,
        popup=folium.Popup(pub_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    circle.add_to(pub_budapest)

pub_budapest

In [None]:
# List the distinct venue categories present in the data.
budapest_venues['Venue Category'].unique()

## Schools

In [None]:
# Preview one page of the school listing (start=20): print each school name
# followed by the second comma-separated field of its info text.
num = 20
url = "http://www.iskolaklistaja.eu/tipus/?regio=kozep-magyarorszag&kerulet=budapest&start={}".format(num)
content = parse(url)
soup = BeautifulSoup(content, "lxml")
names = soup.find_all("div", {"class": "school_name"})
infos = soup.find_all("div", {"class": "school_info"})
for name, info in zip(names, infos):
    info_fields = info.text.split(',')
    print(name.text + " " + info_fields[1])


In [None]:
def get_school_data(ker):
    """Scrape the school table of Budapest district *ker* from holmivan.valami.info.

    Parameters
    ----------
    ker
        District number, inserted into the page URL.

    Returns
    -------
    pandas.DataFrame
        Columns Name, Address, Latitude, Longitude; one row per table row
        that has data cells.
    """
    url = "https://holmivan.valami.info/budapest-{}-kerulet/iskola-93".format(ker)
    content = parse(url)
    soup = BeautifulSoup(content, "lxml")
    table = soup.find('table', attrs={'class':'itemlist table table-condensed table-striped'})
    data = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) < 2:
            # Rows without data cells (e.g. a header made of <th>) carry no school.
            continue
        # The GPS coordinates are the last two numbers in the onclick
        # attribute of the link in the last column.
        coord_tag = cols[-1].find("a")
        arr = re.findall(r"[-+]?\d*\.\d+|\d+", coord_tag['onclick'])
        # Keep every field in its position: dropping empty values would shift
        # the remaining ones into the wrong columns.
        data.append([
            cols[0].text.strip(),
            cols[1].text.strip(),
            float(arr[-2]),
            float(arr[-1]),
        ])
    school_data = pd.DataFrame(data, columns=['Name', 'Address', 'Latitude', 'Longitude'])
    return school_data

In [None]:
# Stack the school tables of districts 5, 6, 7 and 8 (row labels are kept as returned).
school_data = pd.concat([get_school_data(district) for district in (5, 6, 7, 8)])

In [None]:
school_budapest = folium.Map(location=[latitude, longitude], zoom_start=11)

# One entry per layer, drawn in this order:
# (table, (latitude column, longitude column, label column), circle radius, outline colour, fill colour)
circle_layers = [
    (school_data, ('Latitude', 'Longitude', 'Name'), 50, 'red', '#ffcccc'),
    (budapest_pubs, ('Venue Latitude', 'Venue Longitude', 'Venue'), 5, 'blue', '#3186cc'),
    (budapest_other_venues, ('Venue Latitude', 'Venue Longitude', 'Venue'), 5, 'yellow', '#00ffff'),
]

# add markers to map
for layer_frame, (lat_col, lng_col, label_col), circle_radius, outline, fill_shade in circle_layers:
    for lat, lng, label in zip(layer_frame[lat_col], layer_frame[lng_col], layer_frame[label_col]):
        folium.Circle(
            [lat, lng],
            radius=circle_radius,
            popup=folium.Popup(label, parse_html=True),
            color=outline,
            fill=True,
            fill_color=fill_shade,
            fill_opacity=0.7,
            parse_html=False,
        ).add_to(school_budapest)

school_budapest

## 3. Analyze Each Neighborhood

In [None]:
# One-hot encode the venue categories; the generated columns get no prefix.
category_frame = budapest_venues[['Venue Category']]
budapest_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Attach the neighborhood of every venue.
budapest_onehot['Neighborhood'] = budapest_venues['Neighborhood']

# Move the last column to the front (the 'Neighborhood' column, assuming no
# venue category is itself named 'Neighborhood').
column_names = list(budapest_onehot.columns)
fixed_columns = column_names[-1:] + column_names[:-1]
budapest_onehot = budapest_onehot[fixed_columns]

budapest_onehot.head()

In [None]:
# Average the one-hot rows per neighborhood, then print the categories with
# the highest averages for each neighborhood.
budapest_grouped = budapest_onehot.groupby('Neighborhood').mean().reset_index()
num_top_venues = 5

for hood in budapest_grouped['Neighborhood']:
    print("----"+hood+"----")
    hood_row = budapest_grouped[budapest_grouped['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue','freq']
    # The first transposed row holds the neighborhood name itself; skip it.
    freq_table = freq_table.iloc[1:]
    freq_table['freq'] = freq_table['freq'].astype(float)
    freq_table = freq_table.round({'freq': 2})
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')