Parse table from Wikipedia webpage

In [1]:
import pandas as pd # library for data analysis
from bs4 import BeautifulSoup # library to parse web pages
import requests # library to handle requests
import csv
import folium # map rendering library
from sklearn.cluster import KMeans
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


get credentials from local file

In [4]:
from credentials import CLIENT_ID, CLIENT_SECRET, VERSION, LIMIT


In [5]:
coordinate_data = {}
with open('Geospatial_Coordinates.csv') as in_file:
    data = csv.DictReader(in_file)
    for row in data:
        coordinate_data[row['Postal Code']] = {'longitude': row['Longitude'],
                                               'latitude': row['Latitude']}

def get_coordinates(postal_code):
    ret = coordinate_data.get(postal_code, {})
    latitude = ret.get('latitude')
    longitude = ret.get('longitude')
    return longitude, latitude

In [6]:
def get_data_from_wikipedia(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    #print(soup.prettify())
    data = []
    table = soup.find('table', attrs={'class':'wikitable sortable'})
    table_body = table.find('tbody')
    #print(table_body)

    # get the headers of the table and store in a list
    table_headers = []
    headers = table_body.find_all('th')
    for header in headers:
        header_value = header.get_text().strip()
        table_headers.append(header_value)

    # get the rows of the table
    rows = table_body.find_all('tr')
    for row in rows:
        row_data = {}
        cells = row.find_all('td')
        for position, cell in enumerate(cells):
            value = cell.get_text().strip()
            key = table_headers[position]
            # add the value to a dictionary
            row_data[key] = value

        # check that there is some data and that Borough is not unassigned
        if row_data and row_data.get('Borough', '') != 'Not assigned':
            data.append(row_data)


    return data

In [7]:
def load_data_into_dataframe(data):
    df = pd.DataFrame(data)
    # rename the postal code heading
    df.rename(columns={"Postal Code": "PostalCode"}, inplace=True)

    return df

In [8]:
def add_coordinates(df):
    longitude = []
    latitude = []

    for index, row in df.iterrows():
        postal_code = row.get('PostalCode')
        row_long, row_lat = get_coordinates(postal_code=postal_code)
        longitude.append(float(row_long))
        latitude.append(float(row_lat))

    df['Latitude'] = latitude
    df['Longitude'] = longitude

    return df

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                  'Neighborhood Latitude',
                  'Neighborhood Longitude',
                  'Venue',
                  'Venue Latitude',
                  'Venue Longitude',
                  'Venue Category']

    return nearby_venues

In [12]:
def process_url(url):
    data = get_data_from_wikipedia(url=url)
    df = load_data_into_dataframe(data=data)
    df = add_coordinates(df=df)
    nearby_venues = getNearbyVenues(names=df['Neighborhood'],
                               latitudes=df['Latitude'],
                               longitudes=df['Longitude'])
    print('There are {} uniques categories.'.format(len(nearby_venues['Venue Category'].unique())))
    temp_nearby_venues = nearby_venues
    temp_nearby_venues['count'] = np.zeros(len(temp_nearby_venues))
    venue_counts = temp_nearby_venues.groupby(['Neighborhood', 'Venue Category']).count()
    print(venue_counts[(venue_counts['count'] > 2)])
    onehot = pd.get_dummies(nearby_venues[['Venue Category']], prefix="", prefix_sep="")
    # add neighborhood column back to dataframe
    onehot['Neighborhood'] = nearby_venues['Neighborhood']

    grouped = onehot.groupby('Neighborhood').mean().reset_index()
    print(grouped.head())

    return grouped


url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
grouped = process_url(url=url)

There are 233 uniques categories.
                                                                        Neighborhood Latitude  \
Neighborhood                                       Venue Category                               
Bedford Park, Lawrence Manor East                  Italian Restaurant                       3   
Brockton, Parkdale Village, Exhibition Place       Café                                     3   
CN Tower, King and Spadina, Railway Lands, Harb... Airport Service                          3   
Central Bay Street                                 Coffee Shop                              6   
Christie                                           Café                                     3   
                                                   Grocery Store                            4   
Commerce Court, Victoria Hotel                     Café                                     5   
Davisville                                         Dessert Shop                             3

In [18]:
# function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)

    return row_categories_sorted.index.values[0:num_top_venues]

In [24]:
# new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = grouped['Neighborhood']

for ind in np.arange(grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
1,"Alderwood, Long Branch",Pizza Place,Gym,Sandwich Place,Coffee Shop,Athletics & Sports,Pub,Pool,Dance Studio,Deli / Bodega,Department Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Shopping Mall,Fried Chicken Joint,Supermarket,Sushi Restaurant,Ice Cream Shop,Restaurant,Deli / Bodega,Middle Eastern Restaurant
3,Bayview Village,Café,Chinese Restaurant,Japanese Restaurant,Bank,Deli / Bodega,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Pizza Place,Restaurant,Coffee Shop,Indian Restaurant,Butcher,Café,Sushi Restaurant,Spa


cluster the neighbourhoods

In [27]:
# set number of clusters
kclusters = 5

grouped_clustering = grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [48]:

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Pool,Food & Drink Shop,Curling Ice,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Intersection,Portuguese Restaurant,Coffee Shop,French Restaurant,Pizza Place,Hockey Arena,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1.0,Coffee Shop,Park,Bakery,Pub,Café,Theater,Breakfast Spot,Gym / Fitness Center,Historic Site,French Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1.0,Clothing Store,Accessories Store,Furniture / Home Store,Coffee Shop,Miscellaneous Shop,Boutique,Event Space,Vietnamese Restaurant,General Entertainment,Cuban Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Coffee Shop,Sushi Restaurant,Diner,Sandwich Place,Park,Mexican Restaurant,Italian Restaurant,Hobby Shop,Fried Chicken Joint,Distribution Center


In [49]:
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

Find the average position to center the map

In [50]:
def Average(lst):
    return sum(lst) / len(lst)

avg_latitude = Average(latitude)
avg_longitude = Average(longitude)

In [51]:
# create map
map_clusters = folium.Map(location=[avg_latitude, avg_longitude], zoom_start=11)

Add positions to the map

In [60]:
tempdf = toronto_merged.dropna()
tempdf['Cluster Labels'] = tempdf['Cluster Labels'].astype('int')
tempdf.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


PostalCode                 object
Borough                    object
Neighborhood               object
Latitude                  float64
Longitude                 float64
Cluster Labels              int32
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object

In [52]:
def plot_clusters(df, map_to_add_to):
    markers_colors = []
    for index, row in df.iterrows():
        postal_code = row['PostalCode']
        lat = row['Latitude']
        lon = row['Longitude']
        neighbour = row['Neighborhood']
        cluster = row['Cluster Labels']
        label = folium.Popup(str(postal_code) + ' Cluster ' + str(neighbour), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_to_add_to)

In [61]:
plot_clusters(df=tempdf, map_to_add_to=map_clusters)

In [62]:
map_clusters

