Parse table from Wikipedia webpage

In [2]:
import pandas as pd # library for data analysis
from bs4 import BeautifulSoup # library to parse web pages
import requests # library to handle requests
import csv
import folium # map rendering library
from sklearn.cluster import KMeans
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

get credentials from local file

In [3]:
from credentials import CLIENT_ID, CLIENT_SECRET, VERSION, LIMIT


In [5]:
coordinate_data = {}
with open('Geospatial_Coordinates.csv') as in_file:
    data = csv.DictReader(in_file)
    for row in data:
        coordinate_data[row['Postal Code']] = {'longitude': row['Longitude'],
                                               'latitude': row['Latitude']}

def get_coordinates(postal_code):
    ret = coordinate_data.get(postal_code, {})
    latitude = ret.get('latitude')
    longitude = ret.get('longitude')
    return longitude, latitude

In [29]:
def get_data_from_wikipedia(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    #print(soup.prettify())
    data = []
    table = soup.find('table', attrs={'class':'wikitable sortable'})
    table_body = table.find('tbody')
    #print(table_body)

    # get the headers of the table and store in a list
    table_headers = []
    headers = table_body.find_all('th')
    for header in headers:
        header_value = header.get_text().strip()
        table_headers.append(header_value)

    row_key_remapping = {'Neighborhood': 'Neighbourhood'}

    # get the rows of the table
    rows = table_body.find_all('tr')
    for row in rows:
        row_data = {}
        cells = row.find_all('td')
        for position, cell in enumerate(cells):
            value = cell.get_text().strip()
            key = table_headers[position]
            key = row_key_remapping[key] if key in row_key_remapping else key
            # add the value to a dictionary
            row_data[key] = value

        # check that there is some data and that Borough is not unassigned
        if row_data and row_data.get('Borough', '') != 'Not assigned':
            data.append(row_data)


    return data


def load_data_into_dataframe(data):
    df = pd.DataFrame(data)
    # rename the postal code heading
    df.rename(columns={"Postal Code": "PostalCode",
                       "Neighborhood": "Neighbourhood"},
              inplace=True)

    return df

def add_coordinates(df):
    longitude = []
    latitude = []

    for index, row in df.iterrows():
        postal_code = row.get('PostalCode')
        row_long, row_lat = get_coordinates(postal_code=postal_code)
        longitude.append(float(row_long))
        latitude.append(float(row_lat))

    df['Latitude'] = latitude
    df['Longitude'] = longitude

    return df


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        print(url)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                  'Neighborhood Latitude',
                  'Neighborhood Longitude',
                  'Venue',
                  'Venue Latitude',
                  'Venue Longitude',
                  'Venue Category']

    return nearby_venues


def process_url(url):
    data = get_data_from_wikipedia(url=url)
    df = load_data_into_dataframe(data=data)
    df = add_coordinates(df=df)
    nearby_venues = getNearbyVenues(names=df['Neighbourhood'],
                               latitudes=df['Latitude'],
                               longitudes=df['Longitude'])
    print('There are {} uniques categories.'.format(len(nearby_venues['Venue Category'].unique())))
    temp_nearby_venues = nearby_venues
    temp_nearby_venues['count'] = np.zeros(len(temp_nearby_venues))
    venue_counts = temp_nearby_venues.groupby(['Neighbourhood', 'Venue Category']).count()
    print(venue_counts[(venue_counts['count'] > 2)])
    onehot = pd.get_dummies(nearby_venues[['Venue Category']], prefix="", prefix_sep="")
    # add neighborhood column back to dataframe
    onehot['Neighbourhood'] = nearby_venues['Neighbourhood']

    grouped = onehot.groupby('Neighbourhood').mean().reset_index()
    print(grouped.head())

    return df, grouped

# function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)

    return row_categories_sorted.index.values[0:num_top_venues]


def top_10_sorted(grouped):

    num_top_venues = 10

    indicators = ['st', 'nd', 'rd']

    # create columns according to number of top venues
    columns = ['Neighbourhood']
    for ind in np.arange(num_top_venues):
        try:
            columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
        except:
            columns.append('{}th Most Common Venue'.format(ind+1))

    # create a new dataframe
    neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
    neighborhoods_venues_sorted['Neighbourhood'] = grouped['Neighbourhood']

    for ind in np.arange(grouped.shape[0]):
        neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

    neighborhoods_venues_sorted.head()

    return neighborhoods_venues_sorted

def cluster_and_merge(df, grouped, neighborhoods_venues_sorted):

    # set number of clusters
    kclusters = 5

    grouped_clustering = grouped.drop('Neighbourhood', 1)

    # run k-means clustering
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

    # check cluster labels generated for each row in the dataframe
    #kmeans.labels_[0:10]

    # add clustering labels
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

    location_merged = df

    # merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
    location_merged = location_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

    location_merged.head()

    return location_merged


def plot_clusters(df, kclusters):
    # create map


    map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

    df = df.dropna()
    df['Cluster Labels'] = df['Cluster Labels'].astype('int')

    # set color scheme for the clusters
    x = np.arange(kclusters)
    ys = [i + x + (i*x)**2 for i in range(kclusters)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    markers_colors = []
    for index, row in df.iterrows():
        postal_code = row['PostalCode']
        lat = row['Latitude']
        lon = row['Longitude']
        neighbour = row['Neighbourhood']
        cluster = row['Cluster Labels']
        label = folium.Popup(str(postal_code) + ' Cluster ' + str(neighbour), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_clusters)


In [30]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
toronto_df, toronto_grouped = process_url(url=url)

KeyError: 'groups'

In [22]:
toronto_sorted_top_10 = top_10_sorted(toronto_grouped)

NameError: name 'toronto_grouped' is not defined

In [23]:
toronto_merged = cluster_and_merge(df=toronto_df, grouped=toronto_grouped,
                  neighborhoods_venues_sorted=toronto_sorted_top_10)


NameError: name 'toronto_df' is not defined

In [None]:
plot_clusters(df=toronto_merged)

