In [330]:
# Get HTML of the desired wiki page
import requests
from bs4 import BeautifulSoup

# Fail fast on network problems: bound the wait and surface HTTP errors here
# instead of parsing an error page further down.
wiki_response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_universities_in_Canada",
    timeout=30,
)
wiki_response.raise_for_status()

# NOTE: despite its name, `url` holds the page's HTML text (name kept unchanged
# so any cell that already refers to it keeps working).
url = wiki_response.text

# Parse data from wiki page into soup object
soup = BeautifulSoup(url,'lxml')

# Extract the table containing the desired data
my_table = soup.find('table', class_ = 'wikitable sortable')
if my_table is None:
    # soup.find returns None when nothing matches; report that explicitly
    # rather than failing later with an AttributeError on None.
    raise ValueError("No 'wikitable sortable' table found on the wiki page")

In [331]:
# Import pandas to convert the scraped lists to a data frame
import pandas as pd

# Only table rows with exactly this many <td> cells are read; all other rows
# (e.g. rows without <td> cells) are skipped.
EXPECTED_CELLS = 9

# Create empty lists for each column in final table
A=[]  # text of the first cell of each kept row  -> 'University'
B=[]  # text of the second cell of each kept row -> 'City'
C=[]  # text of the third cell of each kept row  -> 'Province'

# Pull contents from table and place into created lists.
# For each cell only the first text node is taken, with trailing whitespace removed.
for row in my_table.find_all("tr"):
    cells = row.find_all('td')
    if len(cells)==EXPECTED_CELLS:
        A.append(cells[0].find(string=True).rstrip())
        B.append(cells[1].find(string=True).rstrip())
        C.append(cells[2].find(string=True).rstrip())

df = pd.DataFrame({'University': A, 'City': B, 'Province': C})

# Reduce results to only Ontario universities NOT in Toronto.
# The second mask is built from df_ontario itself so that the mask and the
# frame it filters share the same index (a mask built from the unfiltered
# `df` has to be reindexed by pandas and triggers a warning).
df_ontario = df[df["Province"]=="Ontario"]
df_final = df_ontario[df_ontario["City"]!="Toronto"]
df_final = df_final.reset_index(drop=True)

df_final.head()



Unnamed: 0,University,City,Province
0,Algoma University,Sault Ste. Marie,Ontario
1,Brock University,St. Catharines,Ontario
2,Carleton University,Ottawa,Ontario
3,Dominican University College,Ottawa,Ontario
4,Lakehead University,Thunder Bay,Ontario


In [337]:
# Install geopy from the conda-forge channel through a shell escape
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
# Nominatim geocoder created with the user agent "uni_explorer";
# the geocoding cell below calls geolocator.geocode(...) on it
geolocator = Nominatim(user_agent="uni_explorer")

Solving environment: done

# All requested packages already installed.



In [338]:
# Get the latitudes and longitudes of each university
lats = []
lons = []
# Iterate over the named column instead of positional row[0] lookups, which
# are deprecated for label-indexed rows.
for uni in df_final["University"]:
    location = geolocator.geocode(uni)
    if location is None:
        # geocode() gives back None when no match is found; name the
        # offending query instead of failing with an AttributeError on None.
        raise ValueError("Could not geocode university: {!r}".format(uni))
    lats.append(location.latitude)
    lons.append(location.longitude)

# Add the lat and lon coordinates of each uni to the dataset
df_final["Latitude"] = lats
df_final["Longitude"] = lons

df_final.head()

Unnamed: 0,University,City,Province,Latitude,Longitude
0,Algoma University,Sault Ste. Marie,Ontario,46.501885,-84.28746
1,Brock University,St. Catharines,Ontario,43.119569,-79.249395
2,Carleton University,Ottawa,Ontario,45.386084,-75.695393
3,Dominican University College,Ottawa,Ontario,45.411245,-75.7099
4,Lakehead University,Thunder Bay,Ontario,44.592292,-79.458208


In [339]:
# Set up Foursquare credentials
# Credentials are read from the environment so that secrets are never stored
# in the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError naming it.
# NOTE(review): the key pair that used to be hardcoded here should be
# considered exposed and be regenerated.
import os

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [340]:
# Create a dataset that includes all of the venues nearby each uni's lat/lon location
def getNearbyVenues(names, latitudes, longitudes, radius=1500, LIMIT=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one lat/lon pair per location; they are consumed
        together with ``zip``.
    radius : number, default 1500
        Sent as the ``radius`` request parameter.
    LIMIT : int, default 500
        Sent as the ``limit`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'University',
        'University Latitude', 'University Longitude', 'Venue' and
        'Venue Category'. The frame has these five columns even when no
        venue (or no location) was supplied. 'Venue Category' is None for a
        venue that carries no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = ['University',
                    'University Latitude',
                    'University Longitude',
                    'Venue',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # make the GET request; requests builds and encodes the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # surface API failures here rather than as a KeyError while unpacking
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come back with an empty category list
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((name, lat, lng, venue['name'], category))

    # Passing the column names to the constructor keeps the schema stable even
    # when there are no rows (assigning .columns afterwards fails on an empty frame).
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [341]:
# Build the table of venues returned around every university in df_final
all_venues = getNearbyVenues(
    df_final['University'],
    df_final['Latitude'],
    df_final['Longitude'],
)

all_venues.head()

Unnamed: 0,University,University Latitude,University Longitude,Venue,Venue Category
0,Algoma University,46.501885,-84.28746,Starbucks,Coffee Shop
1,Algoma University,46.501885,-84.28746,Pino's,Grocery Store
2,Algoma University,46.501885,-84.28746,JC Sakura,Japanese Restaurant
3,Algoma University,46.501885,-84.28746,Wacky Wings,Wings Joint
4,Algoma University,46.501885,-84.28746,Tim Hortons,Coffee Shop


In [342]:
# Count the number of distinct venue names returned for each university
uni_counter = all_venues.groupby('University')['Venue'].nunique().reset_index()

print (uni_counter)

# Universities with fewer than this many distinct venues are dropped later
MIN_VENUES = 10

# Create a list of universities that do not have enough venues returned.
# Columns are addressed by name; positional row[0]/row[1] lookups on
# label-indexed rows are deprecated in pandas.
remove_list = uni_counter.loc[uni_counter['Venue'] < MIN_VENUES, 'University'].tolist()

print (remove_list)

                                       University  Venue
0                               Algoma University     18
1                                Brock University     11
2                             Carleton University     53
3                    Dominican University College     98
4                        Huron University College     18
5                             Lakehead University      5
6                           Laurentian University      6
7                             McMaster University     45
8                            Nipissing University     30
9                              Queen's University     97
10               Royal Military College of Canada     63
11                          Saint Paul University     95
12                               Trent University      6
13                           University of Guelph     28
14  University of Ontario Institute of Technology     50
15                           University of Ottawa     98
16                         Univ

In [343]:
# Keep only the venues whose university is not named in remove_list
in_remove_list = all_venues["University"].isin(remove_list)
all_venues = all_venues[~in_remove_list]

all_venues.head()

Unnamed: 0,University,University Latitude,University Longitude,Venue,Venue Category
0,Algoma University,46.501885,-84.28746,Starbucks,Coffee Shop
1,Algoma University,46.501885,-84.28746,Pino's,Grocery Store
2,Algoma University,46.501885,-84.28746,JC Sakura,Japanese Restaurant
3,Algoma University,46.501885,-84.28746,Wacky Wings,Wings Joint
4,Algoma University,46.501885,-84.28746,Tim Hortons,Coffee Shop


In [344]:
# one hot encoding: one indicator column per venue category
unis_onehot = pd.get_dummies(all_venues['Venue Category'])

# A venue category that is itself called "University" would collide with the
# key column added below; previously that dummy column was silently
# overwritten (losing its counts) and the wrong column was moved to the front.
# Rename it so both survive.
if 'University' in unis_onehot.columns:
    unis_onehot = unis_onehot.rename(columns={'University': 'University (venue category)'})

# add University column back to dataframe, directly as the first column
unis_onehot.insert(0, 'University', all_venues['University'])

# Group by university: sum the indicators to get per-category venue counts
unis_grouped = unis_onehot.groupby('University').sum().reset_index()

unis_grouped.head()

Unnamed: 0,University,Adult Boutique,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,...,Track Stadium,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Algoma University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Brock University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Carleton University,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,Dominican University College,1,1,0,0,0,0,1,0,4,...,0,0,0,0,4,0,2,0,0,3
4,Huron University College,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [345]:
# unis_grouped was already computed at the end of the one-hot encoding cell;
# recomputing the identical group-by here was redundant, so only display it.
unis_grouped.head()

Unnamed: 0,University,Adult Boutique,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,...,Track Stadium,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Algoma University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Brock University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Carleton University,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,Dominican University College,1,1,0,0,0,0,1,0,4,...,0,0,0,0,4,0,2,0,0,3
4,Huron University College,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [346]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first entry.

    Parameters
    ----------
    row : pandas.Series
        The first element is skipped; the remaining values are ranked.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the remaining entries, ordered by value from highest
        to lowest and cut to at most ``num_top_venues`` items.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [347]:
num_top_venues = 8

# Print, for every university, its most frequent venue categories
for uni in unis_grouped['University']:
    header = "----"+uni+"----"
    print(header)

    # Transpose this university's row into a two-column (venue, freq) table
    uni_mask = unis_grouped['University'] == uni
    freq_table = unis_grouped[uni_mask].T.reset_index()
    freq_table.columns = ['venue','freq']

    # The first transposed row holds the 'University' entry itself, so skip it
    top_rows = (
        freq_table.iloc[1:]
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(top_rows)
    print('\n')

----Algoma University----
                  venue freq
0           Coffee Shop    2
1  Fast Food Restaurant    2
2        Discount Store    1
3            Beer Store    1
4           Supermarket    1
5          Burger Joint    1
6           Golf Course    1
7        Sandwich Place    1


----Brock University----
                   venue freq
0            Coffee Shop    3
1      Convenience Store    2
2            Gas Station    1
3                    Gym    1
4         Scenic Lookout    1
5                  Hotel    1
6  Performing Arts Venue    1
7             Restaurant    1


----Carleton University----
                  venue freq
0           Coffee Shop    9
1                  Park    3
2                   Pub    3
3        Sandwich Place    3
4  Fast Food Restaurant    2
5           Pizza Place    2
6                   Bar    2
7                Museum    1


----Dominican University College----
                   venue freq
0            Coffee Shop    8
1  Vietnamese Restaurant  