# Web scrape plaque records (name, subject, coordinates, text)

# Toronto Plaques
* http://torontoplaques.com/Menu_Subjects.html

In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import string
import re
import urllib
import os
import time
from datetime import datetime
import string

In [2]:
from sqlalchemy import create_engine, Column, Integer, String, Sequence, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from models import connect_db, PointsOfInterest, ArchitecturalStyles, Architects,POICategories

In [3]:
# Scraper configuration.
# Every relative link found on the site is resolved against this root.
site_root = "http://torontoplaques.com"
# Index page that lists the plaque subjects.
main_page = f"{site_root}/Menu_Subjects.html"

debug = False

# Accumulates one dict per scraped plaque (filled by load_subject_page).
plaques_list = []

# Switches controlling which of the expensive steps below are executed.
rerun_webscrape = False  # rerun all webscraping
populate_db = False  # repopulate database

In [None]:
def load_page(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error. Without it a stalled connection
            would block the whole scrape indefinitely.

    Returns:
        The page body when the server answers with HTTP 200; otherwise
        ``None`` (the status code is printed). Callers must handle ``None``.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    print(f"Error connecting: status code {response.status_code}")
    return None

In [None]:
def _paragraph_text(soup, css_class):
    """Return the text of the first ``<p>`` with class *css_class*, or None if absent."""
    tag = soup.find('p', {'class': css_class})
    return None if tag is None else tag.text


def load_plaque(plaque_url, name, subject):
    """Scrape a single plaque page into a dict.

    Args:
        plaque_url: Path of the plaque page relative to ``site_root``.
        name: Plaque name, stored as-is under ``'Name'``.
        subject: Subject the plaque was listed under, stored under ``'Subject'``.

    Returns:
        A dict with keys ``Name``, ``Subject``, ``URL`` (the relative path as
        passed in), ``Notes``, ``Latitude`` and ``Longitude``; or ``None``
        when the page could not be loaded or has no usable coordinates.
        Latitude/Longitude are the raw text tokens from the page (the
        latitude may still carry a trailing comma; ``cleanup_data`` removes
        it later).
    """
    plaque_dict = {
        'Name': name,
        'Subject': subject,
        'URL': plaque_url,
        'Notes': None,
        'Latitude': None,
        'Longitude': None,
    }

    html = load_page(f"{site_root}/{plaque_url}")
    if html is None:
        # load_page already reported the HTTP error; skip this plaque.
        return None
    soup = BeautifulSoup(html)

    # When coordinates are missing, that usually means the plaque has been
    # removed, so the plaque is skipped.
    coord_text = _paragraph_text(soup, 'plaquecoordinates')
    if coord_text is None:
        return None
    coord_text = coord_text.strip()
    label = 'Coordinates:'
    # Remove the label as a prefix; str.strip(label) would instead strip any
    # of the label's characters from both ends of the string.
    if coord_text.startswith(label):
        coord_text = coord_text[len(label):]
    coordinates = coord_text.split()
    if len(coordinates) < 2:
        return None
    plaque_dict['Latitude'] = coordinates[0]
    plaque_dict['Longitude'] = coordinates[1]

    # Notes = location description (optional) followed by the plaque text
    # (optional); either paragraph may be missing from a page.
    plaque_text = ""
    location_text = _paragraph_text(soup, 'locationtext')
    if location_text is not None:
        plaque_text = location_text + " "
    body_text = _paragraph_text(soup, 'plaquetext')
    if body_text is not None:
        plaque_text += body_text

    plaque_dict['Notes'] = plaque_text
    return plaque_dict


In [None]:
def load_subject_page(page_url, subject):
    """Scrape every plaque linked from one subject page into ``plaques_list``.

    Args:
        page_url: Path of the subject page relative to ``site_root``
            (for example ``"Subjects/Arts.html"``).
        subject: Subject label recorded on every plaque found on the page.

    Returns:
        None. Each plaque that ``load_plaque`` could parse is appended to the
        module-level ``plaques_list``. Pages that fail to load, and table
        cells without a usable link, are skipped.
    """
    page_url = f"{site_root}/{page_url}"
    print(page_url)
    html = load_page(page_url)
    if html is None:
        # load_page already printed the HTTP status; nothing to parse.
        return
    soup = BeautifulSoup(html)

    link_table = soup.find('div', {'id': 'tablelinks'})
    if link_table is None:
        print(f"No plaque table found on {page_url}")
        return

    # The plaque links sit in the 70%-wide cells of the table.
    for cell in link_table.findAll('td', {'width': '70%'}):
        anchor = cell.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        name = anchor.text.strip()
        # Links look like "../Pages/X.html"; dropping the leading dots leaves
        # a path rooted at the site ("/Pages/X.html").
        plaque_url = href.strip().lstrip('.')
        plaque_dict = load_plaque(plaque_url, name, subject)
        if plaque_dict is not None:
            plaques_list.append(plaque_dict)


In [None]:
#load_subject_page('/Subjects/Medicine.html','Medicine')

In [None]:
def run_webscrape():
    """Scrape every subject listed on ``main_page`` into ``plaques_list``.

    After each subject the plaques collected so far are written to a new
    timestamped CSV under ``../data/``, so a failure part-way through does
    not lose the earlier subjects.
    NOTE(review): the per-subject write looks like a deliberate checkpoint;
    confirm before moving it out of the loop.

    Raises:
        RuntimeError: if the subject index page cannot be loaded or does not
            contain the expected ``tablelinks`` element.
    """
    html = load_page(main_page)
    if html is None:
        raise RuntimeError(f"Could not load subject index page {main_page}")
    soup = BeautifulSoup(html)

    link_table = soup.find('div', {'id': 'tablelinks'})
    if link_table is None:
        raise RuntimeError(f"No subject links found on {main_page}")

    for subject in link_table.findChildren('a'):
        subject_url = subject.get('href')
        if not subject_url:
            # An anchor without a target cannot be followed.
            continue
        asubj = subject.text.strip()
        load_subject_page(subject_url, asubj)
        # Pause between subjects to avoid hammering the site.
        time.sleep(5)
        plq_df = pd.DataFrame(plaques_list)
        plq_df.to_csv('../data/toronto_plaques_' + str(round(time.time(),0)) + '.csv')


In [4]:
def cleanup_data(df, root_url=None):
    """Normalise the scraped plaque columns in place and return the frame.

    Args:
        df: DataFrame with at least ``URL`` and ``Latitude`` columns.
        root_url: Prefix placed in front of every ``URL`` value. Defaults to
            the module-level ``site_root``.

    Returns:
        The same DataFrame object, with ``URL`` made absolute and
        ``Latitude`` converted to float (commas removed first).
    """
    if root_url is None:
        root_url = site_root
    # Operate on the argument, not on a global frame, so the function works
    # for any DataFrame passed in.
    df['URL'] = df['URL'].apply(lambda path: f'{root_url}{path}')
    # str() lets values that are already numeric (or NaN) pass through
    # instead of failing on a missing .replace method.
    df['Latitude'] = df['Latitude'].apply(
        lambda value: float(str(value).replace(',', ''))
    )
    return df

In [5]:
# Build bld_df either from a fresh scrape or from a previously saved CSV.
# Both branches end with the data read back from CSV, so column dtypes are
# inferred the same way regardless of where the data came from.
if rerun_webscrape:
    run_webscrape()
    plq_df = pd.DataFrame(plaques_list)
    csv_path = '../data/toronto_plaques_' + str(round(time.time(),0)) + '.csv'
    plq_df.to_csv(csv_path)
    bld_df = pd.read_csv(csv_path, index_col=0)
else:
    bld_df = pd.read_csv('../data/toronto_plaques_1543013801.0.csv', index_col=0)
# bld_df must exist before this call and be cleaned in both branches;
# previously the scrape branch never defined it.
bld_df = cleanup_data(bld_df)
bld_df.head()

Unnamed: 0,Latitude,Longitude,Name,Notes,Subject,URL
0,43.663153,-79.327211,Alexander Muir 1830-1906,A maple tree on the southwest corner of Laing ...,Arts,http://torontoplaques.com/Pages/Alexander_Muir...
1,43.66205,-79.37973,The Beatles,"Inside the former Maple Leaf Gardens, now a Lo...",Arts,http://torontoplaques.com/Pages/Beatles.html
2,43.643193,-79.423296,The Beatrice Lillie Building,Here at 1115 Queen Street West at Lisgar Stree...,Arts,http://torontoplaques.com/Pages/Beatrice_Lilli...
3,43.650811,-79.382486,Bernard Keble Sandwell (1876-1954),I used to read Saturday Night magazine quite r...,Arts,http://torontoplaques.com/Pages/Bernard_Keble_...
4,43.67115,-79.38708,Boris Volkoff,This 2009 Heritage Toronto plaque can be found...,Arts,http://torontoplaques.com/Pages/Boris_Volkoff....


In [6]:
bld_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1133 entries, 0 to 1132
Data columns (total 6 columns):
Latitude     1133 non-null float64
Longitude    1132 non-null float64
Name         1133 non-null object
Notes        1133 non-null object
Subject      1133 non-null object
URL          1133 non-null object
dtypes: float64(2), object(4)
memory usage: 62.0+ KB


In [7]:
# Maps scraped DataFrame column names to the keyword names used when
# building a PointsOfInterest row. Columns not listed here are not copied.
df_to_db_map = dict(
    Name='name',
    Latitude='latitude',
    Longitude='longitude',
    URL='external_url',
    Notes='details',
)

In [8]:
def save_to_database_ORM(session, df):
    '''
    Saves scraped plaque data to the database using the SQLAlchemy ORM.

    For every row of df one PointsOfInterest object is created from the
    columns listed in df_to_db_map (null values are left out), with
    poi_type fixed to "Plaque" and source set to site_root. The row's
    Subject is attached as a POICategories entry via poi.categories.
    NOTE(review): the link between the two tables is presumably declared
    in models.py -- confirm there.

    Each row is committed on its own, so rows saved before a failure stay
    in the database. If a commit fails the session is rolled back (leaving
    it usable) and the error is re-raised.

    session: an open SQLAlchemy session
    df: DataFrame with the df_to_db_map columns plus 'Subject'
    '''
    for _, row in df.iterrows():
        poi_dict = {
            df_to_db_map[column]: value
            for column, value in row.items()
            if column in df_to_db_map and not pd.isnull(value)
        }
        # Hardcode the type to plaque; every record here comes from the same site.
        poi_dict['poi_type'] = "Plaque"
        poi_dict['source'] = site_root

        poi = PointsOfInterest(**poi_dict)

        # Attach the category, skipping a missing Subject the same way null
        # column values are skipped above.
        subject = row['Subject']
        if not pd.isnull(subject):
            poi.categories.append(POICategories(category=subject))

        session.add(poi)
        try:
            session.commit()
        except Exception:
            # A failed flush leaves the session unusable until rolled back.
            session.rollback()
            raise

In [9]:

# Write the cleaned plaque data to the database; runs only when the
# populate_db switch near the top of the notebook is set to True.
if populate_db:
    db=connect_db() #establish connection (connect_db comes from models; presumably returns an engine -- confirm)
    # Session factory bound to that connection.
    Session = sessionmaker(bind=db)
    session = Session() 
    # One PointsOfInterest row (plus category) is saved per DataFrame row.
    save_to_database_ORM(session, bld_df)