# Web Scrape labelled images of building styles

# Architectural Index for Ontario - Archindont
* Archindont is a database of architectural information and citations to periodical articles and books about buildings in Toronto.
* list of building types: http://archindont.torontopubliclibrary.ca/Arch/search.do;jsessionid=jzKVTS6juZ65uhz4EA9tv9K7
http://archindont.torontopubliclibrary.ca/Arch/main.do

In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import string
import re
import urllib
import os
import time
from datetime import datetime
import string

In [2]:
from sqlalchemy import create_engine, Column, Integer, String, Sequence, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from models import connect_db, PointsOfInterest, ArchitecturalStyles, Architects,POICategories

In [3]:
# --- Scraper configuration ---------------------------------------------

# Archindont "search by building type" listing. The initial letter of the
# building type is appended to this URL when a listing page is requested.
main_page = (
    'http://archindont.torontopubliclibrary.ca/Arch/search.do'
    '?searchType=Typ&initial='
)
# style_url = "http://www.acotoronto.ca/search_buildingsDB_d2.php"

# Root of the Archindont site; links found in the pages are joined onto it.
site_root = "http://archindont.torontopubliclibrary.ca"

debug = False

# Accumulates one dictionary per scraped building.
buildings_list = []

# Switches for the expensive steps further down.
rerun_webscrape = False  # rerun all webscraping
populate_db = False  # repopulate database

In [3]:
def load_page(url, timeout=30):
    '''
    Fetch a page over HTTP and return its body as text.

    url: address passed to requests.get
    timeout: seconds to wait for the server; requests waits indefinitely
        when no timeout is given, so a stalled server would otherwise hang
        the whole scrape. A timeout surfaces as a requests exception.

    Returns response.text when the status code is 200. For any other status
    an error message is printed and None is returned, so callers have to
    cope with a None result.
    '''
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text

    print(f"Error connecting: status code {response.status_code}")
    return None

In [4]:
def _new_building_record():
    '''Return an empty building record with every scraped field set to None.'''
    return {
        'Address:': None,
        'Type of Building:': None,
        'Name of Building:': None,
        'Notes:': None,
        'Building completed': None,
        'Demolished': None,
        'Architects:': None,
    }


def building_types_pages(type_url):
    '''
    Returns list of dictionaries where each dictionary is info about one building
    To Test: /Arch/buildingType.do;jsessionid=A2ZMTUjOxuCFc4yaAxah-Nws?type=Apartment+Houses&typeID=18'

    type_url: link to one building-type page, relative to site_root

    The page is walked cell by cell (th/td). A cell whose stripped text is
    one of the record keys is treated as a label, and the text of the next
    cell is stored as that label's value. An 'Address:' label seen while
    the current record already has an address starts a new record.

    Returns an empty list when the page could not be loaded.
    '''
    buildings = []

    type_url = f"{site_root}/{type_url}"
    html = load_page(type_url)
    if html is None:
        # load_page has already printed the failure; nothing to parse
        return buildings

    record = _new_building_record()
    pending_label = None  # label whose value is expected in the next cell

    # parser choice is left to BeautifulSoup's default
    soup = BeautifulSoup(html)
    for tag in soup.descendants:
        if tag.name not in ('th', 'td'):
            continue

        text = tag.text.strip()

        if text == 'Address:' and record['Address:'] is not None:
            # new building: write out the record of the previous one
            buildings.append(record)
            record = _new_building_record()

        if pending_label is not None:
            record[pending_label] = text
            pending_label = None

        if text in record:
            pending_label = text

    # The last building on the page has no following 'Address:' label to
    # trigger the write-out above, so flush it here.
    if record['Address:'] is not None:
        buildings.append(record)

    return buildings

In [23]:
def run_webscrape(letters=string.ascii_uppercase[25], delay=5):
    '''
    Scrape every building-type page listed under the given initial letters.

    letters: iterable of initial letters to request from main_page. The
        default is the single letter "Z", matching the previously
        hard-coded slice; pass string.ascii_uppercase to crawl all letters.
    delay: seconds to sleep after each building-type page

    Scraped buildings are appended to the module-level buildings_list.
    After each letter, the whole of buildings_list collected so far is
    written to a timestamped CSV under ../data/.

    Letters whose listing page cannot be loaded or has no results form, and
    list items without a link, are skipped instead of raising.
    '''
    for char in letters:
        letter_url = f"{main_page}{char}"
        html = load_page(letter_url)
        if html is None:
            # load_page has already printed the failure
            continue

        soup = BeautifulSoup(html)
        results_form = soup.find('form', {'name': 'SearchResultsForm'})
        if results_form is None:
            print(f"No building types listed for letter {char}")
            continue

        for style in results_form.find_all('li'):
            link = style.find('a')
            if link is None:
                continue
            bld_type_url = link.get_attribute_list('href')[0]
            if not bld_type_url:
                continue
            buildings_list.extend(building_types_pages(bld_type_url))
            time.sleep(delay)

        bld_df = pd.DataFrame(buildings_list)
        bld_df.to_csv('../data/archidont_buildings_' + str(char) + str(round(time.time(), 0)) + '.csv')


In [None]:
def save_to_database_ORM(session, df):
    '''
    Saves scraped data to database using SqlAlchemy ORM

    session: SQLAlchemy session used to add and commit the rows
    df: DataFrame of scraped buildings; columns named in df_to_db_map are
        copied onto PointsOfInterest, and 'Architects:' is split into
        Architects rows

    One PointsOfInterest is created per DataFrame row, with its non-null
    mapped columns plus source=site_root. Architectural styles are not
    written because the Archindont data carries no style information.
    Per the original author's note, the relationship defined in models.py
    fills in poi_id on the child rows.

    Each row is committed on its own; if a commit fails the session is
    rolled back and the exception is re-raised.
    '''
    for _, row in df.iterrows():
        poi_dict = {
            df_to_db_map[k]: v
            for k, v in row.items()
            if k in df_to_db_map and not pd.isnull(v)
        }
        poi_dict['source'] = site_root
        poi = PointsOfInterest(**poi_dict)

        # architects (can be multiple), separated by \r\n in the scraped text
        architects = row.get('Architects:')
        if isinstance(architects, str):
            prev_company = None
            for company in architects.split('\r\n'):
                company = company.strip()
                # skip blank entries and immediate repeats of the same name
                if not company or company == prev_company:
                    continue
                # Stored as-is: doubling apostrophes by hand would save the
                # doubled text, assuming the ORM binds values as parameters.
                poi.architects.append(Architects(architect_name=company))
                prev_company = company

        session.add(poi)
        try:
            session.commit()
        except Exception:
            # leave the session usable for the caller before propagating
            session.rollback()
            raise

In [36]:
def cleanup_data(df):
    '''
    Filter and normalise the scraped buildings table.

    df: DataFrame with at least the columns 'Demolished',
        'Name of Building:', 'Address:' and 'Type of Building:'

    Steps:
      * drop demolished buildings (rows with a non-null 'Demolished')
      * drop rows with no building name, since they are meaningless for us
      * append ", Toronto ON" to every address
      * print the buildings whose address starts with '0' for manual review

    Returns a new DataFrame; the input is not modified.
    '''
    # TODO: check for duplicates before inserting

    keep = df['Demolished'].isnull() & df['Name of Building:'].notnull()
    # Work on an explicit copy so the assignment below neither touches the
    # caller's frame nor triggers pandas' SettingWithCopyWarning.
    df = df[keep].copy()

    # Append TORONTO, ON to all addresses
    df['Address:'] = df['Address:'] + ", Toronto ON"

    # Report addresses starting with '0' so they can be fleshed out by hand.
    # Missing addresses are NaN (not str) and are skipped.
    for _, row in df.iterrows():
        address = row['Address:']
        if isinstance(address, str) and address.startswith('0'):
            print(row['Name of Building:'], address, row['Type of Building:'])

    # we have some odd categories of buildings here that we don't really want to include in our walks

    return df

In [37]:
# Build bld_df either from the saved CSV (default) or from a fresh scrape.
if not rerun_webscrape:
    # Reuse the earlier scrape; shapes are printed before and after cleanup.
    bld_df = pd.read_csv('../data/archidont_buildingsB_Z.csv', index_col=0)
    print(bld_df.shape)
    bld_df = cleanup_data(bld_df)
    print(bld_df.shape)
else:
    # Scrape again and save the raw result under a timestamped name.
    run_webscrape()
    bld_df = pd.DataFrame(buildings_list)
    csv_path = '../data/archidont_buildings_' + str(round(time.time(), 0)) + '.csv'
    bld_df.to_csv(csv_path)
bld_df.head()

(2447, 7)
Canadian Bank of Commerce 0 Danforth Avenue, Toronto ON Banks
Banca Commerciale Italiana 0 St. Clair Avenue West, Toronto ON Banks
Bank of Nova Scotia 0 St. Clair Avenue West, Toronto ON Banks
Molson Plant 0 Fleet Street, Toronto ON Breweries
Governor's Bridge 0 Belt Line (Ravine), Toronto ON Bridges
Governor's Bridge 0 Bloor Street East, Toronto ON Bridges
Rosedale Valley Bridge 0 Bloor Street East, Toronto ON Bridges
Cherry Street (lift) Bridge 0 Cherry Street, Toronto ON Bridges
Don Valley Bicycle Bridge 0 Don Valley Parkway, Toronto ON Bridges
Gerrard Street Bridge 0 Gerrard Street East, Toronto ON Bridges
Humber Pedestrian Cycle Brg 0 Harbourfront, Toronto ON Bridges
Huntley Street Bridge 0 Huntley Street, Toronto ON Bridges
John St. Pedestrian Bridge 0 John Street, Toronto ON Bridges
Mimico Creek bridge 0 Mimico Creek, Toronto ON Bridges
Mount Pleasant Cemetery Bdg 0 Mount Pleasant Road, Toronto ON Bridges
Queen Street Bridge 0 Queen Street East, Toronto ON Bridges
St. 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


 0 Spadina Avenue, Toronto ON Parks
Penninsula Pleasure Ground 0 Toronto Island(s), Toronto ON Parks
Victoria Park 0 Victoria Park Avenue, Toronto ON Parks
Alexander Muir Memorial Gardens 0 Yonge Street, Toronto ON Parks
High Park Pavilion 0 High Park, Toronto ON Pavilions
Coronation Park Pavilion 0 Lake Shore Boulevard West, Toronto ON Pavilions
Allan Gardens Pavilion no.3 0 Sherbourne Street, Toronto ON Pavilions
Toronto Arts Building 0 Victoria Street, Toronto ON Performing Arts Centres
Ryerson Athletic Facilities 0 Gerrard Street East, Toronto ON Physical Education Buildings
Mail Building 0 King Street West, Toronto ON Post Offices
Nortwind Toronto District Cooling Project 0 Front Street West, Toronto ON Power Plants
Lakeview Generating Station 0 Lake Shore Boulevard West, Toronto ON Power Plants
Bishop Strachan School 0 College Street, Toronto ON Private Schools
Upper Canada College 0 King Street West, Toronto ON Private Schools
Peter Pan Statue 0 Avenue Road, Toronto ON Public Sc

Unnamed: 0,Address:,Architects:,Building completed,Demolished,Name of Building:,Notes:,Type of Building:
1,"63 Walker Avenue, Toronto ON",,,,Hunt's Bakery,HUnt's Bakery operated here from 1928-1979. S...,Bakeries
3,"420 Bloor Street East, Toronto ON",Lyle John,1911.0,,Toronto Dominion Bank,,Banks
4,"539 Bloor Street West, Toronto ON",Horsburgh V.C.,1914.0,,Canadian Bank of Commerce,Now houses Pauper's Pub.,Banks
5,"1129 Bloor Street West, Toronto ON",Taylor Hazell Architects (firm),,,Canadian Bank of Commerce,,Banks
6,"363 Broadview Avenue, Toronto ON",Lyle John,1911.0,,Bank of Nova Scotia,,Banks


In [38]:
#bld_df[bld_df['Demolished'].isnull() == False]
# 208 demolished buildings -- for our purposes, probably no benefit in keeping

In [41]:
# List the distinct building types present in bld_df.
bld_df['Type of Building:'].unique()

array(['Bakeries', 'Banks', 'Book Stores', 'Breweries', 'Bridges',
       'Broadcasting Stations', 'Bus Terminals', 'Cabins', 'Campuses',
       'Cemeteries', 'Chapels', 'Churches', 'City Blocks', 'City Halls',
       'Clothing Stores', 'Clubhouses', 'Coach Houses', 'Colleges',
       'Comfort Stations', 'Community Centres', 'Concert Halls',
       'Condominiums', 'Convention Centres', 'Corporate Headquarters',
       'Cottages', 'Courthouses', 'Cultural Centres', 'Dairies',
       'Day Care Centres', 'Department Stores', 'Design Centres',
       'Dormitories', 'Elementary Schools', 'Entertainment Buildings',
       'Exhibition Buildings', 'Extended Care Facilities', 'Factories',
       'Film Studios', 'Fire Stations', 'Gardens', 'Gasworks', 'Gates',
       'Government Office Buildings', 'Grain Elevators', 'Greenhouses',
       'Grocery Stores', 'Hardware Stores', 'High Schools', 'Hospitals',
       'Hostels', 'Hotels', 'Houses', 'Housing',
       'Hydro-Electric Stations', 'Jails', 'L

In [50]:
# Building categories earmarked for removal (not applied in this cell).
cats_to_drop = ['Subway Stations']

# Preview the rows whose building type is 'Houses'.
bld_df.loc[bld_df['Type of Building:'] == 'Houses']

Unnamed: 0,Address:,Architects:,Building completed,Demolished,Name of Building:,Notes:,Type of Building:
893,"15 Admiral Road, Toronto ON",Todd David,1891,,15 Admiral Road,Twinned double house also includes no.13 Admir...,Houses
897,"41 Alcina Avenue, Toronto ON",,,,York Wilson House,,Houses
907,"288 Annette Street, Toronto ON",Knox Elliot & Jarvis (firm),1899,,Birches (The),Built for Theodore Heintzman.,Houses
908,"336 Annette Street, Toronto ON",,c1910,,Oaklands (The),Former owners were William Stewart and then Ro...,Houses
915,"125 Aspenwood Drive, Toronto ON",,c1830s,,John Duncan House,,Houses
919,"2 Ava Crescent, Toronto ON",Biriukova Alexandra,,,Lawren Harris House,,Houses
922,"34 Avondale Road, Toronto ON",,1834,,Elihu Pease House,,Houses
925,"200 Baldwin Avenue, Toronto ON",Marecheaux Otis\r\nPeacock Gregory\r\nWinton T...,1993-1994,,Kensington Tower,,Houses
931,"0 Bay Street, Toronto ON",,1829,,Clover Hill,Built by John Elmsley,Houses
932,"0 Bay Street, Toronto ON",,c1837,,Elmsley Villa,Built for John Elmsley. Functioned as the fou...,Houses


In [60]:
# Open an ORM session: connect_db comes from the project's models module,
# sessionmaker from sqlalchemy.orm.
db=connect_db() #establish connection
Session = sessionmaker(bind=db)  # session factory bound to db
session = Session()  # session handed to save_to_database_ORM below

In [61]:
# Maps scraped DataFrame column names to the keyword names that
# save_to_database_ORM passes to PointsOfInterest.
df_to_db_map = dict([
    ('Name of Building:', 'name'),
    ('Building completed', 'build_year'),
    ('Demolished', 'demolished_year'),
    ('Address:', 'address'),
    ('Bld_link', 'external_url'),
    ('Notes:', 'details'),
    ('Type of Building:', 'poi_type'),
])

In [63]:
# Write the buildings to the database only when populate_db is switched on.
if populate_db:
    save_to_database_ORM(session, bld_df)