# Web Scrape labelled images of building styles

# Architectural Index for Ontario - Archindont
* Archindont is a database of architectural information and citations to periodical articles and books about buildings in Toronto.
* List of building types: http://archindont.torontopubliclibrary.ca/Arch/search.do;jsessionid=jzKVTS6juZ65uhz4EA9tv9K7
* Main page: http://archindont.torontopubliclibrary.ca/Arch/main.do

In [2]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import string
import re
import urllib
import os
import time
from datetime import datetime
import string

In [3]:
from sqlalchemy import create_engine, Column, Integer, String, Sequence, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from models import connect_db, PointsOfInterest, ArchitecturalStyles, Architects,POICategories

In [4]:
# Search by building type; the scraper appends an initial letter to this URL
main_page = 'http://archindont.torontopubliclibrary.ca/Arch/search.do?searchType=Typ&initial='
# style_url = "http://www.acotoronto.ca/search_buildingsDB_d2.php"
site_root = "http://archindont.torontopubliclibrary.ca"

# Run switches
rerun_webscrape = False  # rerun all webscraping
populate_db = False  # repopulate database
debug = False

# Accumulates one dictionary per scraped building
buildings_list = []

In [3]:
def load_page(url, timeout=30):
    '''
    Downloads a page and returns its HTML text

    url: address of the page to fetch
    timeout: seconds to wait for the server; without it a stalled connection would
             block the whole crawl indefinitely

    Returns the response body as text when the status code is 200.
    For any other status code, prints the code and returns None.
    '''
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text

    print(f"Error connecting: status code {response.status_code}")
    return None

In [4]:
def _empty_building_record():
    '''Returns a fresh building record with every scraped field unset (None)'''
    return {
        'Address:': None,
        'Type of Building:': None,
        'Name of Building:': None,
        'Notes:': None,
        'Building completed': None,
        'Demolished': None,
        'Architects:': None,
    }


def _collect_buildings(cell_texts):
    '''
    Groups a flat sequence of table cell texts into one dictionary per building

    A cell whose stripped text equals one of the record keys is treated as a label,
    and the cell that follows it is stored as that label's value.
    An 'Address:' label starts a new building once the current one already has an address.
    '''
    buildings = []
    record = _empty_building_record()
    pending_label = None  # label whose value is expected in the next cell

    for raw_text in cell_texts:
        text = raw_text.strip()

        if text == 'Address:' and record['Address:'] is not None:
            # a second address means the previous building is complete
            buildings.append(record)
            record = _empty_building_record()

        if pending_label is not None:
            record[pending_label] = text
            pending_label = None

        if text in record:
            pending_label = text

    # The final building has no following 'Address:' label to flush it,
    # so it has to be written out here or it would be lost.
    if any(value is not None for value in record.values()):
        buildings.append(record)

    return buildings


def building_types_pages(type_url):
    '''
    Returns list of dictionaries where each dictionary is info about one building
    To Test: /Arch/buildingType.do;jsessionid=A2ZMTUjOxuCFc4yaAxah-Nws?type=Apartment+Houses&typeID=18'

    type_url: link to a building type page, relative to site_root

    Each dictionary has the keys 'Address:', 'Type of Building:', 'Name of Building:',
    'Notes:', 'Building completed', 'Demolished' and 'Architects:'; fields that were
    not found on the page are None.
    Returns an empty list when the page could not be loaded.
    '''
    type_url = f"{site_root}/{type_url}"
    html = load_page(type_url)
    if html is None:
        # load_page gives back nothing when the status code is not 200
        return []

    soup = BeautifulSoup(html)
    cell_texts = (
        tag.text
        for tag in soup.recursiveChildGenerator()
        if tag.name in ['th', 'td']
    )
    return _collect_buildings(cell_texts)

In [23]:
def run_webscript(letters='Z', delay_seconds=5):
    '''
    Scrapes the building type pages listed under each initial letter

    letters: iterable of initial letters to crawl. The default 'Z' keeps the value that
             was previously hard-coded (string.ascii_uppercase[25]); pass
             string.ascii_uppercase to crawl the whole index.
    delay_seconds: pause after each building type page is scraped

    Scraped buildings are appended to the module-level buildings_list.
    After each letter the accumulated list is written to a timestamped csv in ../data.
    A letter whose index page cannot be loaded, or has no results form, is reported and skipped.
    '''
    for char in letters:
        letter_url = f"{main_page}{char}"
        html = load_page(letter_url)
        if html is None:
            print(f"Skipping letter {char}: page could not be loaded")
            continue

        soup = BeautifulSoup(html)
        results_form = soup.find('form', {'name': 'SearchResultsForm'})
        if results_form is None:
            print(f"Skipping letter {char}: no SearchResultsForm found")
            continue

        for style in results_form.findChildren('li'):
            link = style.find('a')
            bld_type_url = link.get('href') if link is not None else None
            if not bld_type_url:
                # list item without a usable link, nothing to follow
                continue
            buildings_list.extend(building_types_pages(bld_type_url))
            time.sleep(delay_seconds)

        # buildings_list is cumulative, so each file holds everything scraped so far
        bld_df = pd.DataFrame(buildings_list)
        bld_df.to_csv('../data/archidont_buildings_' + str(char) + str(round(time.time(), 0)) + '.csv')


In [None]:
def save_to_database_ORM(session, df):
    '''
    Saves scraped data to database using SqlAlchemy ORM

    session: SqlAlchemy session used to add and commit the entries
    df: dataframe of scraped buildings. Columns listed in df_to_db_map are copied to the
        PointsOfInterest entry (null values are left out); the 'Architects:' column
        supplies the architects.

    One PointsOfInterest is created and committed per dataframe row, with one Architects
    entry per architect attached through poi.architects.
    The relationship between these tables is defined in models.py, so it automatically
    populates the poi_id column in the child tables with the poi_id of the main entry.
    No architectural style is saved, there is no style for Archindont buildings.

    If a commit fails the session is rolled back and the error is raised again.
    '''
    for _, row in df.iterrows():
        poi_dict = {
            df_to_db_map[column]: value
            for column, value in row.items()
            if column in df_to_db_map and not pd.isnull(value)
        }
        poi = PointsOfInterest(**poi_dict)

        # architects (can be multiple)
        architects_cell = row['Architects:']
        if not pd.isnull(architects_cell):
            prev_company = ""
            # split multiple architects on \r\n
            for company in architects_cell.split('\r\n'):
                company = company.strip()
                if not company or company == prev_company:
                    # skip blank entries and immediate repeats of the same name
                    continue
                # The ORM sends values as bound parameters, so the name is stored as is;
                # doubling apostrophes by hand would save "O''Brien" instead of "O'Brien".
                poi.architects.append(Architects(architect_name=company))
                prev_company = company

        session.add(poi)
        try:
            session.commit()
        except Exception:
            # keep the session usable for the caller, then surface the error
            session.rollback()
            raise

In [59]:
# Reuse the saved csv unless a fresh scrape was requested
if not rerun_webscrape:
    bld_df = pd.read_csv('../data/archidont_buildingsB_Z.csv', index_col=0)
else:
    run_webscript()
    bld_df = pd.DataFrame(buildings_list)
    # keep a timestamped copy of everything that was scraped
    bld_df.to_csv(f"../data/archidont_buildings_{round(time.time(), 0)}.csv")
bld_df.head()

Unnamed: 0,Address:,Architects:,Building completed,Demolished,Name of Building:,Notes:,Type of Building:
0,0 Laughton Avenue,,1929.0,c2000,Jane Parker Bakery,Will be demolished to make way for Jane Parker...,Bakeries
1,63 Walker Avenue,,,,Hunt's Bakery,HUnt's Bakery operated here from 1928-1979. S...,Bakeries
2,165 Avenue Road,Lyle John,1930.0,,,,Banks
3,420 Bloor Street East,Lyle John,1911.0,,Toronto Dominion Bank,,Banks
4,539 Bloor Street West,Horsburgh V.C.,1914.0,,Canadian Bank of Commerce,Now houses Pauper's Pub.,Banks


In [60]:
# Establish the database connection and open an ORM session bound to it
db = connect_db()
Session = sessionmaker(bind=db)
session = Session()

In [61]:
# Maps scraped dataframe column names to the PointsOfInterest field they are saved in
df_to_db_map = {
    'Name of Building:': 'name',
    'Building completed': 'build_year',
    'Demolished': 'demolished_year',
    'Address:': 'address',
    'Bld_link': 'external_url',
    'Notes:': 'details',
    'Type of Building:': 'poi_type',
}

In [63]:
# Write the buildings to the database only when repopulating was requested
if populate_db:
    save_to_database_ORM(session=session, df=bld_df)