# Data Loader
* loads csv file, expected to have set structure of columns even if not all used
* cleans data:
*   1) find and remove duplicates
*   2) clean up addresses
*   3) get lats/longs
*   4) calculate additional columns
* uploads to db

In [1]:
from sqlalchemy.orm import sessionmaker
from models import connect_db, PointsOfInterest, ArchitecturalStyles, Architects,POICategories
import pandas as pd
import re

In [2]:
from config import BaseConfig
from utils import *

In [3]:
import logging
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(filename='../logs/initdataload.log',level=logging.INFO)

In [4]:
def make_simple_poi(org_type):
    if org_type == 'Plaque':
        return org_type
    elif org_type == 'Monument':
        return 'Art'
    else:
        return 'Building'


In [5]:
def clean_build_year(year):
    if pd.isna(year) or year == None or len(year) < 4:
        return ''
    strip_words = ['unknown', 'circa ', 'abt ', 'about']
    for word in strip_words:
        year=year.replace(word, '')
    return year[0:4]

In [6]:
# try to find points outside of Toronto
def find_points_outside_TO(df, fix_address=False, dist=50, starting_lat=43.656287,starting_long= -79.380898):
    ''' Find points more than 50KM from downtown Toronto (Yonge Dundas Square is default) and try to update'''
    df=find_dist(df, starting_lat, starting_long)
    for ix, row in df[df['dist_start']>dist].iterrows():
        #print(f"{row['poi_id'], row['name']} is outside of Toronto")
        logging.debug(f"{row['poi_id'], row['name']} is outside of Toronto")
        #update_coords(row['poi_id'], fix_address)
        lat, long = get_lats_longs(row)
        df_poi.loc[index, 'latitude']= lat
        df_poi.loc[index, 'longitude']= long
    return df

In [7]:
def add_features(df):
    df['cleaned_year']=df['build_year'].apply(lambda x: clean_build_year(x))
    df['cleaned_year']=pd.to_numeric(df['cleaned_year'],errors='coerce',downcast='integer')
    df['build_decade']= df['cleaned_year'].apply(lambda x: x//10*10 )
    df['poi_type_simple'] = df['poi_type'].apply(lambda x: make_simple_poi(x))
    return df

In [8]:
def load_init_data():
    df_poi = pd.read_csv('../init_data/pois.csv' )
    df_architects = pd.read_csv('../init_data/architects.csv' )
    df_cats = pd.read_csv('../init_data/poi_cats.csv' )
    df_styles= pd.read_csv('../init_data/architectural_styles.csv' )
    return df_poi, df_architects, df_cats, df_styles


In [11]:
def data_clean_up(df_poi):
    '''
    Load from CSV
    Drop duplicates
    Clean up address
    Find missing lat/long coords
    Find spots outside Toronto and 
    add new features
    '''
    #df_poi=load_init_data()
    df_poi=df_poi.drop_duplicates(subset=['name', 'address','source','external_url'], keep='last')
    df_poi['address']=df_poi.apply(lambda row: cleanup_address(row['name'], row['address'], logging), axis=1)
    for index, row in df_poi[pd.isna(df_poi['latitude']) | pd.isna(df_poi['longitude'])].iterrows():
        lat, long = get_lats_longs(row)
        df_poi.loc[index, 'latitude']= lat
        df_poi.loc[index, 'longitude']= long
    df_poi=find_points_outside_TO(df_poi)
    df_poi = add_features(df_poi)
    return df_poi

In [12]:
df_poi, df_architects, df_cats, df_styles=load_init_data()
df_poi=data_clean_up(df_poi)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['dist_start'] = avail_points
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.

In [14]:
df_poi.head()

Unnamed: 0,poi_id,name,build_year,demolished_year,address,latitude,longitude,external_url,image_url,heritage_status,current_use,poi_type,source,details,dist_start,cleaned_year,build_decade,poi_type_simple
1,2,"Walter Seymour Allward, R.C.A. 1876 - 1955",345.0,,"43 Amelia Street, Toronto, ON, Canada",43.667614,-79.366896,http://www.cabbagetownpeople.ca/person/walter-...,,other,,,http://www.cabbagetownpeople.ca,,1.690953,,,Building
2,4,"Walter Seymour Allward, R.C.A. 1876 - 1955",,,"137 Amelia Street, Toronto, ON, Canada",43.668273,-79.363974,http://www.cabbagetownpeople.ca/person/walter-...,,,,,http://www.cabbagetownpeople.ca,,1.907115,,,Building
3,7368,Betty Oliphant 1918 - 2004,,,"137 Amelia Street, Toronto, ON, Canada",43.66824,-79.36388,http://www.cabbagetownpeople.ca/person/betty-o...,,,,Plaque,http://www.cabbagetownpeople.ca,World-Renowned Innovator of Ballet Education. ...,1.909954,,,Plaque
4,7370,"Dr. Mary O'Riordan, D.V.M. 1925 - 1993",,,"160 Amelia Street, Toronto, ON, Canada",43.66887,-79.36299,http://www.cabbagetownpeople.ca/person/mary-or...,,,,Plaque,http://www.cabbagetownpeople.ca,"Pioneering Veterinarian. Born in Ennis, Count...",2.010212,,,Plaque
5,7371,Roger Abbott 1946 - 2011,,,"132 Amelia Street, Toronto, ON, Canada",43.66847,-79.36441,http://www.cabbagetownpeople.ca/person/roger-a...,,,,Plaque,http://www.cabbagetownpeople.ca,"Comedian and Actor, Co-Founder of the Royal Ca...",1.897606,,,Plaque


In [None]:
# need to stitch together different dfs

In [33]:
df_to_db_map={
    'name':'name',
    'build_year':'build_year'   ,
    'demolished_year' :'demolished_year',
    'address' :'address' ,
    'external_url':'external_url',
    'details': 'details',
    'image_url':'image_url',
    'heritage_status':'heritage_status',
    'current_use use':'current_use',
    'poi_type':'poi_type',
    'source': 'source', 
    'cleaned_year': 'build_year_clean',
    'build_decade': 'build_decade',
    'poi_type_simple': 'poi_type_simple',
    'latitude': 'latitude',
    'longitude': 'longitude'
}

In [34]:
df_poi.columns

Index(['poi_id', 'name', 'build_year', 'demolished_year', 'address',
       'latitude', 'longitude', 'external_url', 'image_url', 'heritage_status',
       'current_use', 'poi_type', 'source', 'details', 'dist_start',
       'cleaned_year', 'build_decade', 'poi_type_simple'],
      dtype='object')

In [59]:
def save_to_database_ORM(df):
    '''
    Saves scraped data to database using SqlAlchemy ORM
    Updates three tables: points_of_interest, archtectural_styles, architects
    The relationship between these tables is defined in models.py, so it automatically populates the poi_id column
    in the child tables with the poi_id of the main entry 
    '''
    
    for index, row in df.iterrows():
        
        poi_dict ={df_to_db_map[k]:v for k, v in row.items() if k in df_to_db_map.keys() and not pd.isnull(v)}
        #poi_dict['source']= site_root
        poi = PointsOfInterest(**poi_dict )
        old_poi_id =row['poi_id']
        
        # define style
        for ix2, astyle in df_styles[df_styles['poi_id']==old_poi_id].iterrows():
            #tyle=ArchitecturalStyles(style=row['Style'])
            style=ArchitecturalStyles(style=astyle['style'])
            poi.styles.append(style)
            
        for ix2, acat in df_cats[df_cats['poi_id']==old_poi_id].iterrows():
            cat = POICategories(category =acat['category'])
            poi.categories.append(cat)
            
        # architects (can be multiple)
        prev_company=""
        for ix2, anarct in df_architects[df_architects['poi_id']==old_poi_id].iterrows():
            if anarct['architect_name'] != prev_company and not 'Also see' in anarct['architect_name']:
                architect = Architects(architect_name= anarct['architect_name'].replace("'","''"))
                poi.architects.append(architect)
                prev_company=anarct['architect_name']
                print (anarct['architect_name'])
        print(poi.styles)
#         session.add(poi)
#         session.commit()

In [60]:
df_architects[df_architects['poi_id']==218]

Unnamed: 0,poi_id,architect_name
36,218,"Bruce Kuwabara, Thomas Payne, Marianne McKenna..."
37,218,"S.H. Maw, Associates"
38,218,George and Moorehouse


In [61]:
df_cats[df_cats['poi_id']==5]

Unnamed: 0,poi_id,category


In [66]:
save_to_database_ORM(df_poi[df_poi['poi_id']==8])

[<models.ArchitecturalStyles object at 0x000001772D4076D8>]


In [67]:
df_poi[df_poi['poi_id']!=8]

Unnamed: 0,poi_id,name,build_year,demolished_year,address,latitude,longitude,external_url,image_url,heritage_status,current_use,poi_type,source,details,dist_start,cleaned_year,build_decade,poi_type_simple
1,2,"Walter Seymour Allward, R.C.A. 1876 - 1955",345,,"43 Amelia Street, Toronto, ON, Canada",43.667614,-79.366896,http://www.cabbagetownpeople.ca/person/walter-...,,other,,,http://www.cabbagetownpeople.ca,,1.690953,,,Building
2,4,"Walter Seymour Allward, R.C.A. 1876 - 1955",,,"137 Amelia Street, Toronto, ON, Canada",43.668273,-79.363974,http://www.cabbagetownpeople.ca/person/walter-...,,,,,http://www.cabbagetownpeople.ca,,1.907115,,,Building
3,7368,Betty Oliphant 1918 - 2004,,,"137 Amelia Street, Toronto, ON, Canada",43.668240,-79.363880,http://www.cabbagetownpeople.ca/person/betty-o...,,,,Plaque,http://www.cabbagetownpeople.ca,World-Renowned Innovator of Ballet Education. ...,1.909954,,,Plaque
4,7370,"Dr. Mary O'Riordan, D.V.M. 1925 - 1993",,,"160 Amelia Street, Toronto, ON, Canada",43.668870,-79.362990,http://www.cabbagetownpeople.ca/person/mary-or...,,,,Plaque,http://www.cabbagetownpeople.ca,"Pioneering Veterinarian. Born in Ennis, Count...",2.010212,,,Plaque
5,7371,Roger Abbott 1946 - 2011,,,"132 Amelia Street, Toronto, ON, Canada",43.668470,-79.364410,http://www.cabbagetownpeople.ca/person/roger-a...,,,,Plaque,http://www.cabbagetownpeople.ca,"Comedian and Actor, Co-Founder of the Royal Ca...",1.897606,,,Plaque
6,7372,Gordon Sinclair 1900 - 1984,,,"327-355 Carlton Street, Toronto, ON M5A 3W3, C...",43.665450,-79.362970,http://www.cabbagetownpeople.ca/person/gordon-...,,,,Plaque,http://www.cabbagetownpeople.ca,A Giant in Canadian Broadcast Journalism. One...,1.768519,,,Plaque
7,7374,"Allan Winton King, OC 1930 - 2009",,,"397 Carlton Street, Toronto, ON, Canada",43.665690,-79.361860,http://www.cabbagetownpeople.ca/person/allan-k...,,,,Plaque,http://www.cabbagetownpeople.ca,Filmmaker. Allan King was born in Vancouver B...,1.857316,,,Plaque
8,7376,Charles Sauriol 1904 - 1995,,,"Riverdale Park, Winchester Street, Toronto, ON...",43.666420,-79.366030,http://www.cabbagetownpeople.ca/person/charles...,,,,Plaque,http://www.cabbagetownpeople.ca,Pioneer Ecologist. The plaque was situated at...,1.644914,,,Plaque
9,7377,"Oronhyatekha, M.D. 1841 - 1907",,,"211 Carlton St, Toronto, ON, Canada",43.663700,-79.369630,http://www.cabbagetownpeople.ca/person/oronhya...,,,,Plaque,http://www.cabbagetownpeople.ca,"Mohawk Physician, Victorian Businessman and Ph...",1.226570,,,Plaque
10,2600,J. H. McKinnon House,1888,,"506 Jarvis Street Church-Wellesley Toronto, ON",43.667350,-79.379120,http://www.acotoronto.ca/show_building.php?Bui...,http://www.acotoronto.ca/tobuilt_bk/php/Buildi...,Heritage property,Residential,Detached house,http://www.acotoronto.ca/,,1.237500,1888.0,1880.0,Building
