In [None]:
import numpy as np
import pandas as pd
import sqlite3
import calendar
import json
import matplotlib.path as mplPath

import shapefile
from neighbourhoods import hoods, cities

# Basic plan for a migration
1. Create a new database with the desired schema
2. Load the existing data into a dataframe
3. Apply changes to the dataframe
4. Write the new dataframe to the new database

In [None]:
# Source database: the existing records are read from this connection.
raw_data = sqlite3.connect('apartments.db')
# Cursor on the source database (not used by the cells visible in this notebook).
raw = raw_data.cursor()

# Destination database: the migrated records are written through this connection.
clean_data = sqlite3.connect('apartments_new.db')
# Cursor on the destination database; used to run the CREATE TABLE statement.
clean = clean_data.cursor()

In [None]:
def create_new_table(conn):
    """Create the ``apartments`` table with the target schema if it is absent.

    ``conn`` only needs an ``execute`` method, so either a sqlite3 connection
    or a cursor works. Nothing is committed here; the caller is expected to
    commit on the owning connection.
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS 'apartments' (
    'date' TEXT,
    'id' TEXT,
    'title' TEXT,
    'latitude' REAL, 
    'longitude' REAL,
    'address' TEXT,
    'date_available' TEXT,
    'price' REAL,
    'area' REAL,
    'neighbourhood' TEXT,
    'extras' TEXT,
    'bedrooms' REAL,
    'bathrooms' REAL,
    'unit_type' TEXT,
    'parking' TEXT,
    'smoking' BOOL,
    'pets' TEXT,
    'laundry' TEXT,
    'furnished' BOOL,
    'city' TEXT,
    'location' TEXT);"""
    conn.execute(ddl)
    
# Run the CREATE TABLE through the destination cursor, then commit on the
# owning connection so the new table is persisted.
create_new_table(clean)
clean_data.commit()

In [None]:
# Pull every row of the source `apartments` table into a dataframe.
df = pd.read_sql_query(sql="SELECT * FROM apartments", con=raw_data)

In [None]:
# select the city
# takes about 8 minutes

# Boundary polygons keyed by city name, built on first use. Constructing a
# Path from the coordinates is independent of the row being tested, so doing
# it once per city (instead of once per city per row) removes repeated work.
_city_paths = {}

def pick_city (x):
    """Return the name of the first city whose boundary contains the record.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide ``'longitude'`` and ``'latitude'`` entries.

    Returns
    -------
    The key from ``cities`` of the first polygon (in ``cities`` iteration
    order) that contains the point, or ``None`` when no polygon matches.
    """
    # The polygon vertices are tested against (longitude, latitude), in that order.
    point = (x['longitude'], x['latitude'])
    for city, coords in cities.items():
        boundary = _city_paths.get(city)
        if boundary is None:
            boundary = _city_paths[city] = mplPath.Path(coords.values)
        if boundary.contains_point(point):
            return city
    return None

# Select the city for all records
# 7.16s for 10,000 records
% time df['City'] = df.apply(pick_city,axis=1)

In [None]:
# select the neighbourhood
# if neighbourhood is selected from defined set, then set neighbourhood field
# otherwise report in location field

def pick_location(x):
    """Return the record's free-text neighbourhood value, lower-cased.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide a ``'neighbourhood'`` entry.

    Returns
    -------
    The lower-cased string when the value is a string; otherwise the value
    is returned unchanged (e.g. ``None`` or NaN for a missing entry).
    """
    location = x['neighbourhood']
    # Only strings can be lower-cased; missing values may arrive as None or
    # as a float NaN, and neither has a .lower() method.
    if isinstance(location, str):
        return location.lower()
    return location

# Boundary polygons keyed by neighbourhood name, built on first use so the
# same Path is not reconstructed for every record.
_hood_paths = {}

def get_neighbourhood(latitude,longitude):
    """Return the name of the first neighbourhood polygon containing the point.

    Parameters
    ----------
    latitude, longitude
        Coordinates of the record.

    Returns
    -------
    The key from ``hoods`` of the first polygon (in ``hoods`` iteration
    order) that contains the point, or ``None`` when nothing matches or the
    lookup fails.
    """
    # or, grab small from posting title text
    try:
        # The polygon files store vertices as (longitude, latitude).
        point = (longitude, latitude)
        for name, coords in hoods.items():
            boundary = _hood_paths.get(name)
            if boundary is None:
                boundary = _hood_paths[name] = mplPath.Path(coords.values)
            if boundary.contains_point(point):
                return name
    except Exception:
        # Best-effort lookup: a failed test is reported as "no neighbourhood".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt through, so a long-running apply can be stopped.
        return None
    return None
        
def pick_neighbourhood (x):
    """Row adapter: look up the neighbourhood from the row's coordinates."""
    return get_neighbourhood(x['latitude'], x['longitude'])

pick_location(df.iloc[0])
% time df['location'] = df.apply(pick_location,axis=1)

#pick_neighbourhood(df.iloc[0])
df['neighbourhood'] = df.apply(pick_neighbourhood,axis=1)

In [None]:
def furnished (x):
    """Return ``True`` when the row's ``'furnished'`` value equals 1, else ``False``."""
    return True if x['furnished'] == 1 else False

% time df['furnished'] = df.apply(furnished,axis=1)


In [None]:
# Write the transformed dataframe to the `apartments` table of the destination
# database, without the dataframe index.
# NOTE(review): per the pandas docs, if_exists='replace' drops an existing
# table before inserting, so the schema created by create_new_table would be
# discarded in favour of column types inferred from the dataframe. Confirm
# whether 'append' was intended.
df.to_sql("apartments",clean_data,if_exists='replace',index=False)

In [None]:
# Release both database handles; the source connection was previously left open.
clean_data.close()
raw_data.close()