In [1]:
import pandas as pd
import numpy as np
import requests
import sqlite3
import datetime
import re

In [2]:
# Columns selected from each MLS CSV export when it is loaded.
desired_columns = [
    "ListingID",
    "SubType",
    "MLSArea",
    "YrBuilt",
    "AcLSqft",
    "ViewYN",
    "PoolPrivateYN",
    "BAC",
]

In [3]:
def load_mls_export(csv_path):
    """Read one MLS CSV export and keep only the columns named in desired_columns."""
    return pd.read_csv(csv_path)[desired_columns]


single_fam_homes = load_mls_export("data/MLS_SINGLEFAMILYHOMES.csv")
not_sfh = load_mls_export("data/MLS_NOT_SFH.csv")


Clean up the raw features: strip the first two characters from each listing ID so it matches the IDs in the other database, and parse the Acre/LotSquareFootage field so that only the lot square footage is kept.

In [4]:
def clean_mls(x):
    """Return the listing ID with its first two characters removed."""
    return x[2:]


# Caution: the slice is unconditional, so running this cell a second time
# removes two more characters from every ListingID.
for listings in (single_fam_homes, not_sfh):
    listings["ListingID"] = listings["ListingID"].apply(clean_mls)

In [5]:
def clean_aclsqft(x):
    """Convert an AcLSqft string to an int.

    Only the text after the last "/" is used, and any "," thousands
    separators are removed before the int() conversion.
    """
    lot_sqft_text = x.split("/")[-1]
    return int(lot_sqft_text.replace(",", ""))


# Caution: after this runs the column holds ints, which have no .split(),
# so re-running the cell raises AttributeError.
single_fam_homes["AcLSqft"] = single_fam_homes["AcLSqft"].apply(clean_aclsqft)
not_sfh["AcLSqft"] = not_sfh["AcLSqft"].apply(clean_aclsqft)

Binary-encode the yes/no features (notable view, private pool on the property): "Y" becomes 1 and any other value becomes 0.

In [6]:
# Recode ViewYN in both frames: exactly "Y" becomes 1, every other value
# (including missing values) becomes 0.
for listings in (single_fam_homes, not_sfh):
    listings["ViewYN"] = listings["ViewYN"].apply(lambda flag: int(flag == "Y"))

In [7]:
def _pool_flag(value):
    """Return 1 when value is exactly "Y", otherwise 0."""
    if value == "Y":
        return 1
    return 0


single_fam_homes["PoolPrivateYN"] = single_fam_homes["PoolPrivateYN"].apply(_pool_flag)
not_sfh["PoolPrivateYN"] = not_sfh["PoolPrivateYN"].apply(_pool_flag)

Some YrBuilt values carry an "/EST" suffix; keep only the year that precedes the slash.

In [8]:
def parse_yrBuilt(x):
    """Return the year portion of a YrBuilt value as an int.

    The text before the first "/" is converted with int(), so both "1989"
    and "1989/EST" yield 1989.

    Parameters
    ----------
    x : object
        Raw cell value.

    Returns
    -------
    int or object
        The parsed year, or ``x`` unchanged when it cannot be parsed
        (for example a number, NaN, None, or non-numeric text).
    """
    try:
        return int(x.split("/")[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x has no .split() (already numeric, NaN, None).
        # TypeError: .split() or int() rejected the argument type.
        # ValueError: the text before "/" is not an integer.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return x

In [9]:
# Run parse_yrBuilt over the YrBuilt column of both frames.
for listings in (single_fam_homes, not_sfh):
    listings["YrBuilt"] = listings["YrBuilt"].apply(parse_yrBuilt)

In [10]:
# Raw string: in a plain string literal "\d" is an invalid escape sequence,
# which newer Python versions warn about. The compiled regex is unchanged.
pattern = re.compile(r"([\d.]+)")


def clean_bac(x):
    """Return the first run of digits and dots found in ``x``, as a string.

    Raises
    ------
    AttributeError
        If ``x`` contains no digit or dot (the search returns None).
    TypeError
        If ``x`` is not a string.
    """
    return pattern.search(x).group(0)


single_fam_homes.BAC = single_fam_homes.BAC.apply(clean_bac)
not_sfh.BAC = not_sfh.BAC.apply(clean_bac)

In [11]:
from sqlite_api import MLSDatabase

In [12]:
# Instantiate the MLSDatabase helper imported from the project's sqlite_api module.
# NOTE(review): what the constructor connects to is not visible in this notebook — confirm in sqlite_api.
db = MLSDatabase()

In [13]:
# Create the database table(s) used for the MLS listings.
# The recorded output reports that table MLS_LISTINGS already existed and the call still
# completed, so it appears safe to repeat — TODO confirm against sqlite_api.
db.create_db()

Opened database successfully
NOTICE:  table MLS_LISTINGS already exists
Table created successfully


In [14]:
# Write both cleaned frames to the database via MLSDatabase.insert_dataframe.
# NOTE(review): the join output later in this notebook matches every house to two rows that
# differ only in one integer field (215 vs 797, 118 vs 700, 480 vs 1062 — each pair 582 apart;
# presumably the MLS_LISTINGS row id). That suggests this cell was executed more than once and
# that insert_dataframe appends without de-duplicating — confirm, and clear or rebuild the
# table before re-running.
db.insert_dataframe(single_fam_homes)
db.insert_dataframe(not_sfh)

Connected to slo_housing.db
Records created successfully
Connected to slo_housing.db
Records created successfully


In [15]:
from sqlite_api import SLOHouseDatabase

In [16]:
# Instantiate the SLOHouseDatabase helper imported from the project's sqlite_api module; it is used below to run queries.
dbquery = SLOHouseDatabase()

In [20]:
# Join HOUSES to MLS_LISTINGS on their shared MLS_ID column and display the result.
# Despite the method name, the recorded output is a list of tuples (one per joined row).
# NOTE(review): the whole result set is dumped and each house appears twice in it; consider
# limiting the query and checking MLS_LISTINGS for duplicate rows.
dbquery.select_row("SELECT * FROM HOUSES JOIN MLS_LISTINGS USING (MLS_ID)")

[(4,
  1073032,
  'San Luis Obispo',
  '5640 Pinehurst',
  4,
  3,
  945000.0,
  2600.0,
  363.0,
  '2016-11-20',
  215,
  'SFR/D',
  'SLO',
  1989,
  11566,
  1,
  0,
  2.5),
 (4,
  1073032,
  'San Luis Obispo',
  '5640 Pinehurst',
  4,
  3,
  945000.0,
  2600.0,
  363.0,
  '2016-11-20',
  797,
  'SFR/D',
  'SLO',
  1989,
  11566,
  1,
  0,
  2.5),
 (7,
  1073062,
  'Morro Bay',
  '3281 Tide',
  2,
  2,
  430000.0,
  891.0,
  482.0,
  '2016-11-19',
  118,
  'SFR/D',
  'MRBY',
  1972,
  2400,
  1,
  0,
  2.0),
 (7,
  1073062,
  'Morro Bay',
  '3281 Tide',
  2,
  2,
  430000.0,
  891.0,
  482.0,
  '2016-11-19',
  700,
  'SFR/D',
  'MRBY',
  1972,
  2400,
  1,
  0,
  2.0),
 (13,
  1073030,
  'Arroyo Grande',
  '1093 Grieb',
  2,
  3,
  369500.0,
  1312.0,
  281.0,
  '2016-11-18',
  480,
  'CONDO/D',
  'ARRG',
  1979,
  1498,
  1,
  0,
  2.5),
 (13,
  1073030,
  'Arroyo Grande',
  '1093 Grieb',
  2,
  3,
  369500.0,
  1312.0,
  281.0,
  '2016-11-18',
  1062,
  'CONDO/D',
  'ARRG',
  1979,