In [1]:
# %load house_scaper.py
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import sqlite3
import datetime

df = pd.DataFrame({"MLS": [],"Street": [], "City":[],"ListPrice":[],"Bedrooms":[],"Bathrooms":[],"SqFt":[],"Date":[],  "Price/SqFt":[]})
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October",
              "November", "December"]
month_map = {key: int(val) for key, val in zip(months, range(1, 13))}


def sqlize_string(string):
    return "'" + string.strip() + "'"

def get_date(string):
    chunked = string.split()
    return sqlize_string(str(datetime.date(2016, month_map[chunked[0]], int(chunked[1]))))

def scraping(dataframe):
    page = requests.get("http://www.slocountyhomes.com/newlistex.php")
    data = BeautifulSoup(page.text, "html.parser")
    hdrs = ["Bathrooms", "Bedrooms", "City", "Date", "List Price", "MLS", "Price/SqFt", "SqFt", "Street"]
    idx_map = { hdr:idx for hdr, idx in zip(hdrs, range(len(hdrs))) }


    table_rows = data.find_all('tr')
    # print(table_rows)
    listing_date = ""
    for row in table_rows:
        row_entry = [0] * len(hdrs)
        cells = row.find_all("td", recursive=True)
        if len(cells) == 1:
            listing_date = cells[0].text.strip()
            assert listing_date != ""

        elif 0 < len(cells) <= 8 and len(cells) != 3:
            ## CELL ORDER -->   MLS #	Street	City	List Price	Beds	Baths	Sq Footage
            row_entry[idx_map["MLS"]] = int(cells[0].text.strip())
            row_entry[idx_map["Street"]] = sqlize_string(cells[1].text.strip())
            row_entry[idx_map["City"]] = sqlize_string(cells[2].text.strip())
            row_entry[idx_map["List Price"]] = int(cells[3].text.strip()[1:].replace(",", ""))
            row_entry[idx_map["Bedrooms"]] = int(cells[4].text.strip())
            row_entry[idx_map["Bathrooms"]] = int(cells[5].text.strip())
            try: # handle missing Sq footage
                row_entry[idx_map["SqFt"]] = int(cells[6].text.strip())
                row_entry[idx_map["Price/SqFt"]] = row_entry[idx_map["List Price"]] / row_entry[
                    idx_map["SqFt"]]
            except ValueError:
                row_entry[idx_map["SqFt"]] = -1
            row_entry[idx_map["Date"]] = get_date(listing_date)
            ## append this row to dataframe
            # print(row_entry)
            dataframe.loc[len(dataframe)] = row_entry
    ## data integrity
    dataframe.drop(dataframe[dataframe.SqFt == -1].index, inplace=True)
    dataframe.drop_duplicates(inplace=True)
    return dataframe


In [2]:
houses = scraping(df)

In [3]:
houses.head()

Unnamed: 0,Bathrooms,Bedrooms,City,Date,ListPrice,MLS,Price/SqFt,SqFt,Street
0,1.0,2.0,'Atascadero','2016-11-23',75000.0,1073124.0,116.640747,643.0,'1242 Camino Del Robles'
1,2.0,2.0,'Atascadero','2016-11-23',185000.0,1073132.0,171.296296,1080.0,'1228 Camino Del Robles'
2,2.0,2.0,'Atascadero','2016-11-23',190000.0,1073134.0,172.570391,1101.0,'1207 Fallen Leaf'
3,2.0,3.0,'Atascadero','2016-11-23',440000.0,1073096.0,248.587571,1770.0,'4100 Nogales'
4,2.0,3.0,'Lompoc','2016-11-23',329900.0,1073121.0,259.355346,1272.0,'812 6th'


In [4]:
from sqlite_api import SLOHouseDatabase

In [5]:
slo_houses = SLOHouseDatabase()

In [6]:
slo_houses.insert_dataframe(houses)

Connected to slo_housing.db
Records created successfully


In [16]:
slo_houses.get_dataframe_from_query("SELECT * FROM HOUSES ORDER BY LIST_DATE DESCENDING")

DatabaseError: Execution failed on sql 'SELECT * FROM HOUSES ORDER BY LIST_DATE DESCENDING': near "DESCENDING": syntax error

Nov 24 -- 1180 rows

sample database to see correct schema

### add dataset from dekhtyar to HOUSING table

In [7]:
master = pd.read_csv("data/HousingSLO-Master.csv")[["Bathrooms", "Bedrooms", "City", "Date", "List Price", "MLS", "Price/SqFt", "SqFt", "Street"]]

standardize date format in DATE column

In [8]:
def parse_date(date):
    if len(date.split()) > 1:
        return get_date(date)
    else:
        chunked = date.split("/")
        return sqlize_string( str(datetime.date(2016, int(chunked[0]), int(chunked[1]))) )

In [9]:
master.Date = master.Date.apply(parse_date)
master.City = master.City.apply(sqlize_string)
master.Street = master.Street.apply(sqlize_string)

In [10]:
master.head()

Unnamed: 0,Bathrooms,Bedrooms,City,Date,List Price,MLS,Price/SqFt,SqFt,Street
0,3,3,'Lompoc','2016-10-23',370000,1072327,223.970944,1652,'629 Northbrook'
1,2,3,'Lompoc','2016-10-23',389000,1072320,217.197097,1791,'1249 Westbrook'
2,2,3,'Lompoc','2016-10-23',518000,1072319,219.491525,2360,'247 Brisa Del Mar'
3,2,2,'Morro Bay','2016-10-23',550000,1072317,563.52459,976,'2760 Cedar'
4,3,3,'Nipomo','2016-10-23',699900,1072288,301.032258,2325,'1007 Jacqueline'


In [11]:
slo_houses.insert_dataframe(master)

Connected to slo_housing.db
Records created successfully


In [12]:
results = slo_houses.get_dataframe_from_query("SELECT * FROM HOUSES JOIN MLS_LISTINGS USING (MLS_ID)")

In [13]:
results

Unnamed: 0,ID,MLS_ID,CITY,ADDRESS,BED,BATH,LIST_PRICE,SQ_FOOTAGE,PRICE_PER_SQFT,LIST_DATE,ID.1,SUBTYPE,AREA,YR_BUILT,LOT_SQFT,VIEW,POOL,ARB_COMISSION
0,6111,1069605,Arroyo Grande,4410 Upper Lopez Canyon,1,1,345000.0,720.0,479.0,2016-10-13,1,SFR/D,ARRG,1985,1764842,1,0,3.50
1,5878,1070362,Arroyo Grande,345 Tiger Tail,2,2,379000.0,1312.0,288.0,2016-10-21,4,SFR/A,ARRG,1976,3999,1,0,2.50
2,6092,1069663,Arroyo Grande,1164 Pacific Pointe,3,2,449000.0,1326.0,338.0,2016-10-13,5,SFR/A,ARRG,1989,3999,1,0,2.25
3,4317,1072933,Arroyo Grande,306 Hondonada,2,2,519000.0,1746.0,297.0,2016-11-15,10,SFR/D,ARRG,0,206309,1,0,2.50
4,4318,1072919,Arroyo Grande,327 Corona Del Terra,3,2,560000.0,1608.0,348.0,2016-11-15,15,SFR/D,ARRG,1975,7802,1,0,3.00
5,5900,1072029,Arroyo Grande,516 Starlight,3,2,619000.0,1587.0,390.0,2016-10-13,17,SFR/D,ARRG,1999,5232,1,0,2.50
6,5554,1072274,Arroyo Grande,658 Woodland,3,2,589000.0,1615.0,364.0,2016-10-20,18,SFR/D,ARRG,1976,8590,0,0,2.25
7,5690,1070393,Arroyo Grande,850 StageCoach,3,2,699000.0,1656.0,422.0,2016-10-24,21,SFR/D,ARRG,2004,108900,1,0,2.50
8,5852,1070568,Arroyo Grande,101 Equestrian,3,2,729900.0,2127.0,343.0,2016-10-21,22,SFR/D,ARRG,0,17533,1,0,2.50
9,6327,1067335,Arroyo Grande,331 StageCoach,3,3,775000.0,2621.0,295.0,2016-10-09,24,SFR/D,ARRG,1980,8999,1,0,2.50


Nov 24 -- 229 rows