In [32]:
# %load house_scaper.py
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import sqlite3
import datetime

# scraping() fills each row positionally in this (alphabetical) column order.
# pandas versions differ on whether dict keys are sorted or kept in insertion
# order when building a frame, so the order is spelled out explicitly here
# instead of relying on the constructor to sort the names.
LISTING_COLUMNS = ["Bathrooms", "Bedrooms", "City", "Date", "ListPrice", "MLS", "Price/SqFt", "SqFt", "Street"]
df = pd.DataFrame({column: [] for column in LISTING_COLUMNS})

# English month names in calendar order; month_map turns a name into 1..12.
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October",
              "November", "December"]
month_map = {name: number for number, name in enumerate(months, start=1)}


def sqlize_string(string):
    """Return ``string`` as a single-quoted SQL string literal.

    Leading and trailing whitespace is stripped. Embedded single quotes are
    doubled (the SQL escape for a quote inside a literal) so that values such
    as "O'Connor" cannot terminate the literal early when the result is
    spliced into a SQL statement.

    Args:
        string: The raw text to quote.

    Returns:
        The stripped, escaped text wrapped in single quotes.
    """
    return "'" + string.strip().replace("'", "''") + "'"

def get_date(string, year=2016):
    """Convert a "<month name> <day>" string into a quoted ISO date literal.

    Args:
        string: Text whose first whitespace-separated token is a key of
            ``month_map`` (an English month name) and whose second token is
            the day of the month. Any further tokens are ignored.
        year: Calendar year to attach to the month and day. Defaults to 2016,
            the value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        The ``YYYY-MM-DD`` date text passed through ``sqlize_string``.

    Raises:
        IndexError: If ``string`` has fewer than two tokens.
        KeyError: If the first token is not a key of ``month_map``.
        ValueError: If the day is not an integer or the date does not exist.
    """
    tokens = string.split()
    listing_day = datetime.date(year, month_map[tokens[0]], int(tokens[1]))
    return sqlize_string(listing_day.isoformat())

def _column_key(name):
    """Normalize a column label so "List Price" and "ListPrice" compare equal."""
    return str(name).replace(" ", "").lower()


def _next_row_label(dataframe):
    """Return the smallest integer >= len(dataframe) not already used as a row label."""
    label = len(dataframe)
    while label in dataframe.index:
        label += 1
    return label


def scraping(dataframe, url="http://www.slocountyhomes.com/newlistex.php", timeout=30):
    """Scrape the new-listings page and append one row per listing to ``dataframe``.

    The page is walked ``<tr>`` by ``<tr>``. A row with a single ``<td>`` is
    taken as a date heading for the listings that follow it; other rows
    matching the cell-count filter are read as
    MLS #, Street, City, List Price, Beds, Baths, Sq Footage.

    Args:
        dataframe: Frame to extend. It is modified in place. Its columns are
            expected to be the nine listing fields; when they carry the same
            names as the internal headers (ignoring spaces and case) values
            are matched by name, otherwise they are assigned positionally in
            alphabetical header order.
        url: Page to download. Defaults to the original hard-coded address.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The same ``dataframe`` object, with listings lacking a usable square
        footage removed and duplicate rows dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AssertionError: If a date heading row is empty.
    """
    # Fail fast on a hung connection or an error page instead of silently
    # parsing whatever came back.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    data = BeautifulSoup(page.text, "html.parser")
    hdrs = ["Bathrooms", "Bedrooms", "City", "Date", "List Price", "MLS", "Price/SqFt", "SqFt", "Street"]

    # Decide once how scraped values line up with the frame's columns, so the
    # result does not depend on the order in which the frame was constructed.
    hdr_keys = [_column_key(hdr) for hdr in hdrs]
    frame_keys = [_column_key(col) for col in dataframe.columns]
    if sorted(frame_keys) == sorted(hdr_keys):
        value_order = [hdrs[hdr_keys.index(key)] for key in frame_keys]
    else:
        value_order = hdrs

    listing_date = ""
    for row in data.find_all('tr'):
        cells = row.find_all("td", recursive=True)
        if len(cells) == 1:
            # Date heading that applies to the listing rows below it.
            listing_date = cells[0].text.strip()
            assert listing_date != ""

        elif 0 < len(cells) <= 8 and len(cells) != 3:
            # Cell order: MLS #, Street, City, List Price, Beds, Baths, Sq Footage
            record = dict.fromkeys(hdrs, 0)
            record["MLS"] = int(cells[0].text.strip())
            record["Street"] = sqlize_string(cells[1].text.strip())
            record["City"] = sqlize_string(cells[2].text.strip())
            # Drop the leading currency symbol and thousands separators.
            record["List Price"] = int(cells[3].text.strip()[1:].replace(",", ""))
            record["Bedrooms"] = int(cells[4].text.strip())
            record["Bathrooms"] = int(cells[5].text.strip())
            try:
                square_feet = int(cells[6].text.strip())
                record["Price/SqFt"] = record["List Price"] / square_feet
                record["SqFt"] = square_feet
            except (ValueError, ZeroDivisionError):
                # Missing, non-numeric or zero square footage: mark the row
                # with the -1 sentinel so it is removed below.
                record["SqFt"] = -1
            record["Date"] = get_date(listing_date)

            # Use a label that is guaranteed to be free: after earlier drops
            # the index has gaps, and len(dataframe) could name an existing
            # row and overwrite it.
            dataframe.loc[_next_row_label(dataframe)] = [record[name] for name in value_order]

    # Data integrity: discard rows flagged with the -1 sentinel, then exact duplicates.
    dataframe.drop(dataframe[dataframe.SqFt == -1].index, inplace=True)
    dataframe.drop_duplicates(inplace=True)
    return dataframe


In [33]:
# Run the scraper. scraping() fills `df` in place and returns that same frame,
# so `houses` and `df` refer to one object.
houses = scraping(df)

In [34]:
# Preview the first scraped listings.
houses.head()

Unnamed: 0,Bathrooms,Bedrooms,City,Date,ListPrice,MLS,Price/SqFt,SqFt,Street
0,2.0,2.0,'Cambria','2016-11-24',369000.0,1073144.0,241.334205,1529.0,'901 Sheffield'
1,2.0,3.0,'Lompoc','2016-11-24',295000.0,1073146.0,236.567763,1247.0,'3366 Via Arnez'
3,2.0,3.0,'Paso Robles','2016-11-24',380000.0,1073125.0,329.289428,1154.0,'901 Player'
4,2.0,3.0,'Santa Maria','2016-11-24',329500.0,1073147.0,273.217247,1206.0,'1343 Amarone'
5,1.0,2.0,'Atascadero','2016-11-23',75000.0,1073124.0,116.640747,643.0,'1242 Camino Del Robles'


In [35]:
from sqlite_api import SLOHouseDatabase

In [36]:
# Create the database helper imported from sqlite_api. Its connection
# handling lives in that module, not in this notebook.
slo_houses = SLOHouseDatabase()

In [37]:
# Store the scraped listings through the helper.
# NOTE(review): judging by the queries below the rows end up in HOUSES -- confirm in sqlite_api.
slo_houses.insert_dataframe(houses)

Connected to slo_housing.db
Records created successfully


In [38]:
# List every stored house, newest listing first (LIST_DATE is passed through SQLite's date()).
slo_houses.get_dataframe_from_query("SELECT * FROM HOUSES ORDER BY date(LIST_DATE) DESC")

Unnamed: 0,ID,MLS_ID,CITY,ADDRESS,BED,BATH,LIST_PRICE,SQ_FOOTAGE,PRICE_PER_SQFT,LIST_DATE
0,7769,1073144,Cambria,901 Sheffield,2,2,369000.0,1529.0,241.0,2016-11-24
1,7770,1073146,Lompoc,3366 Via Arnez,3,2,295000.0,1247.0,236.0,2016-11-24
2,7771,1073125,Paso Robles,901 Player,3,2,380000.0,1154.0,329.0,2016-11-24
3,7772,1073147,Santa Maria,1343 Amarone,3,2,329500.0,1206.0,273.0,2016-11-24
4,7773,1073124,Atascadero,1242 Camino Del Robles,2,1,75000.0,643.0,116.0,2016-11-23
5,7774,1073132,Atascadero,1228 Camino Del Robles,2,2,185000.0,1080.0,171.0,2016-11-23
6,7775,1073134,Atascadero,1207 Fallen Leaf,2,2,190000.0,1101.0,172.0,2016-11-23
7,7776,1073096,Atascadero,4100 Nogales,3,2,440000.0,1770.0,248.0,2016-11-23
8,7777,1073121,Lompoc,812 6th,3,2,329900.0,1272.0,259.0,2016-11-23
9,7778,1073145,Lompoc,912 Northpoint,4,2,351000.0,1600.0,219.0,2016-11-23


Nov 24 -- 1180 rows

Sample the database to check that the schema is correct.

### Add the dataset from Dekhtyar to the HOUSES table

In [25]:
# Load the master listings file, then keep only the nine listing columns
# (in this order) that the rest of the notebook works with.
master = pd.read_csv("data/HousingSLO-Master.csv")
master = master[["Bathrooms", "Bedrooms", "City", "Date", "List Price", "MLS", "Price/SqFt", "SqFt", "Street"]]

Standardize the date format in the `Date` column.

In [26]:
def parse_date(date):
    """Normalize one raw date string to a quoted ``YYYY-MM-DD`` literal.

    Strings made of several whitespace-separated words are handed to
    ``get_date``. Anything else is read as ``month/day`` numbers (any further
    ``/``-separated parts are ignored) and placed in the year 2016.
    """
    words = date.split()
    if len(words) <= 1:
        parts = date.split("/")
        month = int(parts[0])
        day = int(parts[1])
        return sqlize_string(datetime.date(2016, month, day).isoformat())
    return get_date(date)

In [27]:
# Convert dates to quoted ISO literals and quote the text columns for SQL.
# Run this cell only once per load of `master`: sqlize_string wraps its input
# unconditionally, so a second pass would quote the values again.
master["Date"] = master["Date"].apply(parse_date)
master["City"] = master["City"].apply(sqlize_string)
master["Street"] = master["Street"].apply(sqlize_string)

In [28]:
# Preview the normalized master listings.
master.head()

Unnamed: 0,Bathrooms,Bedrooms,City,Date,List Price,MLS,Price/SqFt,SqFt,Street
0,3,3,'Lompoc','2016-10-23',370000,1072327,223.970944,1652,'629 Northbrook'
1,2,3,'Lompoc','2016-10-23',389000,1072320,217.197097,1791,'1249 Westbrook'
2,2,3,'Lompoc','2016-10-23',518000,1072319,219.491525,2360,'247 Brisa Del Mar'
3,2,2,'Morro Bay','2016-10-23',550000,1072317,563.52459,976,'2760 Cedar'
4,3,3,'Nipomo','2016-10-23',699900,1072288,301.032258,2325,'1007 Jacqueline'


In [29]:
# Store the master listings through the same helper used for the scraped data.
# NOTE(review): needs `slo_houses` from the cell that constructs SLOHouseDatabase -- run that cell first.
slo_houses.insert_dataframe(master)

Connected to slo_housing.db
Records created successfully


In [41]:
# Join stored houses to MLS_LISTINGS on MLS_ID (only houses with a matching
# listing row are returned), newest listing first.
results = slo_houses.get_dataframe_from_query("SELECT * FROM HOUSES JOIN MLS_LISTINGS USING (MLS_ID) ORDER BY date(LIST_DATE) DESC")

In [42]:
# Display the joined result.
results

Unnamed: 0,ID,MLS_ID,CITY,ADDRESS,BED,BATH,LIST_PRICE,SQ_FOOTAGE,PRICE_PER_SQFT,LIST_DATE,ID.1,SUBTYPE,AREA,YR_BUILT,LOT_SQFT,VIEW,POOL,ARB_COMISSION
0,7794,1073089,Arroyo Grande,2050 Oak,3,2,1149000.0,2291.0,501.0,2016-11-22,47,SFR/D,ARRG,1987,304920,1,0,2.50
1,7835,1073032,San Luis Obispo,5640 Pinehurst,4,3,945000.0,2600.0,363.0,2016-11-20,215,SFR/D,SLO,1989,11566,1,0,2.50
2,7838,1073062,Morro Bay,3281 Tide,2,2,430000.0,891.0,482.0,2016-11-19,118,SFR/D,MRBY,1972,2400,1,0,2.00
3,7845,1073020,Arroyo Grande,661 Rosemary,4,3,1245000.0,3300.0,377.0,2016-11-18,54,SFR/D,ARRG,0,26088,1,0,2.00
4,7857,1073049,San Luis Obispo,67 South,2,2,569000.0,1102.0,516.0,2016-11-18,180,SFR/D,SLO,1900,3202,0,0,2.50
5,7858,1073037,San Luis Obispo,170 Ramona,3,2,799000.0,1776.0,449.0,2016-11-18,199,SFR/D,SLO,1953,6299,1,0,2.50
6,7859,1073013,San Luis Obispo,219 Mission,4,3,1349000.0,3100.0,435.0,2016-11-18,226,SFR/D,SLO,1985,12001,1,0,2.00
7,7866,1072973,Templeton,1050 Semillon,4,3,835000.0,2825.0,295.0,2016-11-18,239,SFR/D,TTON,2004,13220,1,0,2.25
8,7844,1073030,Arroyo Grande,1093 Grieb,2,3,369500.0,1312.0,281.0,2016-11-18,480,CONDO/D,ARRG,1979,1498,1,0,2.50
9,5503,1073005,Arroyo Grande,1351 Newport,4,3,849000.0,3076.0,276.0,2016-11-17,30,SFR/D,ARRG,1960,13199,1,0,2.00


Nov 24 -- 229 rows