In [141]:
from datetime import datetime
from homeharvest import scrape_property
import numpy as np
import pandas as pd
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# dict vectorizer
from sklearn.feature_extraction import DictVectorizer

ACTIVE_MODEL = RandomForestRegressor


TARGET_LOCATION = 'Rockingham County, NH'
DATA_FOLDER = './data'

MIN_PRICE = 700000

print('ready')

ready


In [142]:
def convert_int(x):
    try:
        return int(x)
    except:
        return float('nan')


In [143]:
class RedfinModel:

    TARGET_COLUMN = 'sold_price'
    COLUMNS_TO_ONE_HOT_ENCODE = ['zip_code', 'state', 'style', 'city']
    COLUMNS_TO_REMOVE = ['last_sold_date', 'list_price', 'latitude', 'longitude', 'days_on_mls', TARGET_COLUMN]
    OUTPUT_COLUMNS = ['readable_address', 'list_price', 'predicted', 'diff', 'diff_percent', 'property_url']

    def __init__(self, location=TARGET_LOCATION):
        self.model = None
        self.model_type = ACTIVE_MODEL
        self.data_folder = DATA_FOLDER
        self.location = location


    def fetch_data(self, listing_type="sold"):
        # Generate filename based on current timestamp
        # current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        today = datetime.today().strftime('%Y-%m-%d')
        location = self.location
        filename = f"{self.data_folder}/{today}|{location}|{listing_type}.csv"

        # check if data already exists in data folder
        if os.path.exists(filename):
            print(f"Using cached data for {location} {listing_type} {today}")
            # return data
            return pd.read_csv(filename)

        past_days = 365 if listing_type == 'sold' else 90

        properties = scrape_property(
          location=location,
          listing_type=listing_type,  # or (for_sale, for_rent, pending)
          past_days=past_days,  # ex: sold in last 30 days - listed in last 30 days if (for_sale, for_rent)

          # date_from="2023-05-01", # alternative to past_days
          # date_to="2023-05-28",

          # mls_only=True,  # only fetch MLS listings
          # proxy="http://user:pass@host:port"  # use a proxy to change your IP address
        )
        print(f"Fetched properties ({len(properties)}): {location} {listing_type}")
        # Export to csv
        properties.to_csv(filename, index=False)
        return properties

    def encode_onehot(self, df, cols):
        """
        One-hot encoding is applied to columns specified in a pandas DataFrame.

        Modified from: https://gist.github.com/kljensen/5452382

        Details:

        http://en.wikipedia.org/wiki/One-hot
        http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

        @param df pandas DataFrame
        @param cols a list of columns to encode
        @return a DataFrame with one-hot encoding
        """
        vec = DictVectorizer()

        vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
        vec_data.columns = vec.get_feature_names_out()
        vec_data.index = df.index

        df = df.drop(cols, axis=1)
        df = df.join(vec_data)

        return df

    def process_data(self, data):
        numeric_cols = data.select_dtypes(include=np.number).columns.values
        columns_to_use = np.concatenate((numeric_cols, RedfinModel.COLUMNS_TO_ONE_HOT_ENCODE))
        columns_to_use = np.setdiff1d(columns_to_use, RedfinModel.COLUMNS_TO_REMOVE)
        print('Using columns:', columns_to_use)
        data = data[columns_to_use]
        data = self.encode_onehot(data, RedfinModel.COLUMNS_TO_ONE_HOT_ENCODE)

        # Fill missing values or NaN
        data = data.fillna(0)

                # Remove out of range values
        if 'sold_price' in data.columns.values:
            data = data[(data['sold_price'] > MIN_PRICE) & (data['sold_price'] < 5000000)]
        if 'list_price' in data.columns.values:
            data = data[(data['list_price'] > MIN_PRICE) & (data['list_price'] < 5000000)]

        print(f"Processed data shape: {data.shape}")
        print(f"Processed data columns: {data.columns.values}")
        return data

    def train_from_raw(self, X, y):
        train = self.process_data(X)
        self.model = self.model_type()
        self.trained_columns = train.columns.values
        self.model.fit(train, y)
        return self.model

    def predict(self, X):
        if not self.model:
            raise Exception("Model not trained")
        test = self.process_data(X)
        # Drop any columns that are not in the training data
        dropped_columns = np.setdiff1d(test.columns.values, self.trained_columns)
        print(f"Dropping columns: {dropped_columns}")
        test = test.drop(dropped_columns, axis=1)
        # Add columns that are in the training data but not in the test data
        missing_columns = np.setdiff1d(self.trained_columns, test.columns.values)
        print(f"Adding columns: {missing_columns}")
        for column in missing_columns:
            test[column] = 0

        # Reorder columns to match training data
        test = test[self.trained_columns]

        pred = self.model.predict(test)
        return pred

    def print_feature_importances(self):
        if not self.model:
            raise Exception("Model not trained")
        try:
            importances = self.model.feature_importances_
        except Exception as e:
            importances = self.model.coef_
        # Zip with columns and order by importance
        importances = list(zip(self.trained_columns, importances))
        importances.sort(key=lambda x: x[1], reverse=True)
        return importances

In [144]:
redfin = RedfinModel(TARGET_LOCATION)
train_df = redfin.fetch_data('sold')
#
train_df.head()

Fetched properties (681): Rockingham County, NH sold


Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,...,last_sold_date,lot_sqft,price_per_sqft,latitude,longitude,stories,hoa_fee,parking_garage,primary_photo,alt_photos
0,https://www.realtor.com/realestateandhomes-det...,BSMA,73178194,SOLD,SINGLE_FAMILY,8 Spinney Ave,,Plaistow,NH,3865,...,2023-12-21,16553,335,42.82,-71.1,,0,3.0,http://ap.rdcpix.com/1cd23e0dcee573b2e887ae69b...,http://ap.rdcpix.com/1cd23e0dcee573b2e887ae69b...
1,https://www.realtor.com/realestateandhomes-det...,BSMA,73128868,SOLD,SINGLE_FAMILY,9 Highland St,,Newton,NH,3858,...,2023-12-21,73181,333,42.87,-71.04,,0,,http://ap.rdcpix.com/f5e2620a68650e5457c0c2548...,http://ap.rdcpix.com/f5e2620a68650e5457c0c2548...
2,https://www.realtor.com/realestateandhomes-det...,BSMA,73184214,SOLD,SINGLE_FAMILY,28 Cole St,,Salem,NH,3079,...,2023-12-21,13939,285,42.77,-71.19,2.0,0,1.0,http://ap.rdcpix.com/7f2744953580f1d0b5eba3664...,http://ap.rdcpix.com/7f2744953580f1d0b5eba3664...
3,https://www.realtor.com/realestateandhomes-det...,BSMA,73180483,SOLD,SINGLE_FAMILY,4 Hemlock Ln,,Salem,NH,3079,...,2023-12-15,26572,234,42.76,-71.21,,0,2.0,http://ap.rdcpix.com/4ae8fefaf952e39cd548e7f49...,http://ap.rdcpix.com/4ae8fefaf952e39cd548e7f49...
4,https://www.realtor.com/realestateandhomes-det...,BSMA,73095849,SOLD,SINGLE_FAMILY,14 Cole Rd,,Windham,NH,3087,...,2023-12-15,5227,436,42.82,-71.24,,0,,http://ap.rdcpix.com/d42021c3d5606174b30fda63d...,http://ap.rdcpix.com/d42021c3d5606174b30fda63d...


In [145]:
train_df.columns.values
# one hot encode columns

array(['property_url', 'mls', 'mls_id', 'status', 'style', 'street',
       'unit', 'city', 'state', 'zip_code', 'beds', 'full_baths',
       'half_baths', 'sqft', 'year_built', 'days_on_mls', 'list_price',
       'list_date', 'sold_price', 'last_sold_date', 'lot_sqft',
       'price_per_sqft', 'latitude', 'longitude', 'stories', 'hoa_fee',
       'parking_garage', 'primary_photo', 'alt_photos'], dtype=object)

In [146]:

redfin.train_from_raw(train_df, train_df[RedfinModel.TARGET_COLUMN])


Using columns: ['city' 'state' 'style' 'zip_code']
Processed data shape: (681, 80)
Processed data columns: ['city=Atkinson' 'city=Auburn' 'city=Brentwood' 'city=Candia'
 'city=Chester' 'city=Danville' 'city=Deerfield' 'city=Derry'
 'city=East Hampstead' 'city=East Kingston' 'city=Epping' 'city=Exeter'
 'city=Fremont' 'city=Greenland' 'city=Hampstead' 'city=Hampton'
 'city=Hampton Falls' 'city=Kensington' 'city=Kingston' 'city=Londonderry'
 'city=New Castle' 'city=Newfields' 'city=Newmarket' 'city=Newton'
 'city=North Hampton' 'city=Northwood' 'city=Nottingham' 'city=Plaistow'
 'city=Portsmouth' 'city=Raymond' 'city=Rye' 'city=Salem' 'city=Sandown'
 'city=Seabrook' 'city=South Hampton' 'city=Stratham' 'city=Windham'
 'state=NH' 'style=CONDO' 'style=CONDOS' 'style=FARM' 'style=LAND'
 'style=MOBILE' 'style=MULTI_FAMILY' 'style=OTHER' 'style=SINGLE_FAMILY'
 'zip_code=03032' 'zip_code=03034' 'zip_code=03036' 'zip_code=03037'
 'zip_code=03038' 'zip_code=03042' 'zip_code=03044' 'zip_code=0305

In [147]:
test_df = redfin.fetch_data('for_sale')

results = redfin.predict(test_df)
print(results)

Fetched properties (328): Rockingham County, NH for_sale
Using columns: ['city' 'state' 'style' 'zip_code']
Processed data shape: (328, 75)
Processed data columns: ['city=Atkinson' 'city=Auburn' 'city=Brentwood' 'city=Candia'
 'city=Chester' 'city=Danville' 'city=Deerfield' 'city=Derry'
 'city=East Kingston' 'city=Epping' 'city=Exeter' 'city=Fremont'
 'city=Greenland' 'city=Hampstead' 'city=Hampton' 'city=Hampton Falls'
 'city=Kensington' 'city=Kingston' 'city=Londonderry' 'city=Newington'
 'city=Newmarket' 'city=Newton' 'city=North Hampton' 'city=Northwood'
 'city=Nottingham' 'city=Plaistow' 'city=Portsmouth' 'city=Raymond'
 'city=Rye' 'city=Salem' 'city=Sandown' 'city=Seabrook' 'city=Stratham'
 'city=Windham' 'state=NH' 'style=CONDOS' 'style=FARM' 'style=LAND'
 'style=MOBILE' 'style=MULTI_FAMILY' 'style=SINGLE_FAMILY'
 'style=TOWNHOMES' 'zip_code=03032' 'zip_code=03034' 'zip_code=03036'
 'zip_code=03037' 'zip_code=03038' 'zip_code=03040' 'zip_code=03042'
 'zip_code=03044' 'zip_code=0

In [148]:
# Find rows with biggest mismatch between listing price and predicted predicted
test_df['predicted'] = results
test_df['diff'] = test_df['predicted'] - test_df['list_price']
test_df['diff_percent'] = test_df['diff'] / test_df['list_price'] * 100
test_df['readable_address'] = test_df['street']# + ', ' + test_df['city'] + ', ' + test_df['state'] + ' ' + str(test_df['zip_code'])
test_df.sort_values(by=['diff_percent'], ascending=False).head(10)


Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,...,longitude,stories,hoa_fee,parking_garage,primary_photo,alt_photos,predicted,diff,diff_percent,readable_address
89,https://www.realtor.com/realestateandhomes-det...,NHVT,4979928,FOR_SALE,CONDOS,99 Cluff Crossing Rd,Apt F10,Salem,NH,3079,...,-71.22,1.0,,,http://ap.rdcpix.com/2f8e5b071cebc0e6c0475c427...,http://ap.rdcpix.com/2f8e5b071cebc0e6c0475c427...,555249.15,340249.15,158.26,99 Cluff Crossing Rd
178,https://www.realtor.com/realestateandhomes-det...,NHVT,4972749,FOR_SALE,LAND,20 Quimby Dr,Lot 20,Northwood,NH,3261,...,,,0.0,,http://ap.rdcpix.com/13fc7ad628b9624091d707d59...,http://ap.rdcpix.com/13fc7ad628b9624091d707d59...,263702.08,158702.08,151.14,20 Quimby Dr
176,https://www.realtor.com/realestateandhomes-det...,NHVT,4972750,FOR_SALE,LAND,21 Quimby Dr,Lot 21,Northwood,NH,3261,...,,,0.0,,http://ap.rdcpix.com/5c00308a0eb4d781301cb87ac...,http://ap.rdcpix.com/5c00308a0eb4d781301cb87ac...,263702.08,158702.08,151.14,21 Quimby Dr
36,https://www.realtor.com/realestateandhomes-det...,NHVT,4980737,FOR_SALE,SINGLE_FAMILY,24 Millville Cir,,Salem,NH,3079,...,-71.21,1.0,,,http://ap.rdcpix.com/3a46a080598e83fb3a6c2772d...,http://ap.rdcpix.com/3a46a080598e83fb3a6c2772d...,618768.58,368768.58,147.51,24 Millville Cir
21,https://www.realtor.com/realestateandhomes-det...,NHVT,4980894,FOR_SALE,CONDOS,2 Brook Rd,Apt 10,Salem,NH,3079,...,-71.22,3.0,,,http://ap.rdcpix.com/59ee302f1bfed50820e10675b...,http://ap.rdcpix.com/59ee302f1bfed50820e10675b...,555249.15,307249.15,123.89,2 Brook Rd
22,https://www.realtor.com/realestateandhomes-det...,BSMA,73189149,FOR_SALE,CONDOS,2 Brook Rd,Apt 10,Salem,NH,3079,...,-71.22,1.0,347.0,,http://ap.rdcpix.com/4f0e74fd58d34c7372a7262d5...,http://ap.rdcpix.com/4f0e74fd58d34c7372a7262d5...,555249.15,307249.15,123.89,2 Brook Rd
204,https://www.realtor.com/realestateandhomes-det...,NHVT,4978142,FOR_SALE,LAND,M216 L76 First Nh Tpke,,Northwood,NH,3261,...,-71.23,,0.0,,http://ap.rdcpix.com/25f141d25c366d15878a0e51b...,http://ap.rdcpix.com/25f141d25c366d15878a0e51b...,263702.08,133702.08,102.85,M216 L76 First Nh Tpke
304,https://www.realtor.com/realestateandhomes-det...,NHVT,4974840,FOR_SALE,CONDOS,26 Chandler Ave,Apt 11,Plaistow,NH,3865,...,-71.1,1.0,,,http://ap.rdcpix.com/51a655f22467e93ae691a675e...,http://ap.rdcpix.com/51a655f22467e93ae691a675e...,425523.54,215623.54,102.73,26 Chandler Ave
20,https://www.realtor.com/realestateandhomes-det...,NHVT,4980907,FOR_SALE,MOBILE,290 Calef Hwy,Unit B10,Epping,NH,3042,...,-71.06,1.0,,,http://ap.rdcpix.com/16a8a04ce5a20bec60e1f8b9b...,http://ap.rdcpix.com/16a8a04ce5a20bec60e1f8b9b...,194379.87,94479.87,94.57,290 Calef Hwy
215,https://www.realtor.com/realestateandhomes-det...,NHVT,4978007,FOR_SALE,LAND,Tax Map 419 Middle Rd,Lot 69-3,Deerfield,NH,3037,...,-71.28,,0.0,,http://ap.rdcpix.com/5d7473a6aeee46639009078df...,http://ap.rdcpix.com/5d7473a6aeee46639009078df...,261756.63,126756.63,93.89,Tax Map 419 Middle Rd


In [149]:
print(redfin.trained_columns)
# remove exponent formatting
pd.set_option('display.float_format', lambda x: '%.2f' % x)
def make_clickable(val):
    # target _blank to open new window
    return '<a target="_blank" href="{}">{}</a>'.format(val, val)

test_df.style.format({'property_url': make_clickable})
print('===\nPredictions\n===')
test_df[RedfinModel.OUTPUT_COLUMNS].sort_values(by=['diff_percent'], ascending=False).head(50)


['city=Atkinson' 'city=Auburn' 'city=Brentwood' 'city=Candia'
 'city=Chester' 'city=Danville' 'city=Deerfield' 'city=Derry'
 'city=East Hampstead' 'city=East Kingston' 'city=Epping' 'city=Exeter'
 'city=Fremont' 'city=Greenland' 'city=Hampstead' 'city=Hampton'
 'city=Hampton Falls' 'city=Kensington' 'city=Kingston' 'city=Londonderry'
 'city=New Castle' 'city=Newfields' 'city=Newmarket' 'city=Newton'
 'city=North Hampton' 'city=Northwood' 'city=Nottingham' 'city=Plaistow'
 'city=Portsmouth' 'city=Raymond' 'city=Rye' 'city=Salem' 'city=Sandown'
 'city=Seabrook' 'city=South Hampton' 'city=Stratham' 'city=Windham'
 'state=NH' 'style=CONDO' 'style=CONDOS' 'style=FARM' 'style=LAND'
 'style=MOBILE' 'style=MULTI_FAMILY' 'style=OTHER' 'style=SINGLE_FAMILY'
 'zip_code=03032' 'zip_code=03034' 'zip_code=03036' 'zip_code=03037'
 'zip_code=03038' 'zip_code=03042' 'zip_code=03044' 'zip_code=03053'
 'zip_code=03077' 'zip_code=03079' 'zip_code=03087' 'zip_code=03261'
 'zip_code=03290' 'zip_code=03801' 

Unnamed: 0,readable_address,list_price,predicted,diff,diff_percent,property_url
89,99 Cluff Crossing Rd,215000,555249.15,340249.15,158.26,https://www.realtor.com/realestateandhomes-det...
178,20 Quimby Dr,105000,263702.08,158702.08,151.14,https://www.realtor.com/realestateandhomes-det...
176,21 Quimby Dr,105000,263702.08,158702.08,151.14,https://www.realtor.com/realestateandhomes-det...
36,24 Millville Cir,250000,618768.58,368768.58,147.51,https://www.realtor.com/realestateandhomes-det...
21,2 Brook Rd,248000,555249.15,307249.15,123.89,https://www.realtor.com/realestateandhomes-det...
22,2 Brook Rd,248000,555249.15,307249.15,123.89,https://www.realtor.com/realestateandhomes-det...
204,M216 L76 First Nh Tpke,130000,263702.08,133702.08,102.85,https://www.realtor.com/realestateandhomes-det...
304,26 Chandler Ave,209900,425523.54,215623.54,102.73,https://www.realtor.com/realestateandhomes-det...
20,290 Calef Hwy,99900,194379.87,94479.87,94.57,https://www.realtor.com/realestateandhomes-det...
215,Tax Map 419 Middle Rd,135000,261756.63,126756.63,93.89,https://www.realtor.com/realestateandhomes-det...


In [150]:
redfin.print_feature_importances()
# test_df.head()

[('zip_code=03854', 0.11024988400693318),
 ('zip_code=03842', 0.09931218603588243),
 ('style=SINGLE_FAMILY', 0.09635838463452849),
 ('style=OTHER', 0.09569451577430566),
 ('city=New Castle', 0.08339504988083989),
 ('zip_code=03856', 0.08077953499807174),
 ('city=Hampton', 0.07825239954610454),
 ('city=Newfields', 0.07749201246011624),
 ('style=MULTI_FAMILY', 0.023517116232891946),
 ('style=MOBILE', 0.02060652547645161),
 ('city=Windham', 0.013726915811306587),
 ('zip_code=03801', 0.0134310884483091),
 ('city=Portsmouth', 0.01259195054066827),
 ('zip_code=03087', 0.012014703969920446),
 ('style=CONDO', 0.0119348869818954),
 ('city=Sandown', 0.010389002042727222),
 ('zip_code=03870', 0.009830091590555756),
 ('city=Rye', 0.009419755936944664),
 ('zip_code=03873', 0.009338367077135918),
 ('style=CONDOS', 0.008523781114099646),
 ('city=Londonderry', 0.006873876367775393),
 ('city=Derry', 0.005834117813814173),
 ('zip_code=03038', 0.005760582559539881),
 ('city=Fremont', 0.005468284616754923