In [454]:
import html
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
from homeharvest import scrape_property

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# https://www.educative.io/blog/scikit-learn-cheat-sheet-classification-regression-methods
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor


# dict vectorizer
from sklearn.feature_extraction import DictVectorizer

# Estimator class that RedfinModel instantiates when it trains.
ACTIVE_MODEL = LinearRegression

# Area whose listings are fetched; uncomment one of the alternatives to switch.
TARGET_LOCATION = 'Plymouth County, MA'
# TARGET_LOCATION = 'Rockingham County, NH'
# TARGET_LOCATION = 'Suffolk County, MA'
# TARGET_LOCATION = 'Essex County, MA'
# TARGET_LOCATION = 'Pierce County, WA'
# TARGET_LOCATION = 'Middlesex County, MA'

# Folder holding the per-day CSV caches written by RedfinModel.fetch_data.
DATA_FOLDER = './data'

# Look-back window (in days) used when fetching sold listings.
DAYS_OF_SOLD_HISTORY = 210

# Listings are kept only when MIN_PRICE < list_price < MAX_PRICE.
MIN_PRICE = 800_000
MAX_PRICE = 2_000_000

print("ready")

ready


In [455]:
class RedfinModel:
    """Fetch property listings, turn them into features and fit a price regressor.

    Listings are obtained through ``scrape_property`` (cached as one CSV per
    day / location / listing type), filtered by list price and optional
    per-column whitelists, reduced to numeric plus one-hot encoded columns,
    and fed to an estimator of type ``ACTIVE_MODEL``.
    """

    # Column the model is trained to predict.
    TARGET_COLUMN = 'sold_price'
    # Categorical columns expanded into "<column>=<value>" indicator columns.
    COLUMNS_TO_ONE_HOT_ENCODE = ['state', 'style', 'city']
    # Columns never used as features (identifiers, price-derived values, and the target itself).
    COLUMNS_TO_REMOVE = ['zip_code', 'last_sold_date', 'mls_id', 'list_price', 'latitude', 'longitude', 'days_on_mls', 'price_per_sqft', TARGET_COLUMN]
    # Columns shown in the final predictions table.
    OUTPUT_COLUMNS = ['readable_address', 'style', 'beds', 'list_price', 'predicted', 'diff', 'diff_percent', 'top_contributing_columns']# 'property_url']
    # Look-back window (days) for every listing type other than 'sold'.
    DAYS_OF_LISTING_HISTORY = 90

    def __init__(self, location=TARGET_LOCATION, column_filters=None):
        """
        @param location area string passed to scrape_property (defaults to TARGET_LOCATION)
        @param column_filters optional dict mapping a column name to the list of
               values allowed in that column; rows with other values are dropped
        """
        self.model = None
        self.model_type = ACTIVE_MODEL
        self.data_folder = DATA_FOLDER
        self.location = location
        # A None default avoids sharing one mutable dict between all instances.
        self.column_filters = column_filters if column_filters is not None else {}
        # Feature column names seen during training; set by train_from_raw.
        self.trained_columns = None

    def fetch_data(self, listing_type="sold"):
        """
        Return the filtered listings for ``self.location``.

        A CSV named after today's date, the location and the listing type is
        used as a cache: if it exists it is read, otherwise the listings are
        scraped and written to it.

        @param listing_type 'sold', or another type accepted by scrape_property
               (for_sale, for_rent, pending)
        @return a filtered pandas DataFrame (see _filter_data)
        """
        today = datetime.today().strftime('%Y-%m-%d')
        location = self.location
        # NOTE(review): '|' is not a valid filename character on Windows; it is kept
        # here because renaming would orphan existing cache files.
        filename = f"{self.data_folder}/{today}|{location}|{listing_type}.csv"

        if os.path.exists(filename):
            print(f"Using cached data for {location} {listing_type} as of {today}")
            return self._filter_data(pd.read_csv(filename))

        if listing_type == 'sold':
            past_days = DAYS_OF_SOLD_HISTORY
        else:
            past_days = RedfinModel.DAYS_OF_LISTING_HISTORY

        # Other scrape_property options, unused here: date_from / date_to
        # (alternative to past_days), mls_only, proxy.
        properties = scrape_property(
            location=location,
            listing_type=listing_type,
            past_days=past_days,  # sold in / listed in the last N days
        )
        print(f"Fetched properties ({len(properties)}): {location} {listing_type}")

        # Make sure the cache folder exists before the first write.
        os.makedirs(self.data_folder, exist_ok=True)
        properties.to_csv(filename, index=False)
        return self._filter_data(properties)

    def encode_onehot(self, df, cols):
        """
        One-hot encoding is applied to columns specified in a pandas DataFrame.

        Modified from: https://gist.github.com/kljensen/5452382

        Details:

        http://en.wikipedia.org/wiki/One-hot
        http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

        @param df pandas DataFrame
        @param cols a list of columns to encode
        @return a DataFrame with one-hot encoding
        """
        cols = list(cols)
        if not cols:
            # Nothing to encode; skip fitting a vectorizer on empty records.
            return df

        vec = DictVectorizer()
        records = df[cols].to_dict(orient='records')
        vec_data = pd.DataFrame(vec.fit_transform(records).toarray())
        vec_data.columns = vec.get_feature_names_out()
        # Align on the caller's index so the join below matches rows one-to-one.
        vec_data.index = df.index

        return df.drop(cols, axis=1).join(vec_data)

    def _filter_data(self, data):
        """
        Keep rows whose list_price lies strictly between MIN_PRICE and MAX_PRICE
        and whose values pass every applicable entry of ``self.column_filters``.

        sold_price is not range-filtered. Filters naming a column that is not
        present are ignored.

        @param data pandas DataFrame of listings
        @return an independent (copied) filtered DataFrame
        """
        original_shape = data.shape

        if 'list_price' in data.columns:
            in_price_range = (data['list_price'] > MIN_PRICE) & (data['list_price'] < MAX_PRICE)
            data = data[in_price_range]

        for column, allowed_values in self.column_filters.items():
            if column in data.columns:
                print('filtering column:', column, 'allowed_values:', allowed_values)
                data = data[data[column].isin(allowed_values)]

        print(f"Filtered data shape: {data.shape} (from {original_shape})")
        # Return a copy so callers can add columns without pandas'
        # SettingWithCopyWarning and without touching the unfiltered frame.
        return data.copy()

    def process_data(self, data, show_debug=False):
        """
        Build the feature matrix: numeric columns plus one-hot encoded
        categorical columns, minus COLUMNS_TO_REMOVE, with NaN replaced by 0.

        @param data pandas DataFrame of listings
        @param show_debug print the resulting columns and shape when True
        @return a DataFrame of features
        """
        numeric_cols = data.select_dtypes(include=np.number).columns.values
        # Encode only the categorical columns actually present, so a frame
        # lacking one of them does not raise KeyError.
        encode_cols = [col for col in RedfinModel.COLUMNS_TO_ONE_HOT_ENCODE if col in data.columns]
        columns_to_use = np.concatenate((numeric_cols, encode_cols))
        # setdiff1d also sorts and de-duplicates the column names.
        columns_to_use = np.setdiff1d(columns_to_use, RedfinModel.COLUMNS_TO_REMOVE)
        data = data[columns_to_use]
        data = self.encode_onehot(data, encode_cols)

        # Fill missing values or NaN
        data = data.fillna(0)
        if show_debug:
            print('Using columns:', data.columns.values)
            print(f"Processed data shape: {data.shape}")
            print(f"Processed data columns: {len(data.columns.values)}")
        return data

    def show_cross_validation(self, X, y):
        """
        Print and return the mean 5-fold cross-validation score.

        @param X raw listings DataFrame (processed here via process_data)
        @param y target values aligned with X
        @return mean of the five fold scores
        """
        train = self.process_data(X)
        # Fall back to a fresh estimator so this also works before training.
        estimator = self.model if self.model is not None else self.model_type()
        scores = cross_val_score(estimator, train, y, cv=5)
        print("Cross-validation scores: {}".format(scores))
        mean_score = scores.mean()
        print("Average cross-validation score: {:.2f}".format(mean_score))
        return mean_score

    def train_from_raw(self, X, y):
        """
        Fit a new estimator of ``self.model_type`` on the processed features.

        @param X raw listings DataFrame
        @param y target values aligned with X
        @return the fitted estimator (also stored on ``self.model``)
        """
        train = self.process_data(X)
        self.model = self.model_type()
        # Remembered so predict can align new data to the same columns and order.
        self.trained_columns = train.columns.values
        self.model.fit(train, y)
        return self.model

    def predict(self, X):
        """
        Predict the target for raw listings, aligning features to the training columns.

        Feature columns unseen in training are dropped; training columns absent
        from X are added as zeros.

        @param X raw listings DataFrame
        @return the estimator's predictions, one per row of X
        @raise RuntimeError if the model has not been trained
        """
        # `is None` rather than truthiness: some estimators define __len__.
        if self.model is None:
            raise RuntimeError("Model not trained")
        test = self.process_data(X)

        dropped_columns = np.setdiff1d(test.columns.values, self.trained_columns)
        print(f"Dropping columns: {dropped_columns}")
        missing_columns = np.setdiff1d(self.trained_columns, test.columns.values)
        print(f"Adding columns: {missing_columns}")

        # One reindex drops extra columns, adds missing ones as 0 and restores
        # the training column order.
        test = test.reindex(columns=self.trained_columns, fill_value=0)

        pred = self.model.predict(test)
        print(f"Predicted {len(pred)} values")
        return pred

    def print_feature_importances(self):
        """
        Return (column, weight) pairs sorted by weight, largest first.

        Despite the name nothing is printed. ``feature_importances_`` is used
        when the estimator exposes it, otherwise ``coef_``; the sort is on the
        signed value, so negative coefficients end up last.

        @return list of (column name, weight) tuples
        @raise RuntimeError if the model has not been trained
        """
        if self.model is None:
            raise RuntimeError("Model not trained")
        try:
            importances = self.model.feature_importances_
        except AttributeError:
            # Linear models expose coefficients instead of importances.
            importances = self.model.coef_
        return sorted(zip(self.trained_columns, importances), key=lambda pair: pair[1], reverse=True)

In [456]:
# Build the helper for the target location, keeping only 3-, 4- and 5-bed listings,
# then load the sold listings used for training.
redfin = RedfinModel(
    location=TARGET_LOCATION,
    column_filters={
        # 'style': ['SINGLE_FAMILY', 'TOWNHOUSE'],
        'beds': [3, 4, 5],
    },
)
train_df = redfin.fetch_data(listing_type='sold')
train_df.head()

Using cached data for Plymouth County, MA sold as of 2024-01-04
filtering column: beds allowed_values: [3, 4, 5]
Filtered data shape: (547, 29) (from (3319, 29))


Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,...,last_sold_date,lot_sqft,price_per_sqft,latitude,longitude,stories,hoa_fee,parking_garage,primary_photo,alt_photos
0,https://www.realtor.com/realestateandhomes-det...,BSMA,73165771.0,SOLD,SINGLE_FAMILY,40 Christina Dr,,Bridgewater,MA,2324,...,2024-01-04,57935.0,284.0,41.95,-70.98,2.0,0.0,6.0,http://ap.rdcpix.com/6c3cb76e5d4d8a13eaa12ca48...,http://ap.rdcpix.com/6c3cb76e5d4d8a13eaa12ca48...
2,https://www.realtor.com/realestateandhomes-det...,BSMA,73163824.0,SOLD,SINGLE_FAMILY,111 Mann Hill Rd,,Scituate,MA,2066,...,2024-01-04,34412.0,560.0,42.22,-70.75,,0.0,2.0,http://ap.rdcpix.com/9f1cc472986b3bec72b4c2b3e...,http://ap.rdcpix.com/9f1cc472986b3bec72b4c2b3e...
3,https://www.realtor.com/realestateandhomes-det...,BSMA,73179584.0,SOLD,CONDOS,8 Backriver Rd,Unit 8,Hingham,MA,2043,...,2024-01-04,,476.0,42.24,-70.92,3.0,966.0,2.0,http://ap.rdcpix.com/e6e1967472a82681ef0635843...,http://ap.rdcpix.com/e6e1967472a82681ef0635843...
8,https://www.realtor.com/realestateandhomes-det...,BSMA,73183869.0,SOLD,SINGLE_FAMILY,4 Deerpath Trl,,Duxbury,MA,2332,...,2024-01-03,54014.0,448.0,42.09,-70.73,,0.0,2.0,http://ap.rdcpix.com/667ed80437877ee0e362ed741...,http://ap.rdcpix.com/667ed80437877ee0e362ed741...
12,https://www.realtor.com/realestateandhomes-det...,BSMA,73169688.0,SOLD,SINGLE_FAMILY,25 Olde Sheepfield Rd,,Marion,MA,2738,...,2024-01-03,30492.0,314.0,41.69,-70.77,,42.0,3.0,http://ap.rdcpix.com/a2c3110e59a70b1d72ba768ed...,http://ap.rdcpix.com/a2c3110e59a70b1d72ba768ed...


In [457]:
# List every raw column of the training frame (bare expression, so Jupyter displays it).
train_df.columns.values
# NOTE: one-hot encoding of the categorical columns happens later, inside RedfinModel.process_data.

array(['property_url', 'mls', 'mls_id', 'status', 'style', 'street',
       'unit', 'city', 'state', 'zip_code', 'beds', 'full_baths',
       'half_baths', 'sqft', 'year_built', 'days_on_mls', 'list_price',
       'list_date', 'sold_price', 'last_sold_date', 'lot_sqft',
       'price_per_sqft', 'latitude', 'longitude', 'stories', 'hoa_fee',
       'parking_garage', 'primary_photo', 'alt_photos'], dtype=object)

In [458]:
# Fit the model on the sold listings. The whole frame is passed as X because
# process_data removes the target column (it is listed in COLUMNS_TO_REMOVE).
redfin.train_from_raw(train_df, train_df[RedfinModel.TARGET_COLUMN])


In [459]:
# Cross-validation of the model on the sold listings.
#
# Disabled experiment, kept for reference: retrain at most 3 times until
# mean_score > .7, otherwise exit.
# mean_score = 0
# times = 0
# min_score = .7
# while mean_score < min_score and times < 3:
#     mean_score = redfin.show_cross_validation(train_df.drop(RedfinModel.TARGET_COLUMN, axis=1), train_df[RedfinModel.TARGET_COLUMN])
#     if mean_score < min_score:
#         print('Retraining model')
#         redfin.train_from_raw(train_df, train_df[RedfinModel.TARGET_COLUMN])
#     times += 1
# if mean_score < min_score:
#     print('Max retraining attempts reached')
#     sys.exit(1)

# 5-fold cross-validation (cv=5 inside show_cross_validation); prints each fold's
# score and returns their mean.
mean_score = redfin.show_cross_validation(train_df.drop(RedfinModel.TARGET_COLUMN, axis=1), train_df[RedfinModel.TARGET_COLUMN])


Cross-validation scores: [0.40286997 0.26389221 0.03843838 0.2982188  0.44601599]
Average cross-validation score: 0.29


In [460]:
# Load the for-sale listings and score them with the trained model.
# fetch_data already applies the price and column filters, so no extra filtering step is needed.
test_df = redfin.fetch_data(listing_type='for_sale')
results = redfin.predict(X=test_df)

Using cached data for Plymouth County, MA for_sale as of 2024-01-04
filtering column: beds allowed_values: [3, 4, 5]
Filtered data shape: (60, 29) (from (465, 29))
Dropping columns: ['style=FARM']
Adding columns: ['city=Allerton' 'city=Brockton' 'city=Bryantville' 'city=Cedarville'
 'city=Chiltonville' 'city=Crow Point' 'city=East Marion' 'city=Egypt'
 'city=Ellisville' 'city=Green Harbor' 'city=Hanover Center' 'city=Hanson'
 'city=Hingham Center' 'city=Humarock' 'city=Indian Pond' 'city=Lakeville'
 'city=Manomet' 'city=Marshfield Hills' 'city=Minot' 'city=North Hanover'
 'city=North Lakeville' 'city=North Marshfield' 'city=North Pembroke'
 'city=North Scituate' 'city=Ocean Bluff' 'city=Onset' 'city=Pembroke'
 'city=Pinehills' 'city=Plympton' 'city=Sand Hills' 'city=Scituate Harbor'
 'city=Second Cliff' 'city=South Duxbury' 'city=South Hanover'
 'city=South Hingham' 'city=South Middleborough' 'city=South Plymouth'
 'city=The Village' 'city=Third Cliff' 'city=Tinkertown' 'city=Weir Rive

In [461]:
# Find the rows with the biggest mismatch between list price and predicted price.
# `assign` builds a new frame instead of writing columns into the filtered frame
# returned by fetch_data, which avoids pandas' SettingWithCopyWarning and keeps
# the cell safe to re-run. Later keyword arguments may use columns created by
# earlier ones.
test_df = test_df.assign(
    predicted=results,
    diff=lambda frame: frame['predicted'] - frame['list_price'],
    diff_percent=lambda frame: frame['diff'] / frame['list_price'] * 100,
    # zip_code is intentionally left out of the readable address.
    readable_address=lambda frame: frame['street'] + ', ' + frame['city'] + ', ' + frame['state'],
    # Placeholder: per-row feature attribution is not implemented yet.
    top_contributing_columns='',
)
test_df.sort_values(by=['diff_percent'], ascending=False).head(10)


Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,...,stories,hoa_fee,parking_garage,primary_photo,alt_photos,predicted,diff,diff_percent,readable_address,top_contributing_columns
250,https://www.realtor.com/realestateandhomes-det...,BSMA,73171778,FOR_SALE,SINGLE_FAMILY,128 Brandt Island Rd,,Mattapoisett,MA,2739,...,,0.0,2.0,http://ap.rdcpix.com/5bc598c96598b216fe0ed050a...,http://ap.rdcpix.com/5bc598c96598b216fe0ed050a...,1234679.12,405679.12,48.94,"128 Brandt Island Rd, Mattapoisett, MA",
229,https://www.realtor.com/realestateandhomes-det...,BSMA,73173507,FOR_SALE,SINGLE_FAMILY,6 Kensington,,Plymouth,MA,2360,...,,318.0,2.0,http://ap.rdcpix.com/d35ed059dee4bd110c0674a08...,http://ap.rdcpix.com/d35ed059dee4bd110c0674a08...,1152086.94,312186.94,37.17,"6 Kensington, Plymouth, MA",
155,https://www.realtor.com/realestateandhomes-det...,BSMA,73187057,FOR_SALE,SINGLE_FAMILY,42 Brook St,,Scituate,MA,2066,...,,0.0,,http://ap.rdcpix.com/736cc1b17859086a40ac5672c...,http://ap.rdcpix.com/736cc1b17859086a40ac5672c...,1179471.93,284471.93,31.78,"42 Brook St, Scituate, MA",
254,https://www.realtor.com/realestateandhomes-det...,BSMA,73171229,FOR_SALE,SINGLE_FAMILY,385 Hanover St,,Hanover,MA,2339,...,,0.0,2.0,http://ap.rdcpix.com/3fd3bed78438344dbe81b6756...,http://ap.rdcpix.com/3fd3bed78438344dbe81b6756...,1367084.87,318084.87,30.32,"385 Hanover St, Hanover, MA",
34,https://www.realtor.com/realestateandhomes-det...,BSMA,73190243,FOR_SALE,SINGLE_FAMILY,25 Kingstown Way,,Duxbury,MA,2332,...,,0.0,3.0,http://ap.rdcpix.com/c1d6e50c580f62e8be8ad8c99...,http://ap.rdcpix.com/c1d6e50c580f62e8be8ad8c99...,1455564.42,330564.42,29.38,"25 Kingstown Way, Duxbury, MA",
279,https://www.realtor.com/realestateandhomes-det...,BSMA,73168894,FOR_SALE,SINGLE_FAMILY,10 Huckleberry Way,,Marion,MA,2738,...,,0.0,2.0,http://ap.rdcpix.com/065dbdac6c5c22a94e1d3dd47...,http://ap.rdcpix.com/065dbdac6c5c22a94e1d3dd47...,1038413.84,233513.84,29.01,"10 Huckleberry Way, Marion, MA",
109,https://www.realtor.com/realestateandhomes-det...,BSMA,73188449,FOR_SALE,CONDOS,11 Rachels Way,Unit 11,Scituate,MA,2066,...,2.0,626.0,1.0,http://ap.rdcpix.com/341525b57d2a31c5f7429a7be...,http://ap.rdcpix.com/341525b57d2a31c5f7429a7be...,1116314.68,226414.68,25.44,"11 Rachels Way, Scituate, MA",
323,https://www.realtor.com/realestateandhomes-det...,BSMA,73185119,FOR_SALE,CONDOS,15 Endicott Gln,Unit 15,Plymouth,MA,2360,...,3.0,1197.0,2.0,http://ap.rdcpix.com/9c3104132d6bad99629df5579...,http://ap.rdcpix.com/9c3104132d6bad99629df5579...,1033010.07,208010.07,25.21,"15 Endicott Gln, Plymouth, MA",
249,https://www.realtor.com/realestateandhomes-det...,BSMA,73171792,FOR_SALE,SINGLE_FAMILY,11 Webster Reach,,Plymouth,MA,2360,...,,388.0,2.0,http://ap.rdcpix.com/2b01531d7530f75912bf8133c...,http://ap.rdcpix.com/2b01531d7530f75912bf8133c...,1056920.42,207020.42,24.36,"11 Webster Reach, Plymouth, MA",
159,https://www.realtor.com/realestateandhomes-det...,BSMA,73187036,FOR_SALE,SINGLE_FAMILY,110 High St,,Hingham,MA,2043,...,,0.0,1.0,http://ap.rdcpix.com/0de49298a1e036bf972d7c9a7...,http://ap.rdcpix.com/0de49298a1e036bf972d7c9a7...,1226638.36,231638.36,23.28,"110 High St, Hingham, MA",


In [462]:
print(redfin.trained_columns)
# Render floats with two fixed decimals rather than exponent notation.
pd.set_option('display.float_format', '{:.2f}'.format)
def make_clickable(val):
    """Return an HTML anchor whose link target and label are both ``val``.

    The value is HTML-escaped before interpolation: listing URLs come from
    scraped data, and characters such as ``&``, ``"`` or ``<`` would otherwise
    produce malformed (or injectable) markup.

    @param val URL (or any value; it is converted with str()) to link to
    @return an ``<a>`` element string
    """
    safe_val = html.escape(str(val), quote=True)
    # target _blank to open new window
    return '<a target="_blank" href="{}">{}</a>'.format(safe_val, safe_val)

# A Styler is only rendered when it is the cell's last expression, so building one
# mid-cell and discarding it has no effect. To show clickable listing links, add
# 'property_url' to the selected columns and finish the cell with
# `.style.format({'property_url': make_clickable})`.
print(f"===\nPredictions ({test_df.shape[0]})\n===")
test_df[RedfinModel.OUTPUT_COLUMNS].sort_values(by=['diff_percent'], ascending=False).head(25)


['beds' 'full_baths' 'half_baths' 'hoa_fee' 'lot_sqft' 'parking_garage'
 'sqft' 'stories' 'year_built' 'city=Abington' 'city=Allerton'
 'city=Bridgewater' 'city=Brockton' 'city=Bryantville' 'city=Carver'
 'city=Cedarville' 'city=Chiltonville' 'city=Crow Point' 'city=Duxbury'
 'city=East Bridgewater' 'city=East Marion' 'city=Egypt' 'city=Ellisville'
 'city=Green Harbor' 'city=Halifax' 'city=Hanover' 'city=Hanover Center'
 'city=Hanson' 'city=Hingham' 'city=Hingham Center' 'city=Hull'
 'city=Humarock' 'city=Indian Pond' 'city=Kingston' 'city=Lakeville'
 'city=Manomet' 'city=Marion' 'city=Marshfield' 'city=Marshfield Hills'
 'city=Mattapoisett' 'city=Middleboro' 'city=Minot' 'city=North Hanover'
 'city=North Lakeville' 'city=North Marshfield' 'city=North Pembroke'
 'city=North Scituate' 'city=Norwell' 'city=Ocean Bluff' 'city=Onset'
 'city=Pembroke' 'city=Pinehills' 'city=Plymouth' 'city=Plympton'
 'city=Rochester' 'city=Sand Hills' 'city=Scituate' 'city=Scituate Harbor'
 'city=Second Cli

Unnamed: 0,readable_address,style,beds,list_price,predicted,diff,diff_percent,top_contributing_columns
250,"128 Brandt Island Rd, Mattapoisett, MA",SINGLE_FAMILY,4.0,829000,1234679.12,405679.12,48.94,
229,"6 Kensington, Plymouth, MA",SINGLE_FAMILY,3.0,839900,1152086.94,312186.94,37.17,
155,"42 Brook St, Scituate, MA",SINGLE_FAMILY,5.0,895000,1179471.93,284471.93,31.78,
254,"385 Hanover St, Hanover, MA",SINGLE_FAMILY,5.0,1049000,1367084.87,318084.87,30.32,
34,"25 Kingstown Way, Duxbury, MA",SINGLE_FAMILY,4.0,1125000,1455564.42,330564.42,29.38,
279,"10 Huckleberry Way, Marion, MA",SINGLE_FAMILY,4.0,804900,1038413.84,233513.84,29.01,
109,"11 Rachels Way, Scituate, MA",CONDOS,3.0,889900,1116314.68,226414.68,25.44,
323,"15 Endicott Gln, Plymouth, MA",CONDOS,3.0,825000,1033010.07,208010.07,25.21,
249,"11 Webster Reach, Plymouth, MA",SINGLE_FAMILY,3.0,849900,1056920.42,207020.42,24.36,
159,"110 High St, Hingham, MA",SINGLE_FAMILY,4.0,995000,1226638.36,231638.36,23.28,


In [463]:
# List every (column, weight) pair reported by the trained model, largest weight first.
importances = redfin.print_feature_importances()
print(f"===\nFeature Importances ({len(importances)})\n===")
for column_and_weight in importances:
    print(column_and_weight)



===
Feature Importances (76)
===
('city=Hingham Center', 661010.823991714)
('city=North Scituate', 625262.6078598736)
('city=Minot', 542524.5964369492)
('city=Tinkertown', 524277.8219302253)
('city=Weir River', 471543.87364593166)
('city=Third Cliff', 349738.74062309606)
('city=Humarock', 323729.81073242665)
('city=Crow Point', 316726.19357305)
('city=Allerton', 311560.39731620386)
('city=Hingham', 269231.5818663686)
('city=Mattapoisett', 266562.73234628025)
('city=Manomet', 248578.06261078216)
('city=South Hingham', 234370.14799191387)
('city=Second Cliff', 229761.99420939822)
('city=Duxbury', 177994.52128665434)
('city=North Marshfield', 166830.41234964784)
('city=Scituate', 162869.58172105212)
('city=Norwell', 145998.48161238388)
('city=Egypt', 100667.9230535393)
('city=Scituate Harbor', 97085.48836088613)
('city=Hull', 95525.67147256792)
('style=SINGLE_FAMILY', 92886.00685424785)
('city=Plympton', 89741.01891086293)
('city=Wareham', 63863.13667619624)
('full_baths', 58057.210737892

In [464]:
# test_df.head()
# Show the distinct property styles present in the for-sale listings.
print(set(test_df['style']))

{'MULTI_FAMILY', 'CONDOS', 'FARM', 'SINGLE_FAMILY'}
