In [154]:
from datetime import datetime
from homeharvest import scrape_property
import numpy as np
import pandas as pd
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor

# dict vectorizer
from sklearn.feature_extraction import DictVectorizer

ACTIVE_MODEL = GradientBoostingRegressor

# TARGET_LOCATION = 'Plymouth County, MA'
TARGET_LOCATION = 'Rockingham County, NH'
# TARGET_LOCATION = 'Suffolk County, MA'
# TARGET_LOCATION = 'Essex County, MA'
# TARGET_LOCATION = 'Pierce County, WA'
# TARGET_LOCATION = 'Middlesex County, MA'
DATA_FOLDER = './data'

DAYS_OF_SOLD_HISTORY = 210
MIN_PRICE = 800000
MAX_PRICE = 2*10**6

print('ready')

ready


In [155]:
class RedfinModel:

    TARGET_COLUMN = 'sold_price'
    COLUMNS_TO_ONE_HOT_ENCODE = ['state', 'style', 'city']
    COLUMNS_TO_REMOVE = ['zip_code', 'last_sold_date', 'mls_id', 'list_price', 'latitude', 'longitude', 'days_on_mls', 'price_per_sqft', TARGET_COLUMN]
    OUTPUT_COLUMNS = ['readable_address', 'style', 'beds', 'list_price', 'predicted', 'diff', 'diff_percent', 'property_url']

    def __init__(self, location=TARGET_LOCATION, column_filters={}):
        self.model = None
        self.model_type = ACTIVE_MODEL
        self.data_folder = DATA_FOLDER
        self.location = location
        self.column_filters = column_filters


    def fetch_data(self, listing_type="sold"):
        # Generate filename based on current timestamp
        # current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        today = datetime.today().strftime('%Y-%m-%d')
        location = self.location
        filename = f"{self.data_folder}/{today}|{location}|{listing_type}.csv"

        # check if data already exists in data folder
        if os.path.exists(filename):
            print(f"Using cached data for {location} {listing_type} as of {today}")
            # return data
            return self._filter_data(pd.read_csv(filename))

        past_days = DAYS_OF_SOLD_HISTORY if listing_type == 'sold' else 90

        properties = scrape_property(
          location=location,
          listing_type=listing_type,  # or (for_sale, for_rent, pending)
          past_days=past_days,  # ex: sold in last 30 days - listed in last 30 days if (for_sale, for_rent)

          # date_from="2023-05-01", # alternative to past_days
          # date_to="2023-05-28",

          # mls_only=True,  # only fetch MLS listings
          # proxy="http://user:pass@host:port"  # use a proxy to change your IP address
        )
        print(f"Fetched properties ({len(properties)}): {location} {listing_type}")
        # Export to csv
        properties.to_csv(filename, index=False)
        return self._filter_data(properties)

    def encode_onehot(self, df, cols):
        """
        One-hot encoding is applied to columns specified in a pandas DataFrame.

        Modified from: https://gist.github.com/kljensen/5452382

        Details:

        http://en.wikipedia.org/wiki/One-hot
        http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

        @param df pandas DataFrame
        @param cols a list of columns to encode
        @return a DataFrame with one-hot encoding
        """
        vec = DictVectorizer()

        vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
        vec_data.columns = vec.get_feature_names_out()
        vec_data.index = df.index

        df = df.drop(cols, axis=1)
        df = df.join(vec_data)

        return df

    def _filter_data(self, data):
        original_shape = data.shape
        # Remove out of range values
        # if 'sold_price' in data.columns.values:
        #     data = data[(data['sold_price'] > MIN_PRICE) & (data['sold_price'] < MAX_PRICE)]
        if 'list_price' in data.columns.values:
            data = data[(data['list_price'] > MIN_PRICE) & (data['list_price'] < MAX_PRICE)]

        for column in self.column_filters:
            # check if value in column filters values
            if column in data.columns.values:
                allowed_values = self.column_filters[column]
                print('filtering column:', column, 'allowed_values:', allowed_values)
                data = data[data[column].isin(allowed_values)]



        print(f"Filtered data shape: {data.shape} (from {original_shape})")
        return data

    def process_data(self, data):
        numeric_cols = data.select_dtypes(include=np.number).columns.values
        columns_to_use = np.concatenate((numeric_cols, RedfinModel.COLUMNS_TO_ONE_HOT_ENCODE))
        columns_to_use = np.setdiff1d(columns_to_use, RedfinModel.COLUMNS_TO_REMOVE)
        data = data[columns_to_use]
        data = self.encode_onehot(data, RedfinModel.COLUMNS_TO_ONE_HOT_ENCODE)
        # drop original unencoded columns if present

        # Fill missing values or NaN
        data = data.fillna(0)
        print('Using columns:', data.columns.values)
        print(f"Processed data shape: {data.shape}")
        print(f"Processed data columns: {len(data.columns.values)}")
        return data

    def show_cross_validation(self, X, y):
        # use cross_val_score
        train = self.process_data(X)
        scores = cross_val_score(self.model, train, y, cv=5)
        print("Cross-validation scores: {}".format(scores))
        print("Average cross-validation score: {:.2f}".format(scores.mean()))


    def train_from_raw(self, X, y):
        train = self.process_data(X)
        self.model = self.model_type()
        self.trained_columns = train.columns.values
        self.model.fit(train, y)
        return self.model

    def predict(self, X):
        if not self.model:
            raise Exception("Model not trained")
        test = self.process_data(X)
        # Drop any columns that are not in the training data
        dropped_columns = np.setdiff1d(test.columns.values, self.trained_columns)
        print(f"Dropping columns: {dropped_columns}")
        test = test.drop(dropped_columns, axis=1)
        # Add columns that are in the training data but not in the test data
        missing_columns = np.setdiff1d(self.trained_columns, test.columns.values)
        print(f"Adding columns: {missing_columns}")
        for column in missing_columns:
            test[column] = 0

        # Reorder columns to match training data
        test = test[self.trained_columns]

        pred = self.model.predict(test)
        print(f"Predicted {len(pred)} values")
        return pred

    def print_feature_importances(self):
        if not self.model:
            raise Exception("Model not trained")
        try:
            importances = self.model.feature_importances_
        except Exception as e:
            importances = self.model.coef_
        # Zip with columns and order by importance
        importances = list(zip(self.trained_columns, importances))
        importances.sort(key=lambda x: x[1], reverse=True)
        return importances

In [156]:
redfin = RedfinModel(TARGET_LOCATION, {
  'style': ['SINGLE_FAMILY', 'TOWNHOUSE'],
  'beds': [3, 4, 5]
})
train_df = redfin.fetch_data('sold')
#
train_df.head()

Fetched properties (235): Rockingham County, NH sold
filtering column: style allowed_values: ['SINGLE_FAMILY', 'TOWNHOUSE']
filtering column: beds allowed_values: [3, 4, 5]
Filtered data shape: (15, 29) (from (235, 29))


Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,...,last_sold_date,lot_sqft,price_per_sqft,latitude,longitude,stories,hoa_fee,parking_garage,primary_photo,alt_photos
6,https://www.realtor.com/realestateandhomes-det...,BSMA,73180483,SOLD,SINGLE_FAMILY,4 Hemlock Ln,,Salem,NH,3079,...,2023-12-15,26572,234,42.76,-71.21,,0,2,http://ap.rdcpix.com/4ae8fefaf952e39cd548e7f49...,http://ap.rdcpix.com/4ae8fefaf952e39cd548e7f49...
8,https://www.realtor.com/realestateandhomes-det...,BSMA,73153735,SOLD,SINGLE_FAMILY,10 Washington Rd,,Windham,NH,3087,...,2023-12-15,273557,295,42.77,-71.32,,0,2,http://ap.rdcpix.com/a6322e5bef8dd5fa6aad59a8b...,http://ap.rdcpix.com/a6322e5bef8dd5fa6aad59a8b...
29,https://www.realtor.com/realestateandhomes-det...,BSMA,73135253,SOLD,SINGLE_FAMILY,6 Marcona Ln,Lot 2,Londonderry,NH,3053,...,2023-11-17,50094,274,,,,0,2,http://ap.rdcpix.com/d836c187ffa667e1c0f13e1b1...,http://ap.rdcpix.com/d836c187ffa667e1c0f13e1b1...
38,https://www.realtor.com/realestateandhomes-det...,BSMA,73155383,SOLD,SINGLE_FAMILY,4 Spicket Valley Dr,,Atkinson,NH,3811,...,2023-11-03,91040,196,42.83,-71.19,,0,3,http://ap.rdcpix.com/a8a1be0d640c0b3228339c202...,http://ap.rdcpix.com/a8a1be0d640c0b3228339c202...
58,https://www.realtor.com/realestateandhomes-det...,BSMA,73127482,SOLD,SINGLE_FAMILY,1 Marcona Ln,,Londonderry,NH,3053,...,2023-10-05,57935,292,,,,0,2,http://ap.rdcpix.com/7bb5a3696cf5f823c6208fadf...,http://ap.rdcpix.com/7bb5a3696cf5f823c6208fadf...


In [157]:
train_df.columns.values
# one hot encode columns

array(['property_url', 'mls', 'mls_id', 'status', 'style', 'street',
       'unit', 'city', 'state', 'zip_code', 'beds', 'full_baths',
       'half_baths', 'sqft', 'year_built', 'days_on_mls', 'list_price',
       'list_date', 'sold_price', 'last_sold_date', 'lot_sqft',
       'price_per_sqft', 'latitude', 'longitude', 'stories', 'hoa_fee',
       'parking_garage', 'primary_photo', 'alt_photos'], dtype=object)

In [158]:
redfin.train_from_raw(train_df, train_df[RedfinModel.TARGET_COLUMN])


Using columns: ['city=Atkinson' 'city=Auburn' 'city=Londonderry' 'city=Plaistow'
 'city=Rye' 'city=Salem' 'city=Windham' 'state=NH' 'style=SINGLE_FAMILY']
Processed data shape: (15, 9)
Processed data columns: 9


In [159]:
# cross validation
redfin.show_cross_validation(train_df.drop(RedfinModel.TARGET_COLUMN, axis=1), train_df[RedfinModel.TARGET_COLUMN])

Using columns: ['city=Atkinson' 'city=Auburn' 'city=Londonderry' 'city=Plaistow'
 'city=Rye' 'city=Salem' 'city=Windham' 'state=NH' 'style=SINGLE_FAMILY']
Processed data shape: (15, 9)
Processed data columns: 9
Cross-validation scores: [-2.3203563  -0.41786482 -0.35291801  0.05077881 -1.14628932]
Average cross-validation score: -0.84


In [160]:
test_df = redfin.fetch_data('for_sale')
# test_df = redfin.filter_data(test_df)
results = redfin.predict(test_df)

Fetched properties (312): Rockingham County, NH for_sale
filtering column: style allowed_values: ['SINGLE_FAMILY', 'TOWNHOUSE']
filtering column: beds allowed_values: [3, 4, 5]
Filtered data shape: (35, 29) (from (312, 29))
Using columns: ['city=Atkinson' 'city=Candia' 'city=Chester' 'city=Derry' 'city=Exeter'
 'city=Fremont' 'city=Hampton' 'city=Hampton Falls' 'city=Kensington'
 'city=Londonderry' 'city=Newmarket' 'city=North Hampton' 'city=Plaistow'
 'city=Portsmouth' 'city=Salem' 'city=Sandown' 'city=Seabrook'
 'city=Stratham' 'city=Windham' 'state=NH' 'style=SINGLE_FAMILY']
Processed data shape: (35, 21)
Processed data columns: 21
Dropping columns: ['city=Candia' 'city=Chester' 'city=Derry' 'city=Exeter' 'city=Fremont'
 'city=Hampton' 'city=Hampton Falls' 'city=Kensington' 'city=Newmarket'
 'city=North Hampton' 'city=Portsmouth' 'city=Sandown' 'city=Seabrook'
 'city=Stratham']
Adding columns: ['city=Auburn' 'city=Rye']
Predicted 35 values


In [161]:
# Find rows with biggest mismatch between listing price and predicted predicted
test_df['predicted'] = results
test_df['diff'] = test_df['predicted'] - test_df['list_price']
test_df['diff_percent'] = test_df['diff'] / test_df['list_price'] * 100
test_df['readable_address'] = test_df['street'] + ', ' + test_df['city'] + ', ' + test_df['state']# + ' ' + str(test_df['zip_code'])
test_df.sort_values(by=['diff_percent'], ascending=False).head(10)


Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,...,longitude,stories,hoa_fee,parking_garage,primary_photo,alt_photos,predicted,diff,diff_percent,readable_address
94,https://www.realtor.com/realestateandhomes-det...,NHVT,4980154,FOR_SALE,SINGLE_FAMILY,64 Lane Rd,,Candia,NH,3034,...,,2.0,,2.0,http://ap.rdcpix.com/66f5562aaf99f21c69de13361...,http://ap.rdcpix.com/66f5562aaf99f21c69de13361...,942472.05,112572.05,13.56,"64 Lane Rd, Candia, NH"
308,https://www.realtor.com/realestateandhomes-det...,BSMA,73169929,FOR_SALE,SINGLE_FAMILY,2 Marcona Ln,Lot 39-0,Londonderry,NH,3053,...,-71.39,,0.0,2.0,http://ap.rdcpix.com/99e5d9a5783e3f334dc1ed0f1...,http://ap.rdcpix.com/99e5d9a5783e3f334dc1ed0f1...,945958.55,106058.55,12.63,"2 Marcona Ln, Londonderry, NH"
29,https://www.realtor.com/realestateandhomes-det...,NHVT,4981189,FOR_SALE,SINGLE_FAMILY,Bassett Ln,Lot 2,Fremont,NH,3844,...,,2.0,,2.0,http://ap.rdcpix.com/15f687088caa312dbcbdc63db...,http://ap.rdcpix.com/15f687088caa312dbcbdc63db...,942472.05,93472.05,11.01,"Bassett Ln, Fremont, NH"
244,https://www.realtor.com/realestateandhomes-det...,NHVT,4976867,FOR_SALE,SINGLE_FAMILY,401C Ocean Blvd,,Hampton,NH,3842,...,-70.81,2.0,,,http://ap.rdcpix.com/bb23df213f6fa2c0800d72dcd...,http://ap.rdcpix.com/bb23df213f6fa2c0800d72dcd...,942472.05,92972.05,10.94,"401C Ocean Blvd, Hampton, NH"
299,https://www.realtor.com/realestateandhomes-det...,NHVT,4974612,FOR_SALE,SINGLE_FAMILY,3 Pine Knoll Dr,,Atkinson,NH,3811,...,-71.14,1.0,,2.0,http://ap.rdcpix.com/58ca1c4274cb87c3caff619ac...,http://ap.rdcpix.com/58ca1c4274cb87c3caff619ac...,981635.82,87635.82,9.8,"3 Pine Knoll Dr, Atkinson, NH"
155,https://www.realtor.com/realestateandhomes-det...,NHVT,4979157,FOR_SALE,SINGLE_FAMILY,60 Montana Dr,,Sandown,NH,3873,...,,2.0,,3.0,http://ap.rdcpix.com/fe0091ce86441160375d0c75a...,http://ap.rdcpix.com/fe0091ce86441160375d0c75a...,942472.05,63472.05,7.22,"60 Montana Dr, Sandown, NH"
106,https://www.realtor.com/realestateandhomes-det...,NHVT,4980022,FOR_SALE,SINGLE_FAMILY,32 Daniel Rd,Lot 58-05,Derry,NH,3038,...,-71.33,1.0,,2.0,http://ap.rdcpix.com/1ce753260d2b6d7f9b40e8706...,http://ap.rdcpix.com/1ce753260d2b6d7f9b40e8706...,942472.05,62572.05,7.11,"32 Daniel Rd, Derry, NH"
294,https://www.realtor.com/realestateandhomes-det...,NHVT,4974920,FOR_SALE,SINGLE_FAMILY,159 Ashworth Ave,,Hampton,NH,3842,...,-70.81,2.0,,,http://ap.rdcpix.com/bbe4b3411489c36771b5bb5fd...,http://ap.rdcpix.com/bbe4b3411489c36771b5bb5fd...,942472.05,57472.05,6.49,"159 Ashworth Ave, Hampton, NH"
103,https://www.realtor.com/realestateandhomes-det...,NHVT,4980053,FOR_SALE,SINGLE_FAMILY,147 Lafayette Rd,,Hampton Falls,NH,3844,...,-70.85,2.0,,,http://ap.rdcpix.com/050d2f82672e30211af90a723...,http://ap.rdcpix.com/050d2f82672e30211af90a723...,942472.05,43472.05,4.84,"147 Lafayette Rd, Hampton Falls, NH"
231,https://www.realtor.com/realestateandhomes-det...,NHVT,4977407,FOR_SALE,SINGLE_FAMILY,1247 South St,,Portsmouth,NH,3801,...,-70.77,3.0,,2.0,http://ap.rdcpix.com/c70dd79417093e9056fc3c138...,http://ap.rdcpix.com/c70dd79417093e9056fc3c138...,942472.05,43472.05,4.84,"1247 South St, Portsmouth, NH"


In [162]:
print(redfin.trained_columns)
# remove exponent formatting
pd.set_option('display.float_format', lambda x: '%.2f' % x)
def make_clickable(val):
    # target _blank to open new window
    return '<a target="_blank" href="{}">{}</a>'.format(val, val)

test_df.style.format({'property_url': make_clickable})
print(f"===\nPredictions ({test_df.shape[0]})\n===")
test_df[RedfinModel.OUTPUT_COLUMNS].sort_values(by=['diff_percent'], ascending=False).head(25)


['city=Atkinson' 'city=Auburn' 'city=Londonderry' 'city=Plaistow'
 'city=Rye' 'city=Salem' 'city=Windham' 'state=NH' 'style=SINGLE_FAMILY']
===
Predictions (35)
===


Unnamed: 0,readable_address,style,beds,list_price,predicted,diff,diff_percent,property_url
94,"64 Lane Rd, Candia, NH",SINGLE_FAMILY,3,829900,942472.05,112572.05,13.56,https://www.realtor.com/realestateandhomes-det...
308,"2 Marcona Ln, Londonderry, NH",SINGLE_FAMILY,4,839900,945958.55,106058.55,12.63,https://www.realtor.com/realestateandhomes-det...
29,"Bassett Ln, Fremont, NH",SINGLE_FAMILY,4,849000,942472.05,93472.05,11.01,https://www.realtor.com/realestateandhomes-det...
244,"401C Ocean Blvd, Hampton, NH",SINGLE_FAMILY,3,849500,942472.05,92972.05,10.94,https://www.realtor.com/realestateandhomes-det...
299,"3 Pine Knoll Dr, Atkinson, NH",SINGLE_FAMILY,4,894000,981635.82,87635.82,9.8,https://www.realtor.com/realestateandhomes-det...
155,"60 Montana Dr, Sandown, NH",SINGLE_FAMILY,4,879000,942472.05,63472.05,7.22,https://www.realtor.com/realestateandhomes-det...
106,"32 Daniel Rd, Derry, NH",SINGLE_FAMILY,3,879900,942472.05,62572.05,7.11,https://www.realtor.com/realestateandhomes-det...
294,"159 Ashworth Ave, Hampton, NH",SINGLE_FAMILY,5,885000,942472.05,57472.05,6.49,https://www.realtor.com/realestateandhomes-det...
103,"147 Lafayette Rd, Hampton Falls, NH",SINGLE_FAMILY,3,899000,942472.05,43472.05,4.84,https://www.realtor.com/realestateandhomes-det...
231,"1247 South St, Portsmouth, NH",SINGLE_FAMILY,4,899000,942472.05,43472.05,4.84,https://www.realtor.com/realestateandhomes-det...


In [163]:
importances = redfin.print_feature_importances()
print(f"===\nFeature Importances ({len(importances)})\n===")
for importance in importances:
    print(importance)



===
Feature Importances (9)
===
('city=Rye', 0.9515271550350086)
('city=Windham', 0.012764450278641858)
('city=Atkinson', 0.010885503097496982)
('city=Auburn', 0.010750404385125955)
('city=Plaistow', 0.008488850631840038)
('city=Salem', 0.0052896481348787185)
('city=Londonderry', 0.00029398843700772695)
('state=NH', 0.0)
('style=SINGLE_FAMILY', 0.0)


In [164]:
# test_df.head()
print(set(test_df['style']))

{'SINGLE_FAMILY'}
