In [1]:
# Author:       Andrey Norin
# Title :       Scrape Nassau County Real Estate Listings
# Date Created: 11/20/2022
# Date Updated: 11/26/2022
# Based on this tutorial: https://medium.com/@knappik.marco/python-web-scraping-how-to-scrape-the-api-of-a-real-estate-website-dc8136e56249

In [2]:
from datetime import date
import requests 
import json 
import pandas as pd

In [3]:
# Export file details: the CSV is named "<county>_<YYYY-MM-DD>.csv",
# stamped with the date on which this cell runs.
countyName = "Nassau"
todaysDate = date.today().isoformat()
csvFileName = f"{countyName}_{todaysDate}.csv"

In [4]:
def send_request(page_number: int, offset_parameter: int):
    """Request one page of Nassau County, NY for-sale listings from realtor.com.

    The GraphQL payload is parsed from a fixed JSON template, the paging
    values are written into it, and the result is POSTed as JSON.

    Args:
        page_number: Written to ``variables.page_index`` and to the top-level
            ``seoPayload`` field of the payload.
        offset_parameter: Written to ``variables.offset`` of the payload.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within the timeout.
    """
    url = "https://www.realtor.com/api/v1/hulk?client_id=rdc-x&schema=vesta"
    headers = {"content-type": "application/json"}

    # NOTE(review): the query text below declares only `$query` and hard-codes
    # `limit: 20`; `page_index` and `offset` are sent as extra variables that
    # the query itself never references. Confirm the API actually paginates
    # on them, otherwise every call may return the same page.
    body = r'{"query":"\nquery ConsumerSearchQuery($query: HomeSearchCriteria!) {\n  rich_snippets: home_search(query: $query,\n    sort:{field: home_phrases_score, direction: desc},\n    limit: 20,\n  ){\n    count\n    total\n    results {\n      property_id\n      list_price\n      description {\n        beds\n        baths\n        baths_full\n        baths_half\n        baths_3qtr\n        sqft\n      }\n      primary_photo(https: true) {\n        href\n      }\n      permalink\n      href\n      flags {\n        is_pending\n        is_foreclosure\n        is_contingent\n        is_new_construction\n        is_new_listing (days: 14)\n        is_price_reduced (days: 30)\n        is_plan\n      }\n      location {\n        address {\n          city\n          country\n          line\n          postal_code\n          state_code\n          state\n        }\n      }\n      open_houses {\n        start_date\n        end_date\n      }\n      sentences_list: highlights(highlight_type: phrase_srp_spl_homes) {\n        ... on HighlightPhrase {\n          phrase\n          text\n        }\n      }\n    }\n  }\n}","callfrom":"SRP","nrQueryType":"SPECIAL_FEATURES","cacheKey":"SPECIAL_FEATURES","cacheParams":"/special_features/Nassau-County_NY","variables":{"query":{"search_location":{"location":"Nassau County, NY"},"status":"for_sale"}},"isClient":true}'
    json_body = json.loads(body)

    json_body["variables"]["page_index"] = page_number
    json_body["seoPayload"] = page_number
    json_body["variables"]["offset"] = offset_parameter

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the whole multi-page scrape.
    r = requests.post(url=url, json=json_body, headers=headers, timeout=30)
    # Fail here, with the HTTP status in the message, instead of later when an
    # error page cannot be decoded or lacks the expected keys.
    r.raise_for_status()
    return r.json()

In [5]:
# Download the raw JSON payload for pages 1..104; the responses are only
# collected here and are parsed in a later cell.
json_data_list = []
offset_parameter = 0

# NOTE(review): the offset advances by 42 per page, while the GraphQL query in
# send_request hard-codes `limit: 20`. If the API honours the offset, listings
# would be skipped between pages -- confirm the intended page size.
for page_number in range(1, 105):
    json_data = send_request(
        page_number=page_number,
        offset_parameter=offset_parameter,
    )
    json_data_list += [json_data]
    offset_parameter = offset_parameter + 42

In [6]:
def extract_features(entry: dict):
    """Flatten one listing from the API response into a single-level dict.

    Nested objects that are missing or null (``description``, ``location``,
    ``location.address``) are treated as empty, so the corresponding fields
    come back as ``None`` instead of raising and aborting the whole run.

    Args:
        entry: One element of the ``results`` list of a response.

    Returns:
        A dict with the keys ``id``, ``price``, ``beds``, ``baths``, ``sqft``,
        ``address``, ``postal_code``, ``state`` and ``city``, in that order.
        Any value not present in ``entry`` is ``None``.
    """
    # `or {}` covers both an absent key and an explicit null value.
    description = entry.get("description") or {}
    location = entry.get("location") or {}
    address = location.get("address") or {}

    feature_dict = {
        "id": entry.get("property_id"),
        "price": entry.get("list_price"),
        "beds": description.get("beds"),
        "baths": description.get("baths"),
        "sqft": description.get("sqft"),
        "address": address.get("line"),
        "postal_code": address.get("postal_code"),
        "state": address.get("state_code"),
        "city": address.get("city"),
    }

    return feature_dict

In [7]:
# Flatten every listing of every downloaded page into one record per listing,
# then build the DataFrame once from the complete list of records.
feature_dict_list = []

for page_payload in json_data_list:
    page_results = page_payload['data']['rich_snippets']['results']
    feature_dict_list.extend(
        extract_features(entry=listing) for listing in page_results
    )

df = pd.DataFrame(feature_dict_list)

In [8]:
# Write the scraped listings to the dated CSV file; the DataFrame index is
# written as the first (unnamed) column.
df.to_csv(path_or_buf=csvFileName)

In [9]:
# Preview the first 100 scraped listings as a sanity check.
df.head(n=100)

Unnamed: 0,id,price,beds,baths,sqft,address,postal_code,state,city
0,4273064857,879000,3,2,,42 Wakefield Ave,11050,NY,Port Washington
1,3499438412,685000,4,3,,31 James St,11040,NY,New Hyde Park
2,4927661252,1099999,5,3,2900.0,1270 Lakeshore Dr,11762,NY,Massapequa Park
3,9673513342,470000,3,3,,280 Franklin Ave Apt 13,11010,NY,Franklin Square
4,3948887720,795000,3,3,2505.0,200 Montauk St,11580,NY,Valley Stream
...,...,...,...,...,...,...,...,...,...
95,4323902633,1580000,4,3,3016.0,48A Nassau Dr,11021,NY,Great Neck
96,4060395644,829000,6,3,,94 W Choir Ln,11590,NY,Westbury
97,3942169181,1288000,3,3,2100.0,118 Overlook Ter,11577,NY,East Hills
98,4902996396,1698888,5,3,,30 Virginia St,11580,NY,Valley Stream
