In [None]:

from homeharvest import scrape_property
from datetime import datetime

# Timestamped CSV filename for this scrape run (only defined here; this cell
# does not write the file).
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"

# Reference example of a city-wide query, kept commented out:
# properties = scrape_property(
#     location="San Diego, CA",
#     listing_type="sold",  # or for_sale, for_rent, pending
#     past_days=365 * 5,  # sold within the last N days (listed within N days for for_sale / for_rent)
#     property_type=['single_family', 'multi_family'],
#     date_from="2023-05-01",  # alternative to past_days
#     date_to="2023-05-28",
#     foreclosure=True,
#     mls_only=True,  # only fetch MLS listings
# )

# Properties sold in the last 30 days around a single address.
properties = scrape_property(
    location="13724 Mulberry Dr, Whittier, CA 90605",
    radius=30.0,  # search radius in miles around the address
    listing_type="sold",  # the comma after this argument was missing (SyntaxError)
    past_days=30,
)
print(f"Number of properties: {len(properties)}")



Number of properties: 9244


In [None]:
from google.cloud import storage

# Project and bucket that every upload in this notebook targets.
GCP_PROJECT = 'acres-ai'
BUCKET_NAME = 'scraped-mls'

# Storage client bound to the project.
client = storage.Client(project=GCP_PROJECT)
# Handle to the target bucket; client.bucket() only builds a local reference
# and does not create the bucket.
bucket = client.bucket(BUCKET_NAME)


In [27]:
CENTRAL_LOCATION = "13724 Mulberry Dr, Whittier, CA 90605"

# Single-city smoke test: fetch sold and for-sale listings for one city using
# the same look-back window and detail level for both queries.
city = 'Alhambra, CA'
common_query = dict(location=city, past_days=360, extra_property_data=True)

sold_properties = scrape_property(listing_type="sold", **common_query)
listed_properties = scrape_property(listing_type="for_sale", **common_query)



In [None]:
import pandas as pd  # needed by read_csv/concat; was not in scope on a fresh kernel at this point

# Add ", CA" to each city name
list_of_cities = pd.read_csv('../city_list.csv')
list_of_cities['City Name'] = list_of_cities['City Name'] + ', CA'

# Initialize empty lists to store results
sold_properties_list = []
listed_properties_list = []


def _scrape_city(city, listing_type, label):
    """Scrape one city for one listing type.

    Args:
        city: Location string passed to ``scrape_property``.
        listing_type: Value for ``scrape_property``'s ``listing_type`` argument.
        label: Word used in the error message ("sold" or "listed").

    Returns:
        Whatever ``scrape_property`` returns, or ``None`` if it raised.
        Failures are printed rather than re-raised so that one bad city does
        not abort the whole run.
    """
    try:
        return scrape_property(
            location=city,
            listing_type=listing_type,
            past_days=360,
            extra_property_data=True,
        )
    except Exception as e:
        print(f"Error scraping {label} properties for {city}: {e}")
        return None


# Iterate through cities, collecting one result per city and listing type
for city in list_of_cities['City Name']:
    print(f"Processing {city}...")

    # Get sold properties
    sold = _scrape_city(city, "sold", "sold")
    if sold is not None:
        sold_properties_list.append(sold)

    # Get listed properties
    listed = _scrape_city(city, "for_sale", "listed")
    if listed is not None:
        listed_properties_list.append(listed)


Processing Alhambra, CA...
Processing Altadena, CA...
Processing Anaheim, CA...
Processing Anaheim Hills, CA...
Processing Arcadia, CA...
Processing Artesia, CA...
Processing Azusa, CA...
Processing Baldwin Park, CA...
Processing Brea, CA...
Processing Buena Park, CA...
Processing Chino, CA...
Processing Chino Hills, CA...
Processing CITY, CA...
Processing Claremont, CA...
Processing Covina, CA...
Processing Diamond Bar, CA...
Processing Downey, CA...
Processing Duarte, CA...
Processing El Monte, CA...
Processing Fountain Valley, CA...
Processing Fullerton, CA...
Processing Garden Grove, CA...
Processing Glendora, CA...
Processing Hacienda Heights, CA...
Processing Hacienda Hts, CA...
Processing La Canada Flintridge, CA...
Processing La Habra, CA...
Processing La Habra Heights, CA...
Processing La Mirada, CA...
Processing La Palma, CA...
Processing La Puente, CA...
Processing La Verne, CA...
Processing Lakewood, CA...
Processing Los Alamitos, CA...
Processing Midway City, CA...
Process

In [43]:

# Stack the per-city results into one frame per listing type.
# A frame stays None when nothing was collected for that listing type.
sold_properties_df = None
if sold_properties_list:
    sold_properties_df = pd.concat(sold_properties_list)

listed_properties_df = None
if listed_properties_list:
    listed_properties_df = pd.concat(listed_properties_list)


In [50]:
# Smallest value in the 'last_sold_date' column of the combined sold frame,
# i.e. how far back the scraped sales reach.
# NOTE(review): the displayed result is a quoted string, so this min() appears
# to be a string comparison; that matches chronological order only if every
# value is ISO "YYYY-MM-DD" formatted. Confirm.
sold_properties_df['last_sold_date'].min()


'2024-07-30'

In [51]:
import pandas as pd

# Upload each combined dataset to GCS as
#   homeharvest/<listing_status>/<YYYY>/<MM>/<DD>/<listing_status>_<timestamp>.csv

# Take a single timestamp for the whole run so the date folder and the
# filename can never disagree (two separate now() calls could straddle midnight).
run_time = datetime.now()
current_date = run_time.strftime("%Y/%m/%d")
current_timestamp = run_time.strftime("%Y%m%d_%H%M%S")

for properties_df, listing_status in [
    (sold_properties_df, "sold"),
    (listed_properties_df, "for_sale"),
]:
    # Nothing was scraped for this listing type; skip it.
    if properties_df is None:
        continue

    # Create folder structure and filename
    base_path = f"homeharvest/{listing_status}/{current_date}/"
    filename = f"{listing_status}_{current_timestamp}.csv"

    # Upload straight from memory instead of via a 'temp.csv' scratch file,
    # which was left behind in the working directory after every run.
    blob = bucket.blob(base_path + filename)
    blob.upload_from_string(
        properties_df.to_csv(index=False),
        content_type="text/csv",
    )






In [67]:
import json

# Add ", CA" to each city name
list_of_cities = pd.read_csv('../city_list.csv')
list_of_cities['City Name'] = list_of_cities['City Name'] + ', CA'

# Initialize empty lists to store results
sold_properties_list = []
listed_properties_list = []

# Destination prefix for the raw JSON dumps (same for every city)
base_path = "homeharvest/raw_json/sold_history/"

# Iterate through cities: scrape raw sold history and upload one JSON file per city
for city in list_of_cities['City Name']:
    print(f"Processing {city}...")

    # Reset on every iteration. Without this, a failed scrape left `sold`
    # holding the PREVIOUS city's data, which was then uploaded under the
    # failing city's filename.
    sold = None
    try:
        sold = scrape_property(
            location=city,
            listing_type="sold",
            past_days=3600,
            extra_property_data=True,
            return_type="raw",
        )
    except Exception as e:
        print(f"Error scraping sold properties for {city}: {e}")

    # Nothing to store or upload for this city.
    if sold is None:
        continue

    sold_properties_list.append(sold)

    # Create filename with timestamp
    current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"raw_sold_{city.replace(', ', '_')}_{current_timestamp}.json"

    # Upload raw JSON to GCS directly from memory (no 'temp.json' scratch file
    # left behind in the working directory).
    blob = bucket.blob(base_path + filename)
    blob.upload_from_string(json.dumps(sold), content_type="application/json")

        

Processing Alhambra, CA...
Processing Altadena, CA...
Processing Anaheim, CA...
Processing Anaheim Hills, CA...
Processing Arcadia, CA...
Processing Artesia, CA...
Processing Azusa, CA...
Processing Baldwin Park, CA...
Processing Brea, CA...
Processing Buena Park, CA...
Processing Chino, CA...
Error scraping sold properties for Chino, CA: Expecting value: line 1 column 1 (char 0)
Processing Chino Hills, CA...
Processing CITY, CA...
Processing Claremont, CA...
Processing Covina, CA...
Processing Diamond Bar, CA...
Processing Downey, CA...
Processing Duarte, CA...
Processing El Monte, CA...
Processing Fountain Valley, CA...
Processing Fullerton, CA...
Processing Garden Grove, CA...
Processing Glendora, CA...
Processing Hacienda Heights, CA...
Processing Hacienda Hts, CA...
Processing La Canada Flintridge, CA...
Processing La Habra, CA...
Processing La Habra Heights, CA...
Processing La Mirada, CA...
Processing La Palma, CA...
Processing La Puente, CA...
Processing La Verne, CA...
Process