In [34]:
# Try homeharvest isntead
from homeharvest import scrape_property
from datetime import datetime

In [36]:
# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"

properties = scrape_property(
  location="San Diego, CA",
  listing_type="for_rent",  # or (for_sale, for_rent, pending)
  past_days=30,  # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)

  # date_from="2023-05-01", # alternative to past_days
  # date_to="2023-05-28",
  # foreclosure=True
  # mls_only=True,  # only fetch MLS listings
)
print(f"Number of properties: {len(properties)}")

# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())

Number of properties: 671
                                        property_url   mls  \
0  https://www.realtor.com/realestateandhomes-det...  ZUMU   
1  https://www.realtor.com/realestateandhomes-det...  RLHC   
2  https://www.realtor.com/realestateandhomes-det...  RLXB   
3  https://www.realtor.com/realestateandhomes-det...  ZUMU   
4  https://www.realtor.com/realestateandhomes-det...  SDCA   

                                 mls_id    status  \
0                              58161419  FOR_RENT   
1  a3b0e553-99ec-11e8-8c9e-1866da72afcc  FOR_RENT   
2  7dd146f2-e31e-11ee-bfba-069ca18f5865  FOR_RENT   
3                              58161416  FOR_RENT   
4                             240014315  FOR_RENT   

                                                text          style  \
0  Extension 3336 - We offer custom pricing for a...      APARTMENT   
1                                  Phone Extension:1      APARTMENT   
2                                               <NA>      APARTMENT  

In [2]:
import pandas as pd

In [37]:
for_sale = pd.read_csv('HomeHarvest_20240621_124527.csv')

In [38]:
print(for_sale.shape)
for_sale.head()

(671, 43)


Unnamed: 0,property_url,mls,mls_id,status,text,style,full_street_line,street,unit,city,...,parking_garage,agent,agent_email,agent_phones,broker,broker_phone,broker_website,nearby_schools,primary_photo,alt_photos
0,https://www.realtor.com/realestateandhomes-det...,ZUMU,58161419,FOR_RENT,Extension 3336 - We offer custom pricing for a...,APARTMENT,3883 Ingraham St Unit 159A,3883 Ingraham St,Unit 159A,San Diego,...,,Zumper,,,,,,San Diego Unified School District,http://ap.rdcpix.com/0f46650c4f1e73411c64fe53c...,http://ap.rdcpix.com/0f46650c4f1e73411c64fe53c...
1,https://www.realtor.com/realestateandhomes-det...,RLHC,a3b0e553-99ec-11e8-8c9e-1866da72afcc,FOR_RENT,Phone Extension:1,APARTMENT,4027 34th St,4027 34th St,,San Diego,...,,,,,,,,San Diego Unified School District,,
2,https://www.realtor.com/realestateandhomes-det...,RLXB,7dd146f2-e31e-11ee-bfba-069ca18f5865,FOR_RENT,,APARTMENT,3046 Nimitz Blvd,3046 Nimitz Blvd,,San Diego,...,,,,,,,,San Diego Unified School District,,
3,https://www.realtor.com/realestateandhomes-det...,ZUMU,58161416,FOR_RENT,Extension 7869 - We offer custom pricing for a...,APARTMENT,7225 Charmant Dr Unit 104A,7225 Charmant Dr,Unit 104A,San Diego,...,,Zumper,,,,,,San Diego Unified School District,http://ap.rdcpix.com/b72a590aa320f5d414ede91f7...,http://ap.rdcpix.com/b72a590aa320f5d414ede91f7...
4,https://www.realtor.com/realestateandhomes-det...,SDCA,240014315,FOR_RENT,12 Month Lease Beginning 8/1/14 (& potentially...,SINGLE_FAMILY,8865 Robin Hood Ln,8865 Robin Hood Ln,,La Jolla,...,2.0,Jay Becker,jay.becker@pacificsir.com,"[{'number': '8589263060', 'type': 'Office', 'e...",Pacific Sotheby's International Realty,(858) 926-3060,https://www.pacificsothebysrealty.com/office_1...,San Diego Unified School District,http://ap.rdcpix.com/fd71e02bc1697fbecff0fb93d...,http://ap.rdcpix.com/fd71e02bc1697fbecff0fb93d...


In [39]:
for_sale.columns

Index(['property_url', 'mls', 'mls_id', 'status', 'text', 'style',
       'full_street_line', 'street', 'unit', 'city', 'state', 'zip_code',
       'beds', 'full_baths', 'half_baths', 'sqft', 'year_built', 'days_on_mls',
       'list_price', 'list_date', 'sold_price', 'last_sold_date',
       'assessed_value', 'estimated_value', 'lot_sqft', 'price_per_sqft',
       'latitude', 'longitude', 'neighborhoods', 'county', 'fips_code',
       'stories', 'hoa_fee', 'parking_garage', 'agent', 'agent_email',
       'agent_phones', 'broker', 'broker_phone', 'broker_website',
       'nearby_schools', 'primary_photo', 'alt_photos'],
      dtype='object')

In [40]:
for_sale['style'].unique()

array(['APARTMENT', 'SINGLE_FAMILY', 'CONDOS', 'TOWNHOMES',
       'DUPLEX_TRIPLEX'], dtype=object)

In [13]:
for_sale.property_url[10]

'https://www.realtor.com/realestateandhomes-detail/2662705567'

## General filtering
What kind of listings are you looking for? Filtering options:

Required

├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.

└── listing_type (option): Choose the type of listing.
    - 'for_rent'
    - 'for_sale'
    - 'sold'
    - 'pending'

Optional

├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.

│    Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)

│
├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).

│    Example: 30 (fetches properties listed/sold in the last 30 days)
│
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
|    (use this to get properties in chunks as there's a 10k result limit)

│    Format for both must be "YYYY-MM-DD".

│    Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
│

├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
│

├── foreclosure (True/False): If set, fetches only foreclosures
│

├── proxy (string): In format 'http://user:pass@host:port'
│

├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
│

└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'


In [16]:
def get_listings(location, listing_type, past_days):
    # Generate filename based on current timestamp
    current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"HomeHarvest_{current_timestamp}.csv"

    properties = scrape_property(
    location=location,
    listing_type=listing_type,  # or (for_sale, for_rent, pending)
    past_days=past_days),  # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)

    # date_from="2023-05-01", # alternative to past_days
    # date_to="2023-05-28",
    # foreclosure=True
    # mls_only=True,  # only fetch MLS listings

    print(f"Number of properties: {len(properties)}")

    # Export to csv
    properties.to_csv(filename, index=False)
    print(properties.head())

    return(filename)

## Run through llm

In [25]:
# Example first prompt:

prompt = "I'm looking to buy a house in Seattle, WA with at least three bedrooms"

# Ideally this will have the llm return a structure something like:
listing_args = {
    'location': 'Seattle, WA',
    'listing_type': 'for_sale'
}

filter_args = {
    'num_bedrooms': 3
}

In [12]:
format = "Given a prompt, I want you to extract arguments for a function that creates a \
        dataframe with real estate listings. I also want you to extract information that relates \
        to the columns of the output dataframe so I can filter the dataframe. \
        The required function arguments are: location (str): The address in various formats - \
        zip code, a full address, or city/state, etc., and listing_type (option): 'for_rent', \
        'for_sale', 'sold', or 'pending'. \
        The output dataframe columns are ['property_url', 'mls', 'mls_id', 'status', 'text', \
        'style', 'full_street_line', 'street', 'unit', 'city', 'state', 'zip_code', 'beds', \
        'full_baths', 'half_baths', 'sqft', 'year_built', 'days_on_mls', 'list_price', \
        'list_date', 'sold_price', 'last_sold_date', 'assessed_value', 'estimated_value', \
        'lot_sqft', 'price_per_sqft', 'latitude', 'longitude', 'neighborhoods', 'county', \
        'fips_code',,'stories', 'hoa_fee', 'parking_garage', 'agent', 'agent_email', \
        'agent_phones', 'broker', 'broker_phone', 'broker_website', 'nearby_schools', \
        'primary_photo', 'alt_photos']. \
        Please return two dictionaries with the argument key and the following input for function arguments and \
        filtering features, like func_arg = {'location': 'Seattle, WA', 'listing_type': 'for_sale'}, \
        filter_arg = {'beds': 3, 'full_baths': 2}. \
        The prompt is:" + f"{prompt}"

In [26]:
from transformers import pipeline

# Example of a simplified setup
classifier = pipeline('zero-shot-classification')

def parse_prompt(prompt):
    # Categories could be dynamic based on DataFrame schema
    categories = ['for sale', 'for rent', 'num_bedrooms', 'num_bathrooms', 'location']
    
    # Use classifier to identify categories
    response = classifier(prompt, candidate_labels=categories, multi_label=True)
    
    # Process response to structure it into arguments
    listing_args = {}
    filter_args = {}
    
    for label, score in zip(response['labels'], response['scores']):
        if score > 0.5:  # threshold can be adjusted
            if label in ['for sale', 'for rent']:
                listing_args['listing_type'] = label.replace(' ', '_')
            elif 'num_' in label:
                # Assuming the prompt contains numbers, extract them
                import re
                number = int(re.search(r'\d+', prompt).group())
                filter_args[label] = number
            elif label == 'location':
                # Example to extract location, adjust according to expected format
                import re
                location = re.search(r'in (\w+)', prompt).group(1)
                listing_args['location'] = location
    
    return listing_args, filter_args

# Example usage
prompt = "I'm looking to buy a house in Seattle, WA with at least three bedrooms"
listing_args, filter_args = parse_prompt(prompt)
print("Listing Arguments:", listing_args)
print("Filter Arguments:", filter_args)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


Listing Arguments: {'location': 'Seattle', 'listing_type': 'for_sale'}
Filter Arguments: {}


In [6]:
nlp = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased-distilled-squad")

In [22]:
data = for_sale.loc[0]
' '.join(data.values.astype(str))

"https://www.realtor.com/realestateandhomes-detail/1343754694 SDCA 240014227 FOR_SALE COMING SOON! Lovely and spacious upper-level condominium in Mira Mesa. Recently renovated with new carpet and luxury vinyl flooring. Features a beautifully appointed kitchen with new appliances, two master suites with updated bathrooms, and in-unit laundry hookups. Conveniently located within walking distance to parks, shopping plazas, restaurants, a library, and schools. CONDOS 8510 Westmore Rd Apt 278 8510 Westmore Rd Apt 278 San Diego CA 92126 2.0 2.0 nan 907.0 1988.0 0 655000 2024-06-20 nan nan 151561.0 633289.0 146797.0 722.0 32.918595 -117.139937 Mira Mesa, Northeastern San Diego San Diego 6073.0 1.0 300.0 nan Robert Colello Robert@robertcolello.com [{'number': '8588294069', 'type': 'Mobile', 'ext': '', 'primary': True}] eXp Realty of California, Inc (888) 584-9427 nan San Diego Unified School District http://ap.rdcpix.com/10b2a60d6f9a0da1930f2cedf23a04fdl-m3356193086od-w480_h360_x2.webp?w=1080&

In [24]:
result = nlp(question=format, context=data)

ValueError: Arguments can't be understood

In [9]:
result['answer']

'func_arg'

In [10]:
result['score']

0.00014399067731574178

In [3]:
from transformers import pipeline

# Load your CSV file
data = pd.read_csv('HomeHarvest_20240620_145139.csv')

# Function to format each row as a context
def format_row(row):
    return ' '.join(f"{key}: {value}" for key, value in row.items() if pd.notna(value))

# Create a function to query the data
def answer_question(question, data):
    nlp = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased-distilled-squad")

    # Iterate over each row in the data
    answers = []
    for index, row in data.iterrows():
        context = format_row(row)
        result = nlp(question=question, context=context)
        answers.append((index, result['answer'], result['score']))

    # Sort answers by the confidence score in descending order
    answers.sort(key=lambda x: x[2], reverse=True)
    return answers[0] if answers else None

# Example usage
question = "How many houses have 3 bedrooms?"
result = answer_question(question, data)
print(f"Best answer: {result[1]} (confidence: {result[2]}) at row {result[0]}")


Best answer: 75 (confidence: 0.9288774132728577) at row 406


### Try chatterbot