In [None]:
# Run this cell if on colab
! pip install transformers torch
! pip install homeharvest
! pip install accelerate -U

## Test out zillow data scraper

In [3]:
# Try homeharvest instead
from homeharvest import scrape_property
from datetime import datetime

In [4]:
# Timestamped output name so every scrape lands in its own CSV
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"

# Search filters handed to HomeHarvest.
#   listing_type: for_sale, for_rent, sold or pending
#   past_days:    sold in the last N days for "sold", listed in the last N days otherwise
# Optional alternatives (not used here): date_from="2023-05-01" / date_to="2023-05-28"
# instead of past_days, foreclosure=True, mls_only=True (only fetch MLS listings).
search_filters = {
    "location": "San Diego, CA",
    "listing_type": "for_rent",
    "past_days": 30,
}
properties = scrape_property(**search_filters)
print(f"Number of properties: {len(properties)}")

# Persist the full result set to csv, then preview the first rows
properties.to_csv(filename, index=False)
print(properties.head())

Number of properties: 639
                                        property_url   mls  \
0  https://www.realtor.com/realestateandhomes-det...  RLHC   
1  https://www.realtor.com/realestateandhomes-det...  RLHC   
2  https://www.realtor.com/realestateandhomes-det...  RTEC   
3  https://www.realtor.com/realestateandhomes-det...  SDCA   
4  https://www.realtor.com/realestateandhomes-det...  ZUMP   

                                 mls_id    status  \
0  a5968950-1d51-11eb-8290-0e34fd0bfc09  FOR_RENT   
1  2bbf4e70-99f6-11e8-bc05-1866da72aa09  FOR_RENT   
2                              59573841  FOR_RENT   
3                             240014531  FOR_RENT   
4                               1687046  FOR_RENT   

                                                text          style  \
0  Phone Extension:1 - CONTACT OLIVER TYSON, LEAS...      APARTMENT   
1                                  Phone Extension:1      APARTMENT   
2  Move Right In to this gorgeous Home on a Hill ...  SINGLE_FAMILY  

In [5]:
import pandas as pd

In [6]:
# Load a previously scraped HomeHarvest export from disk.
# NOTE(review): the rows previewed in this notebook have status FOR_RENT, so the
# name `for_sale` looks misleading - confirm and consider renaming.
for_sale = pd.read_csv('HomeHarvest_20240621_124527.csv')

In [7]:
# Report the (rows, columns) dimensions, then preview the first records
n_rows, n_cols = for_sale.shape
print((n_rows, n_cols))
for_sale.head()

(671, 43)


Unnamed: 0,property_url,mls,mls_id,status,text,style,full_street_line,street,unit,city,...,parking_garage,agent,agent_email,agent_phones,broker,broker_phone,broker_website,nearby_schools,primary_photo,alt_photos
0,https://www.realtor.com/realestateandhomes-det...,ZUMU,58161419,FOR_RENT,Extension 3336 - We offer custom pricing for a...,APARTMENT,3883 Ingraham St Unit 159A,3883 Ingraham St,Unit 159A,San Diego,...,,Zumper,,,,,,San Diego Unified School District,http://ap.rdcpix.com/0f46650c4f1e73411c64fe53c...,http://ap.rdcpix.com/0f46650c4f1e73411c64fe53c...
1,https://www.realtor.com/realestateandhomes-det...,RLHC,a3b0e553-99ec-11e8-8c9e-1866da72afcc,FOR_RENT,Phone Extension:1,APARTMENT,4027 34th St,4027 34th St,,San Diego,...,,,,,,,,San Diego Unified School District,,
2,https://www.realtor.com/realestateandhomes-det...,RLXB,7dd146f2-e31e-11ee-bfba-069ca18f5865,FOR_RENT,,APARTMENT,3046 Nimitz Blvd,3046 Nimitz Blvd,,San Diego,...,,,,,,,,San Diego Unified School District,,
3,https://www.realtor.com/realestateandhomes-det...,ZUMU,58161416,FOR_RENT,Extension 7869 - We offer custom pricing for a...,APARTMENT,7225 Charmant Dr Unit 104A,7225 Charmant Dr,Unit 104A,San Diego,...,,Zumper,,,,,,San Diego Unified School District,http://ap.rdcpix.com/b72a590aa320f5d414ede91f7...,http://ap.rdcpix.com/b72a590aa320f5d414ede91f7...
4,https://www.realtor.com/realestateandhomes-det...,SDCA,240014315,FOR_RENT,12 Month Lease Beginning 8/1/14 (& potentially...,SINGLE_FAMILY,8865 Robin Hood Ln,8865 Robin Hood Ln,,La Jolla,...,2.0,Jay Becker,jay.becker@pacificsir.com,"[{'number': '8589263060', 'type': 'Office', 'e...",Pacific Sotheby's International Realty,(858) 926-3060,https://www.pacificsothebysrealty.com/office_1...,San Diego Unified School District,http://ap.rdcpix.com/fd71e02bc1697fbecff0fb93d...,http://ap.rdcpix.com/fd71e02bc1697fbecff0fb93d...


In [8]:
# List every column name available in the loaded listings frame
for_sale.columns

Index(['property_url', 'mls', 'mls_id', 'status', 'text', 'style',
       'full_street_line', 'street', 'unit', 'city', 'state', 'zip_code',
       'beds', 'full_baths', 'half_baths', 'sqft', 'year_built', 'days_on_mls',
       'list_price', 'list_date', 'sold_price', 'last_sold_date',
       'assessed_value', 'estimated_value', 'lot_sqft', 'price_per_sqft',
       'latitude', 'longitude', 'neighborhoods', 'county', 'fips_code',
       'stories', 'hoa_fee', 'parking_garage', 'agent', 'agent_email',
       'agent_phones', 'broker', 'broker_phone', 'broker_website',
       'nearby_schools', 'primary_photo', 'alt_photos'],
      dtype='object')

In [9]:
# Distinct property styles present in the listings, in order of first appearance
pd.unique(for_sale['style'])

array(['APARTMENT', 'SINGLE_FAMILY', 'CONDOS', 'TOWNHOMES',
       'DUPLEX_TRIPLEX'], dtype=object)

In [10]:
# Look at one listing URL (row label 10) to see what the links look like
for_sale['property_url'][10]

'https://www.realtor.com/realestateandhomes-detail/M1266539661?listing_status=rental'

## General filtering
What kind of listings are you looking for? Filtering options:

Required

├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.

└── listing_type (option): Choose the type of listing.
    - 'for_rent'
    - 'for_sale'
    - 'sold'
    - 'pending'

Optional

├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.

│    Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)

│
├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).

│    Example: 30 (fetches properties listed/sold in the last 30 days)
│
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
│    (use this to get properties in chunks as there's a 10k result limit)

│    Format for both must be "YYYY-MM-DD".

│    Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
│

├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
│

├── foreclosure (True/False): If set, fetches only foreclosures
│

├── proxy (string): In format 'http://user:pass@host:port'
│

├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
│

└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'


In [11]:
def get_listings(location, listing_type, past_days, **scrape_kwargs):
    """Scrape listings with HomeHarvest, save them to a timestamped CSV and return its name.

    Args:
        location: Search area passed to ``scrape_property`` (e.g. "San Diego, CA").
        listing_type: One of 'for_rent', 'for_sale', 'sold' or 'pending'.
        past_days: Only keep properties listed (or sold, for 'sold') in the last N days.
        **scrape_kwargs: Optional extra filters forwarded unchanged to
            ``scrape_property`` (e.g. date_from / date_to, foreclosure, mls_only).

    Returns:
        The name of the CSV file written to the current working directory.
    """
    # Generate filename based on current timestamp
    current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"HomeHarvest_{current_timestamp}.csv"

    # No trailing comma after the call: a stray comma here wraps the result in a
    # 1-tuple, so len() reports 1 and .to_csv() raises AttributeError.
    properties = scrape_property(
        location=location,
        listing_type=listing_type,
        past_days=past_days,
        **scrape_kwargs,
    )
    print(f"Number of properties: {len(properties)}")

    # Export to csv
    properties.to_csv(filename, index=False)
    print(properties.head())

    return filename

## Test Mistral

In [None]:
# Load the base Mistral tokenizer and causal-LM weights straight from the Hub
from transformers import AutoModelForCausalLM, AutoTokenizer

mistral_checkpoint = "mistralai/Mistral-7B-v0.3"
tokenizer = AutoTokenizer.from_pretrained(mistral_checkpoint)
model = AutoModelForCausalLM.from_pretrained(mistral_checkpoint)

In [None]:
# Free-text request we want the base model to turn into dataframe filters
mistral_prompt = "can you format this text to filter a dataframe: I'm looking to buy in San Diego, CA with 5 bedrooms and 1 bathrooms, with at least 872 square feet, built after 1966, priced around 150000, with 2 stories"
inputs = tokenizer(mistral_prompt, return_tensors="pt")

# Generate up to 500 new tokens and print the decoded text without special tokens
outputs = model.generate(max_new_tokens=500, **inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

## Run through llm

### Import libraries and data

In [60]:
import pandas as pd

# Load the table of prompts and structured fields that the fine-tuning files are built from.
# Later cells read its `prompt` column and add an `llm_output` column to it.
train_data = pd.read_csv('expanded_real_estate_data.csv')

### Transform CSV to be LLM-friendly

In [61]:
# Serialise the structured fields of each row as "name: value" pairs joined by "; ".
# The first column is skipped (NOTE(review): presumably the free-text `prompt` - confirm).
# `llm_output` itself is excluded so that re-running this cell does not embed the
# previous result inside the new one.
field_columns = [col for col in train_data.columns[1:] if col != 'llm_output']
train_data['llm_output'] = train_data.apply(
    lambda row: '; '.join(f"{col}: {row[col]}" for col in field_columns),
    axis=1,
)

In [62]:
# Show the natural-language prompt next to its serialised target for the first rows
preview_columns = ['prompt', 'llm_output']
train_data[preview_columns].head()

Unnamed: 0,prompt,llm_output
0,"I'm looking to buy in San Diego, CA with 5 bed...","location: San Diego, CA; listing_type: for_sal..."
1,I'm looking to buy a single family in New York...,"location: New York, NY; listing_type: for_sale..."
2,"I'm looking to rent a apartment in San Diego, ...","location: San Diego, CA; listing_type: for_ren..."
3,"I'm looking to rent a townhome in New York, NY...","location: New York, NY; listing_type: for_rent..."
4,"I'm looking to rent a townhome in Los Angeles,...","location: Los Angeles, CA; listing_type: for_r..."


In [65]:
# Split train data into train and eval sets (70/30).
# The cut point is derived from the actual row count instead of a hard-coded 500,
# so the two parts never overlap and no row is dropped whatever the dataset size.
TRAIN_FRACTION = 0.7
n_train_rows = int(TRAIN_FRACTION * len(train_data))
train_set_data = train_data.iloc[:n_train_rows]
test_set_data = train_data.iloc[n_train_rows:]


def _write_prompt_file(frame, path):
    """Write one "generate structured response: <prompt> <|endoftext|> <llm_output>" line per row."""
    # NOTE(review): "<|endoftext|>" is used purely as a textual separator here; it is
    # GPT-2's end-of-text marker, not a T5 special token - confirm this is intended.
    # Explicit UTF-8 so the file content does not depend on the platform default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        for prompt, target in zip(frame['prompt'], frame['llm_output']):
            f.write(f"generate structured response: {prompt} <|endoftext|> {target}\n")


output_file_path = "train_data.txt"
_write_prompt_file(train_set_data, output_file_path)

output_file_path = "test_data.txt"
_write_prompt_file(test_set_data, output_file_path)

### Model selection

In [50]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Checkpoint to fine-tune; other sizes (t5-base, t5-large, etc.) can be used instead.
model_name = 't5-small'

# Both the tokenizer and the seq2seq model are loaded from the same checkpoint name.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)



### Create the dataset

In [None]:
from torch.utils.data import Dataset
from transformers import DataCollatorForSeq2Seq


class PromptTargetDataset(Dataset):
    """Prompt/target pairs read from a file of ``<source> <|endoftext|> <target>`` lines.

    T5 is an encoder-decoder model: the text before the separator is fed to the
    encoder and the text after it becomes the decoder labels. A causal-LM pipeline
    (TextDataset + DataCollatorForLanguageModeling(mlm=False)) instead uses a copy of
    the inputs as labels, which trains the model to reproduce its own input.
    """

    SEPARATOR = " <|endoftext|> "

    def __init__(self, tokenizer, file_path, block_size=512):
        """Tokenize every well-formed line of ``file_path``.

        Args:
            tokenizer: Tokenizer used for both the source and the target text.
            file_path: Text file with one ``<source> <|endoftext|> <target>`` example per line.
            block_size: Maximum number of tokens kept for the source and for the target.
        """
        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                source, found, target = line.rstrip("\n").partition(self.SEPARATOR)
                if not found:
                    # Blank or malformed line (no separator): nothing to learn from it
                    continue
                encoded = tokenizer(source, max_length=block_size, truncation=True)
                labels = tokenizer(target, max_length=block_size, truncation=True)["input_ids"]
                self.examples.append({
                    "input_ids": encoded["input_ids"],
                    "attention_mask": encoded["attention_mask"],
                    "labels": labels,
                })

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return self.examples[index]


train_dataset = PromptTargetDataset(
    tokenizer=tokenizer,
    file_path="train_data.txt",
    block_size=512  # Adjust block size to maximum length acceptable by T5
)

eval_dataset = PromptTargetDataset(
    tokenizer=tokenizer,
    file_path="test_data.txt",  # Ensure you have a similar formatted file for evaluation
    block_size=512
)

# Pads inputs and labels per batch; padded label positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

### Fine-tuning

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters gathered in one mapping so they are easy to scan and tweak.
training_config = dict(
    # Checkpoints and logs
    output_dir="./t5-finetuned",
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=500,
    # Schedule
    num_train_epochs=5,
    per_device_train_batch_size=2,
    # Save and evaluate once per epoch so the two stay aligned
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # Restore the checkpoint with the lowest evaluation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)
training_args = TrainingArguments(**training_config)

In [None]:
# Wire the model, hyper-parameters, datasets and batching logic together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Launch fine-tuning
trainer.train()

### Save the model

In [None]:
# Store the fine-tuned weights and the tokenizer files side by side in one directory
finetuned_dir = "./t5-finetuned"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

In [None]:
# Use the max_length or max_new_tokens parameter to control output length
# The encoded prompt is moved to the model's device: after Trainer.train() the model
# may sit on a GPU while freshly tokenized tensors are created on the CPU.
inputs = tokenizer(
    "generate structured response: I'm looking for a cheap studio to rent downtown.",
    return_tensors="pt",
).to(model.device)
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],  # explicit mask instead of relying on inference from padding
    max_length=100,  # Increase max length to allow longer responses
    num_return_sequences=1,  # You can generate multiple sequences if needed
    no_repeat_ngram_size=2,  # Prevent repeating n-grams
    # early_stopping is omitted: it only affects beam search, and no num_beams is set here.
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)