In [3]:
# Try homeharvest instead
from homeharvest import scrape_property
from datetime import datetime

In [4]:
# Name the export after the moment this cell runs, so each run writes its own file
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"

# Search parameters handed to HomeHarvest
search_params = dict(
    location="San Diego, CA",
    listing_type="for_rent",  # one of: for_sale, for_rent, sold, pending
    past_days=30,  # listed in the last 30 days (sold in the last 30 days when listing_type is "sold")
    # Further filters that can be enabled here:
    #   date_from="2023-05-01",  # alternative to past_days
    #   date_to="2023-05-28",
    #   foreclosure=True,
    #   mls_only=True,  # only fetch MLS listings
)
properties = scrape_property(**search_params)
print(f"Number of properties: {len(properties)}")

# Save the scrape to disk, then preview the first rows
properties.to_csv(filename, index=False)
print(properties.head())

Number of properties: 639
                                        property_url   mls  \
0  https://www.realtor.com/realestateandhomes-det...  RLHC   
1  https://www.realtor.com/realestateandhomes-det...  RLHC   
2  https://www.realtor.com/realestateandhomes-det...  RTEC   
3  https://www.realtor.com/realestateandhomes-det...  SDCA   
4  https://www.realtor.com/realestateandhomes-det...  ZUMP   

                                 mls_id    status  \
0  a5968950-1d51-11eb-8290-0e34fd0bfc09  FOR_RENT   
1  2bbf4e70-99f6-11e8-bc05-1866da72aa09  FOR_RENT   
2                              59573841  FOR_RENT   
3                             240014531  FOR_RENT   
4                               1687046  FOR_RENT   

                                                text          style  \
0  Phone Extension:1 - CONTACT OLIVER TYSON, LEAS...      APARTMENT   
1                                  Phone Extension:1      APARTMENT   
2  Move Right In to this gorgeous Home on a Hill ...  SINGLE_FAMILY  

In [5]:
import pandas as pd

In [6]:
# Load a CSV exported by an earlier scrape (the timestamped name is hard-coded,
# so it is not necessarily the file written by the scrape cell above).
# NOTE(review): despite the variable name, the rows shown below have status FOR_RENT.
for_sale = pd.read_csv('HomeHarvest_20240621_124527.csv')

In [7]:
# Report (rows, columns), then display the first five rows
print(for_sale.shape)
for_sale.head()

(671, 43)


Unnamed: 0,property_url,mls,mls_id,status,text,style,full_street_line,street,unit,city,...,parking_garage,agent,agent_email,agent_phones,broker,broker_phone,broker_website,nearby_schools,primary_photo,alt_photos
0,https://www.realtor.com/realestateandhomes-det...,ZUMU,58161419,FOR_RENT,Extension 3336 - We offer custom pricing for a...,APARTMENT,3883 Ingraham St Unit 159A,3883 Ingraham St,Unit 159A,San Diego,...,,Zumper,,,,,,San Diego Unified School District,http://ap.rdcpix.com/0f46650c4f1e73411c64fe53c...,http://ap.rdcpix.com/0f46650c4f1e73411c64fe53c...
1,https://www.realtor.com/realestateandhomes-det...,RLHC,a3b0e553-99ec-11e8-8c9e-1866da72afcc,FOR_RENT,Phone Extension:1,APARTMENT,4027 34th St,4027 34th St,,San Diego,...,,,,,,,,San Diego Unified School District,,
2,https://www.realtor.com/realestateandhomes-det...,RLXB,7dd146f2-e31e-11ee-bfba-069ca18f5865,FOR_RENT,,APARTMENT,3046 Nimitz Blvd,3046 Nimitz Blvd,,San Diego,...,,,,,,,,San Diego Unified School District,,
3,https://www.realtor.com/realestateandhomes-det...,ZUMU,58161416,FOR_RENT,Extension 7869 - We offer custom pricing for a...,APARTMENT,7225 Charmant Dr Unit 104A,7225 Charmant Dr,Unit 104A,San Diego,...,,Zumper,,,,,,San Diego Unified School District,http://ap.rdcpix.com/b72a590aa320f5d414ede91f7...,http://ap.rdcpix.com/b72a590aa320f5d414ede91f7...
4,https://www.realtor.com/realestateandhomes-det...,SDCA,240014315,FOR_RENT,12 Month Lease Beginning 8/1/14 (& potentially...,SINGLE_FAMILY,8865 Robin Hood Ln,8865 Robin Hood Ln,,La Jolla,...,2.0,Jay Becker,jay.becker@pacificsir.com,"[{'number': '8589263060', 'type': 'Office', 'e...",Pacific Sotheby's International Realty,(858) 926-3060,https://www.pacificsothebysrealty.com/office_1...,San Diego Unified School District,http://ap.rdcpix.com/fd71e02bc1697fbecff0fb93d...,http://ap.rdcpix.com/fd71e02bc1697fbecff0fb93d...


In [8]:
# List every column available in the loaded listings table
for_sale.columns

Index(['property_url', 'mls', 'mls_id', 'status', 'text', 'style',
       'full_street_line', 'street', 'unit', 'city', 'state', 'zip_code',
       'beds', 'full_baths', 'half_baths', 'sqft', 'year_built', 'days_on_mls',
       'list_price', 'list_date', 'sold_price', 'last_sold_date',
       'assessed_value', 'estimated_value', 'lot_sqft', 'price_per_sqft',
       'latitude', 'longitude', 'neighborhoods', 'county', 'fips_code',
       'stories', 'hoa_fee', 'parking_garage', 'agent', 'agent_email',
       'agent_phones', 'broker', 'broker_phone', 'broker_website',
       'nearby_schools', 'primary_photo', 'alt_photos'],
      dtype='object')

In [9]:
# Distinct property styles present in the loaded listings
for_sale['style'].unique()

array(['APARTMENT', 'SINGLE_FAMILY', 'CONDOS', 'TOWNHOMES',
       'DUPLEX_TRIPLEX'], dtype=object)

In [10]:
# Look at one listing URL (the row whose index label is 10)
for_sale.property_url[10]

'https://www.realtor.com/realestateandhomes-detail/M1266539661?listing_status=rental'

## General filtering
What kind of listings are you looking for? Filtering options:

Required

├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.

└── listing_type (option): Choose the type of listing.
    - 'for_rent'
    - 'for_sale'
    - 'sold'
    - 'pending'

Optional

├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.

│    Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)

│
├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).

│    Example: 30 (fetches properties listed/sold in the last 30 days)
│
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
│    (use this to get properties in chunks as there's a 10k result limit)

│    Format for both must be "YYYY-MM-DD".

│    Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
│

├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
│

├── foreclosure (True/False): If set, fetches only foreclosures
│

├── proxy (string): In format 'http://user:pass@host:port'
│

├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
│

└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'


In [11]:
def get_listings(location, listing_type, past_days):
    """Scrape listings with HomeHarvest and save them to a timestamped CSV file.

    Args:
        location: Search area forwarded to ``scrape_property`` (zip code,
            full address, city/state, ...).
        listing_type: One of 'for_rent', 'for_sale', 'sold' or 'pending'.
        past_days: Only keep properties listed (or sold, for 'sold' listings)
            within this many days.

    Returns:
        The name of the CSV file the listings were written to (relative to
        the current working directory).
    """
    # Generate filename based on current timestamp
    current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"HomeHarvest_{current_timestamp}.csv"

    # The call must not be followed by a trailing comma: `x = f(...),` binds a
    # 1-tuple, which made len() always report 1 and broke `.to_csv` below.
    properties = scrape_property(
        location=location,
        listing_type=listing_type,  # for_sale, for_rent, sold or pending
        past_days=past_days,  # listed (or sold, for 'sold') in the last N days
        # date_from="2023-05-01",  # alternative to past_days
        # date_to="2023-05-28",
        # foreclosure=True,
        # mls_only=True,  # only fetch MLS listings
    )

    print(f"Number of properties: {len(properties)}")

    # Export to csv
    properties.to_csv(filename, index=False)
    print(properties.head())

    return filename

## Run through llm

### Import libraries and data

In [39]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import DistilBertTokenizer, DistilBertModel, Trainer, TrainingArguments
import torch.nn as nn

# Load the prompt/parameter training table.
# Later cells read a 'prompt' column and treat every column after the first as a label.
train_data = pd.read_csv('training_data_real_estate.csv')

### Transform csv to be llm friendly

In [48]:
# Serialise the parameter columns of each row as "name: value" pairs joined by "; ".
# The first column (assumed to be 'prompt') is skipped. 'llm_output' itself is
# excluded so that re-running this cell does not fold the previous output into the new one.
param_columns = [col for col in train_data.columns[1:] if col != 'llm_output']
train_data['llm_output'] = train_data.apply(
    lambda row: '; '.join(f"{col}: {row[col]}" for col in param_columns),
    axis=1,
)

In [51]:
# Preview each prompt next to its serialised target string
train_data[['prompt', 'llm_output']].head()

Unnamed: 0,prompt,llm_output
0,"I'm looking for a apartment in New York, NY wi...","location: New York, NY; listing_type: for_rent..."
1,"I'm looking for a single_family in Phoenix, AZ...","location: Phoenix, AZ; listing_type: pending; ..."
2,I'm looking for a single_family in Los Angeles...,"location: Los Angeles, CA; listing_type: for_s..."
3,"I'm looking for a condos in San Diego, CA with...","location: San Diego, CA; listing_type: pending..."
4,"I'm looking for a townhomes in San Antonio, TX...","location: San Antonio, TX; listing_type: for_s..."


In [55]:
# Format the data as plain-text files with one "Prompt ... Response ..." example per line.
# First 70 rows go to the training file, last 30 rows to the held-out file
# (the two selections only stay disjoint when train_data has at least 100 rows).
train_set_data = train_data.head(70)
test_set_data = train_data.tail(30)


def write_prompt_file(frame, path):
    """Write one 'Prompt: ... <|endoftext|> Response: ...' line per row of ``frame``.

    Args:
        frame: DataFrame with 'prompt' and 'llm_output' columns.
        path: Destination text file; overwritten if it already exists.
    """
    # utf-8 is set explicitly so the file does not depend on the platform's default encoding
    with open(path, 'w', encoding='utf-8') as f:
        for prompt, response in zip(frame['prompt'], frame['llm_output']):
            f.write(f"Prompt: {prompt} <|endoftext|> Response: {response}\n")


for output_file_path, split_frame in (
    ("train_data.txt", train_set_data),
    ("test_data.txt", test_set_data),
):
    write_prompt_file(split_frame, output_file_path)

### Model selection

In [50]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained 'gpt2' checkpoint: tokenizer plus language-model head.
# NOTE: `model` is reassigned by the DistilBERT section further down.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')



### Fine-tuning

In [53]:
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Create the datasets and a data collator.
# TextDataset tokenizes the whole file and cuts the token stream into
# consecutive block_size-token chunks (it does not treat each line as one example).
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train_data.txt",
    block_size=128,
)

# The held-out split is written to "test_data.txt" by the data-preparation
# cell; no "eval_data.txt" is produced anywhere in this notebook.
eval_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="test_data.txt",
    block_size=128,
)

# mlm=False -> causal language-modelling objective (labels are the inputs themselves)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

# Define training arguments
# NOTE(review): the recorded run of this cell ended in an MPS out-of-memory error;
# presumably earlier models were still resident on the device - confirm on a fresh kernel.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    logging_dir="./logs",  # Logs directory
    logging_steps=50,  # Log every 50 steps
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # used by the per-epoch evaluation
)

# Start training
trainer.train()



RuntimeError: MPS backend out of memory (MPS allocated: 9.04 GB, other allocations: 24.36 MB, max allowed: 9.07 GB). Tried to allocate 147.24 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

### Encode labels

In [40]:
# Initialize a dictionary to hold the label encoders (one fitted encoder per label column)
label_encoders = {}
encoded_columns = {}

# Label columns are everything after the first column (assumed to be 'prompt').
# 'llm_output' is a derived text column added for the GPT-2 experiment, not a
# label, so it is skipped when it is present.
label_columns = [column for column in train_data.columns[1:] if column != 'llm_output']

# Encode each label column as integer class indices
for column in label_columns:
    le = LabelEncoder()
    encoded_columns[column] = le.fit_transform(train_data[column])
    label_encoders[column] = le

# Build the frame once instead of growing it column by column
encoded_labels = pd.DataFrame(encoded_columns)

### Define dataset class

In [41]:
class RealEstateDataset(Dataset):
    """Torch dataset pairing tokenized prompts with their encoded label rows.

    Args:
        prompts: Sequence of prompt strings (list, pandas Series, ...).
        labels: Table of encoded labels exposing ``.values`` (e.g. a pandas
            DataFrame) with one row per prompt.
        max_length: Length every prompt is padded/truncated to. Defaults to
            512 (the previous hard-coded value); a smaller value reduces the
            memory used per batch.
    """

    def __init__(self, prompts, labels, max_length=512):
        # Materialise as a list so __getitem__ indexes by position. Indexing a
        # pandas Series with an int is label-based and picks the wrong row (or
        # raises KeyError) once the Series no longer has a 0..n-1 index.
        self.prompts = list(prompts)
        self.labels = torch.tensor(labels.values)
        self.max_length = max_length
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        """Number of prompts in the dataset."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return the tokenized prompt and label row at position ``idx``."""
        encoded_prompt = self.tokenizer(
            self.prompts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops the leading batch dimension added by return_tensors='pt'
            'input_ids': encoded_prompt['input_ids'].squeeze(0),
            'attention_mask': encoded_prompt['attention_mask'].squeeze(0),
            'labels': self.labels[idx]
        }

In [None]:
# Pair every prompt with its row of integer-encoded labels
dataset = RealEstateDataset(train_data['prompt'], encoded_labels)

### Define the model class

In [42]:
class DistilBertForMultiLabelSequenceClassification(nn.Module):
    """DistilBERT encoder with one linear layer covering every label column.

    The classifier emits ``sum(num_labels_dict.values())`` logits: the class
    scores of each label column laid out back to back, in dictionary order.

    Args:
        num_labels_dict: Mapping of label column name -> number of classes.
    """

    def __init__(self, num_labels_dict):
        super().__init__()
        self.label_sizes = list(num_labels_dict.values())  # classes per label column
        self.num_labels = sum(self.label_sizes)  # Total number of labels
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Create a single output layer
        self.classifier = nn.Linear(self.distilbert.config.dim, self.num_labels)

        # Loss for multi-hot targets of shape (batch, num_labels)
        self.loss_fct = nn.BCEWithLogitsLoss()
        # Loss for integer class indices, applied once per label column
        self.class_loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``logits`` or, when ``labels`` is given, ``(loss, logits)``.

        ``labels`` may be either integer class indices of shape
        (batch, n_label_columns) or multi-hot targets of shape
        (batch, num_labels).
        """
        # Get the output from DistilBERT model
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state[:, 0]  # Take the first token's output

        # Get logits
        logits = self.classifier(sequence_output)

        # If no labels are provided, just return logits, else return loss and logits
        if labels is None:
            return logits
        return (self._compute_loss(logits, labels), logits)

    def _compute_loss(self, logits, labels):
        """Pick the loss that matches the layout of ``labels``."""
        n_columns = len(self.label_sizes)
        per_column_indices = (
            labels.dim() == 2
            and labels.size(-1) == n_columns
            and n_columns != self.num_labels
        )
        if per_column_indices:
            # One class index per label column: score each column's slice of
            # the logits with cross-entropy and add the column losses up.
            # (Reshaping such labels to (-1, num_labels) for BCE either fails
            # or silently mixes examples together.)
            head_logits = torch.split(logits, self.label_sizes, dim=-1)
            column_losses = [
                self.class_loss_fct(head, labels[:, column].long())
                for column, head in enumerate(head_logits)
            ]
            return torch.stack(column_losses).sum()

        # Multi-hot targets: element-wise binary cross-entropy over all logits
        return self.loss_fct(
            logits.view(-1, self.num_labels),
            labels.float().view(-1, self.num_labels),
        )

### Set up training arguments

In [45]:
def compute_metrics(pred, label_sizes=None):
    """Accuracy of the predicted classes against the encoded labels.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions`` as numpy
            arrays (what ``Trainer`` passes to ``compute_metrics``).
        label_sizes: Optional number of classes per label column, in logit
            order. When omitted, it is derived from the notebook-level
            ``label_encoders`` dict if that exists.

    Returns:
        ``{'accuracy': float}`` - the fraction of label entries predicted
        correctly.
    """
    labels = pred.label_ids
    logits = pred.predictions

    if label_sizes is None:
        # Fall back to the encoders fitted earlier in the notebook, if any
        encoders = globals().get('label_encoders')
        if encoders:
            label_sizes = [len(encoder.classes_) for encoder in encoders.values()]

    per_column = (
        bool(label_sizes)
        and getattr(labels, 'ndim', 0) == 2
        and labels.shape[1] == len(label_sizes)
        and logits.shape[-1] == sum(label_sizes)
    )
    if per_column:
        # The logits hold each column's class scores back to back, so take the
        # argmax inside every column's slice instead of one argmax over the row.
        correct = 0
        start = 0
        for column, size in enumerate(label_sizes):
            column_preds = logits[:, start:start + size].argmax(-1)
            correct += int((labels[:, column] == column_preds).sum())
            start += size
        accuracy = correct / labels.size if labels.size else 0.0
    else:
        preds = logits.argmax(-1)
        # numpy arrays have no .float(); the mean of the boolean mask is the accuracy
        accuracy = float((labels == preds).mean())
    return {'accuracy': accuracy}

# Hyper-parameters for the DistilBERT run
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_steps=10,
)

# Number of classes found by each fitted label encoder, keyed by label column
classes_per_column = {
    name: len(encoder.classes_) for name, encoder in label_encoders.items()
}
model = DistilBertForMultiLabelSequenceClassification(classes_per_column)

# Bundle model, arguments, data and metric into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    compute_metrics=compute_metrics,
)

RuntimeError: MPS backend out of memory (MPS allocated: 9.04 GB, other allocations: 24.36 MB, max allowed: 9.07 GB). Tried to allocate 89.42 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

### Start training

In [None]:
# Run training with whichever Trainer was assigned to `trainer` most recently
# (the name is used for both the GPT-2 and the DistilBERT experiment).
trainer.train()



RuntimeError: MPS backend out of memory (MPS allocated: 8.77 GB, other allocations: 264.36 MB, max allowed: 9.07 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).