In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import time
import os
import datetime

def setup_driver():
    """Create the Chrome driver used for a single page fetch.

    The options object is configured with ``headless = False`` and the
    ``--start-maximized`` argument before being handed to ``uc.Chrome``.

    Returns:
        The object returned by ``uc.Chrome(options=...)``.
    """
    chrome_options = uc.ChromeOptions()
    chrome_options.headless = False
    chrome_options.add_argument('--start-maximized')
    browser = uc.Chrome(options=chrome_options)
    return browser

def get_single_page(page_number):
    """Fetch one search-results page and save its HTML source to disk.

    A new driver is created via ``setup_driver()`` for every call and is
    always quit afterwards, even when navigation or the file write fails.

    Args:
        page_number: Value substituted into the ``page=`` query parameter and
            into the output filename ``rv_page_<page_number>_source.html``.
    """
    started_at = datetime.datetime.now().strftime('%I:%M %p')
    print(f"\n=== Starting collection for page {page_number} at {started_at} ===")

    browser = setup_driver()
    try:
        listing_url = f'https://www.rvtrader.com/New-Winnebago/rvs-for-sale?make=Winnebago%7C2307464&condition=N&seller_type=dealer&zip=33021&radius=10000&page={page_number}'

        print(f"\nNavigating to page {page_number}...")
        browser.get(listing_url)

        # Fixed pause rather than an explicit readiness check.
        print("Waiting 10 seconds for page to load...")
        time.sleep(10)

        # Dump whatever the browser currently holds as the page source.
        out_name = f'rv_page_{page_number}_source.html'
        with open(out_name, 'w', encoding='utf-8') as out_file:
            out_file.write(browser.page_source)

        kilobytes = os.path.getsize(out_name) / 1024
        print(f"Saved file: {out_name} ({kilobytes:.2f} KB)")

    finally:
        browser.quit()
        print(f"Browser closed for page {page_number}")

def main():
    """Interactively collect a range of pages, pausing between fetches.

    Prompts for the first page, the last page (inclusive) and the number of
    minutes to wait between pages, calls ``get_single_page`` for each page,
    and finally prints the size of every expected output file that exists.
    """
    first = int(input("Start from page number: "))
    last = int(input("End at page number: "))
    pause_minutes = int(input("Minutes to wait between pages: "))

    pages = range(first, last + 1)

    for current in pages:
        get_single_page(current)

        # No pause is needed after the final page.
        if current >= last:
            continue

        pause_seconds = pause_minutes * 60
        print(f"\nWaiting {pause_minutes} minutes before next page...")
        resume_at = datetime.datetime.now() + datetime.timedelta(minutes=pause_minutes)
        print(f"Next page will start at: {resume_at.strftime('%I:%M %p')}")
        time.sleep(pause_seconds)

    print("\n=== Collection Complete! ===")
    print(f"Collected pages {first} through {last}")

    # Report on-disk size for each page file that was actually written.
    print("\nFiles collected:")
    for current in pages:
        saved = f'rv_page_{current}_source.html'
        if not os.path.exists(saved):
            continue
        saved_bytes = os.path.getsize(saved)
        print(f"Page {current}: {saved_bytes/1024:.2f} KB")

if __name__ == "__main__":
    # Print the two-line banner, then start the interactive collection run.
    print(
        "=== RV Trader Multi-Page Collector ===",
        "This will automatically collect pages with waits between them",
        sep="\n",
    )
    main()

In [None]:
import os
import json
import pandas as pd
import re

def extract_json_from_html(file_path):
    """Return the first parseable JSON-LD payload embedded in a saved HTML page.

    Every ``<script type="application/ld+json">`` block is tried in document
    order; a block that is not valid JSON is skipped instead of aborting the
    caller's whole run.

    Args:
        file_path: Path to the HTML file; it is read as UTF-8.

    Returns:
        The value produced by ``json.loads`` for the first block that parses,
        or ``None`` when the file has no such block or none of them is valid.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    pattern = r'<script type="application/ld\+json">(.*?)</script>'
    for match in re.finditer(pattern, content, re.DOTALL):
        try:
            # ``\/`` is a legal JSON escape that json.loads already decodes to
            # ``/``; stripping it by hand is unnecessary and would mangle an
            # escaped backslash followed by a slash (``\\/``).
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            continue
    return None

def parse_rv_data(json_data):
    """Flatten the nested offers of a decoded JSON-LD payload into row dicts.

    The listings are read from ``json_data['offers']['offers']``; each entry
    contributes its ``url`` plus the ``name``, ``price`` and ``itemCondition``
    of its ``itemOffered`` object.

    Args:
        json_data: Decoded JSON value (normally a dict). Anything that does not
            have the expected nested shape yields an empty list.

    Returns:
        A list of dicts with keys ``year``, ``model``, ``price``,
        ``condition`` and ``url``. ``price`` is a float, or ``None`` when the
        source value is missing or not numeric. ``year`` is ``None`` when the
        name is empty. Entries without an ``itemOffered`` object are skipped.
    """
    if not isinstance(json_data, dict):
        return []

    container = json_data.get('offers')
    if not isinstance(container, dict):
        return []

    offers = container.get('offers')
    if not isinstance(offers, list):
        return []

    rv_list = []
    for offer in offers:
        item = offer.get('itemOffered') if isinstance(offer, dict) else None
        if not isinstance(item, dict):
            continue

        # Assumes names look like "<year> Winnebago <model ...>" -- the second
        # token is dropped as the make. TODO confirm for multi-word makes.
        name_parts = str(item.get('name') or '').split()

        try:
            price = float(item.get('price'))
        except (TypeError, ValueError):
            # Keep the listing even when no usable price is published.
            price = None

        rv_list.append({
            'year': name_parts[0] if name_parts else None,
            'model': ' '.join(name_parts[2:]),
            'price': price,
            'condition': item.get('itemCondition'),
            'url': offer.get('url'),
        })

    return rv_list

def main():
    # Find all RV page files
    html_files = [f for f in os.listdir() if f.startswith('rv_page_') and f.endswith('.html')]
    
    print(f"Found {len(html_files)} HTML files")
    
    all_rv_data = []
    
    # Process each file
    for file in sorted(html_files):
        print(f"\nProcessing {file}...")
        
        # Check file size
        size = os.path.getsize(file)
        print(f"File size: {size/1024:.2f} KB")
        
        if size < 10000:  # Skip small files (likely error pages)
            print("File too small, skipping...")
            continue
            
        # Extract and parse data
        json_data = extract_json_from_html(file)
        if json_data:
            rv_data = parse_rv_data(json_data)
            all_rv_data.extend(rv_data)
            print(f"Added {len(rv_data)} RVs from this file")
        else:
            print("No valid JSON data found in file")
    
    # Create DataFrame
    df = pd.DataFrame(all_rv_data)
    
    # Save to CSV
    csv_filename = 'all_winnebago_rvs.csv'
    df.to_csv(csv_filename, index=False)
    
    print(f"\nTotal RVs collected: {len(df)}")
    print(f"Data saved to: {csv_filename}")
    print("\nFirst few rows:")
    print(df.head())
    
    # Show some basic statistics
    print("\nPrice Statistics:")
    print(df['price'].describe())
    
    # Count by year
    print("\nRVs by Year:")
    print(df['year'].value_counts().sort_index())
    
    return df

# Entry point: runs only when this code executes as __main__.
if __name__ == "__main__":
    print("=== RV Data Combiner ===")
    # Binds the DataFrame returned by main() to the module-level name `df`.
    df = main()

In [3]:
# Displays the kernel's current working directory. The combiner's main() calls
# os.listdir() with no argument, so this is the directory it scans.
# NOTE(review): the recorded output is a user-specific absolute path -- consider
# clearing this cell's output before sharing the notebook.
pwd

'/Users/avijames'