In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

In [8]:
listing_base_url = 'https://ozflatmates.com/{}'

# Open the CSV file in write mode
with open('room_listings.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    # Write the header row, including Publish Date and Listing ID
    writer.writerow(['Listing ID', 'Room Description', 'Suburb', 'Price', 'Publish Date'])

    # Base URL for room listings pages
    base_url = 'https://ozflatmates.com/rooms-for-rent/sydney/page-{}'

    # Loop through each page
    for page_num in range(1, 7):  # Loop through pages 1 to 6
        # Format the URL for the current page
        url = base_url.format(page_num)
        print(f"Scraping page {page_num}: {url}")

        # Fetch the webpage
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all room listings on the current page
        room_listings = soup.find_all('div', class_='room__wide listing-container')

        # Loop through each listing and extract information
        for room in room_listings:
            # Extract room suburb
            room_suburb = room.find('h3', class_='room-sublocation mobile-room-sublocation')
            suburb_text = room_suburb.text.strip().split('-')[-1].strip() if room_suburb else "N/A"

            # Extract room price
            room_price = room.find('div', class_='room-price')
            price_text = room_price.text.strip().replace('$', '') if room_price else "N/A"

            # Extract room name
            room_description = room.find('h4', class_='room-name')
            name_text = room_description.text.strip() if room_description else "N/A"

            # Extract listing URL and extract ID 
            listing_link = room.find('a', href=True)
            listing_url = listing_link['href'] if listing_link else None
            listing_id = listing_url.split('/')[-1] if listing_url else "N/A"

            # Construct the individual listing URL using the Listing ID
            if listing_id != "N/A":
                listing_page_url = listing_base_url.format(listing_id)
                print(f"Fetching listing page: {listing_page_url}")

                # Fetch the individual listing page to get the publish date
                listing_response = requests.get(listing_page_url)
                listing_soup = BeautifulSoup(listing_response.content, 'html.parser')

                # Extract publish date 
                publish_date_element = listing_soup.find(text=lambda x: x and "Published on" in x)
                publish_date = publish_date_element.split("Published on")[1].strip() if publish_date_element else "N/A"
            else:
                publish_date = "N/A"

            # Write the room details to the CSV file
            writer.writerow([listing_id, name_text, suburb_text, price_text, publish_date])

            # Print the details
            print(f"Listing ID: {listing_id}")
            print(f"Room Name: {name_text}")
            print(f"Suburb: {suburb_text}")
            print(f"Price: {price_text}")
            print(f"Publish Date: {publish_date}")
            print("-" * 40)

        # Add a delay between requests to avoid overwhelming the server
        time.sleep(2)  # Sleep for 2 seconds between page requests


Scraping page 1: https://ozflatmates.com/rooms-for-rent/sydney/page-1
Fetching listing page: https://ozflatmates.com/ID32860


  publish_date_element = listing_soup.find(text=lambda x: x and "Published on" in x)


Listing ID: ID32860
Room Name: Sydney CBD Room
Suburb: Sydney
Price: 450
Publish Date: 15.10.2024
----------------------------------------
Fetching listing page: https://ozflatmates.com/ID32846
Listing ID: ID32846
Room Name: Parramatta, quiet modern single room, female
Suburb: Granville
Price: 300
Publish Date: 15.10.2024
----------------------------------------
Fetching listing page: https://ozflatmates.com/ID32838
Listing ID: ID32838
Room Name: Active Christian Guys house, ideal for Medic or...
Suburb: Wentworthville
Price: 390
Publish Date: 13.10.2024
----------------------------------------
Fetching listing page: https://ozflatmates.com/ID32835
Listing ID: ID32835
Room Name: Homey Room in 2 Bedroom 2 Bathroom Flat!
Suburb: Rydalmere
Price: 340
Publish Date: 13.10.2024
----------------------------------------
Fetching listing page: https://ozflatmates.com/ID32844
Listing ID: ID32844
Room Name: Beautiful North Bondi apartment
Suburb: North Bondi
Price: 600
Publish Date: 13.10.2024
--

In [3]:
df = pd.read_csv('room_listings.csv')

In [4]:
df.head()

Unnamed: 0,Listing ID,Room Description,Suburb,Price,Publish Date
0,ID32860,Sydney CBD Room,Sydney,450,15.10.2024
1,ID32846,"Parramatta, quiet modern single room, female",Granville,300,15.10.2024
2,ID32838,"Active Christian Guys house, ideal for Medic o...",Wentworthville,390,13.10.2024
3,ID32835,Homey Room in 2 Bedroom 2 Bathroom Flat!,Rydalmere,340,13.10.2024
4,ID32844,Beautiful North Bondi apartment,North Bondi,600,13.10.2024


In [7]:
print(df.shape)
print(df.isna().sum())
print(df.dtypes)

(56, 5)
Listing ID          0
Room Description    0
Suburb              0
Price               0
Publish Date        0
dtype: int64
Listing ID          object
Room Description    object
Suburb              object
Price                int64
Publish Date        object
dtype: object


In [27]:
df['Publish Date'] = pd.to_datetime(df['Publish Date'], format='%d.%m.%Y')
df.head()

Unnamed: 0,Listing ID,Room Description,Suburb,Price,Publish Date
0,ID32860,Sydney CBD Room,Sydney,450,2024-10-15
1,ID32846,"Parramatta, quiet modern single room, female",Granville,300,2024-10-15
2,ID32838,"Active Christian Guys house, ideal for Medic o...",Wentworthville,390,2024-10-13
3,ID32835,Homey Room in 2 Bedroom 2 Bathroom Flat!,Rydalmere,340,2024-10-13
4,ID32844,Beautiful North Bondi apartment,North Bondi,600,2024-10-13


In [17]:
price_by_suburb = df.groupby('Suburb').agg(Min_Price = ('Price', 'min'), Max_Price = ('Price', 'max'), Average_Price=('Price', 'mean')).astype(int).sort_values(by='Average_Price')
print(price_by_suburb)

                   Min_Price  Max_Price  Average_Price
Suburb                                                
Carlingford              200        200            200
North Strathfield        200        200            200
Merrylands West          230        230            230
Cambridge Park           230        230            230
Cabramatta               220        300            240
Cherrybrook              240        240            240
Cranebrook               250        250            250
Maroubra                 250        250            250
Hurstville               250        250            250
Bankstown                250        260            255
Rozelle                  260        260            260
Schofields               260        260            260
Saint Marys              270        270            270
Penrith                  250        300            275
Penshurst                295        295            295
Bardia                   295        295            295
Parramatta

--> 
Sydney's room rents vary widely based on proximity to the CBD, demand for beachside locations, infrastructure, and lifestyle factors, with inner-city and coastal areas having the highest prices. Meanwhile, suburbs like Carlingford, North Strathfield, and Cabramatta have lower rents because they're further from the CBD, and have more affordable housing, appealing to budget-conscious renters.
