In [45]:
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
import re

class RealEstateComScraper(object):

    def __init__(self, search_url, output_cvs="listings.csv", domain_url="https://www.realestate.com.au"):
        self.search_url = search_url
        self.output_cvs = output_cvs
        self.domain_url = domain_url

    def get_page_listings(self, page_url):
        """
        Returns a Result set containing all listings on the page
        :rtype: bs4.element.ResultSet
        """
        r = requests.get(page_url)
        c = r.content
        soup = BeautifulSoup(c, "html.parser")
        return soup.find_all("div", {"class": "listingInfo rui-clearfix"})

    def get_page_listings_details_urls(self, page_listings) -> list:
        page_listings_details_urls = []
        for listing in page_listings:
            details_url = self.domain_url + listing.find("h2", {"class": "rui-truncate"}).find('a')['href']
            page_listings_details_urls.append(details_url)
        return page_listings_details_urls

    def get_property_details(self, page_listings_details_urls) -> list:
        feature_dict = OrderedDict()
        feature_list = []
        for property in page_listings_details_urls:
            house_request = requests.get(property)
            house_content = house_request.content
            house_soup = BeautifulSoup(house_content, "html.parser")
            house_base_content = house_soup.find("span", {"class": "street-address"}).text
            house_info = house_soup.find_all("div", {"id": "primaryContent"})
            #print(house_base_content)
            feature_dict['Street']=house_soup.find("span", {"class": "street-address"}).text
            feature_dict['Locality']=house_soup.find("span", {"itemprop": "addressLocality"}).text
            feature_dict['Region']=house_soup.find("span", {"itemprop": "addressRegion"}).text
            feature_dict['Post Code']=house_soup.find("span", {"itemprop": "postalCode"}).text
            for features in house_info:
                for feature in features.find_all("div", {"class": "featureList"}):
                    for line in feature.find_all("li"):
                        try:
                            heading = line.text
                            lst = re.findall('[^:]+', heading)
                            if len(lst) > 1:
                                feature_dict[lst[0]] = lst[1]
                        except AttributeError:
                            pass
            feature_list.append(feature_dict)
        return feature_list    

In [46]:
Rs = RealEstateComScraper("https://www.realestate.com.au/buy/in-attadale,+wa+6156/list-1")
ld = Rs.get_page_listings_details_urls(Rs.get_page_listings(Rs.search_url))
#ld[0]
Rs.get_property_details(ld)

[OrderedDict([('Street', '36A Kingsall Road'),
              ('Locality', 'Attadale'),
              ('Region', 'WA'),
              ('Post Code', '6156'),
              ('Property Type', 'House'),
              ('Bedrooms', '3'),
              ('Bathrooms', '2'),
              ('Toilets', '1'),
              ('Garage Spaces', '1'),
              ('Building Size', '280.94 m² (30 squares) approx'),
              ('Land Size', '400 m² (approx)'),
              ('Price per m²', '$2,445'),
              ('Ensuite', '2'),
              ('Open Car Spaces', '4'),
              ('Carport Spaces', '1')]),
 OrderedDict([('Street', '36A Kingsall Road'),
              ('Locality', 'Attadale'),
              ('Region', 'WA'),
              ('Post Code', '6156'),
              ('Property Type', 'House'),
              ('Bedrooms', '3'),
              ('Bathrooms', '2'),
              ('Toilets', '1'),
              ('Garage Spaces', '1'),
              ('Building Size', '280.94 m² (30 squares) appro

In [7]:
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
import re

base_url = "https://www.realestate.com.au/buy/in-attadale,+wa+6156/list-1"
domain_base_url = "https://www.realestate.com.au"

r = requests.get(base_url)
c = r.content

soup = BeautifulSoup(c, "html.parser")

all = soup.find_all("div", {"class":"listingInfo rui-clearfix"})

agent = all[1].find("figcaption").text


#for item in all:
    #print(item.find("h2", {"class":"rui-truncate"}).find('a')['href'])s


listing_details_url = all[1].find("h2", {"class":"rui-truncate"})
tag = listing_details_url.find('a')['href']

house_request = requests.get(domain_base_url+tag)
house_content = house_request.content
house_soup = BeautifulSoup(house_content, "html.parser")
house_info = house_soup.find_all("div", {"id":"primaryContent"})

features = house_info[0].find_all("div", {"id":"features"})
feature_lists = house_info[0].find_all("div", {"class":"featureList"})

for fl in house_info:
    print(fl.find_all("div", {"class":"featureList"}))


[<div class="featureList"><ul><li class="header">General Features</li> <li>Property Type:<span>House</span></li> <li>Bedrooms:<span>5</span></li> <li>Bathrooms:<span>2</span></li> <li>Building Size:<span>291.00 m² (31 squares) approx</span></li> <li>Land Size:<span>1008 m² (approx)</span></li></ul> <ul><li class="header">Indoor Features</li> <li>Floorboards</li> <li>Air Conditioning</li></ul></div>, <div class="featureList"><ul><li class="header">Outdoor Features</li> <li>Secure Parking</li> <li>Garage Spaces:<span>1</span></li></ul> <ul><li class="header">Other Features</li> <li>Close to Schools, Close to Shops, Close to Transport, Garden, Formal Lounge</li></ul></div>]


In [2]:
listing_details_urls

bs4.element.ResultSet

In [2]:
d = OrderedDict()
for feature in feature_lists:
    for line in feature.find_all("li"):
        feature_type = line.find("span")
        try:
            heading = line.text
            lst = re.findall('[^:]+',heading)
            if len(lst) > 1:
                d[lst[0]]=lst[1]                
        except AttributeError:
            pass
print(d)

OrderedDict([('Property Type', 'House'), ('Bedrooms', '5'), ('Bathrooms', '2'), ('Building Size', '291.00 m² (31 squares) approx'), ('Land Size', '1008 m² (approx)'), ('Garage Spaces', '1')])


'5'

In [85]:
s = 'Property Type:Hou'

In [86]:
s[:-4]

'Property Type'

In [28]:
tag = listing_details_url.find('a')['href']

'/property-house-wa-attadale-127184642'