# WH_4 - Web scraping

In [9]:
import re
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [34]:
# Seconds to wait for the site before giving up, so a stalled connection cannot hang the notebook.
REQUEST_TIMEOUT_SECONDS = 30

# Column order of the DataFrame returned by scrape_madlan (also used when no listing is found).
ASSET_COLUMNS = ['City', 'type', 'room_number', 'Area', 'Street',
                 'number_in_street', 'city_area', 'price']


def _parse_asset(asset, city):
    """Extract the details of a single listing element into a dict keyed by ASSET_COLUMNS.

    Args:
        asset: One element returned by ``find_all`` for a listing card.
        city: The hyphenated city name that was passed to ``scrape_madlan``.

    Returns:
        dict: One value per column in ``ASSET_COLUMNS``. Missing fields keep the
        placeholders used so far: ``"N/A"`` for type, street, house number,
        city area and price; ``None`` for the area; and the literal string
        ``"None"`` for the room count.
    """
    details = {'City': str(city.replace('-', ' '))}

    type_element = asset.find("div", {"data-auto": "property-class"})
    details['type'] = str(type_element.text.strip() if type_element is not None else "N/A")

    rooms_element = asset.find("div", {"data-auto": "property-rooms"})
    rooms_text = rooms_element.text.strip() if rooms_element is not None else None
    # Keep only digits, dots and hyphens (e.g. "3.5"). str(None) yields the text "None"
    # when the room count is missing; kept as-is for backward compatibility.
    details['room_number'] = str(re.sub(r'[^\d.-]', '', rooms_text) if rooms_text else None)

    area_element = asset.find("div", {"data-auto": "property-size"})
    area_text = area_element.text.strip() if area_element is not None else "N/A"
    area_numbers = re.findall(r'\d+', area_text)
    details['Area'] = int(area_numbers[0]) if area_numbers else None

    address_element = asset.find('div', {'data-auto': 'property-address'})
    address = address_element.get_text(strip=True) if address_element is not None else "N/A"
    # The text before the first comma holds street name and house number,
    # the text after it (when present) holds the city area.
    address_parts = address.split(',')
    street_words = []
    house_number = ''
    for part in address_parts[0].split():
        # isdecimal() accepts exactly the digit strings int() can parse;
        # isnumeric() also accepts characters such as superscripts that make int() fail.
        if part.isdecimal():
            house_number += part
        else:
            street_words.append(part)
    details['Street'] = ' '.join(street_words)
    details['number_in_street'] = int(house_number) if house_number else "N/A"
    details['city_area'] = str(address_parts[1].strip() if len(address_parts) > 1 else "N/A")

    price_element = asset.find("div", {"data-auto": "property-price"})
    price_text = price_element.text.strip() if price_element is not None else "N/A"
    # First run of digits, optionally grouped with commas (e.g. "3,100,000").
    price_match = re.search(r'\d+(?:,\d+)*', price_text)
    details['price'] = int(price_match.group().replace(',', '')) if price_match else "N/A"

    return details


def scrape_madlan(city):
    """Scrape the for-sale listings of one city from madlan.co.il.

    Args:
        city: City name with a hyphen between the words, for example 'כפר-סבא'.

    Returns:
        pandas.DataFrame | None: One row per listing with the columns in
        ``ASSET_COLUMNS`` (an empty frame with those columns when the page has
        no listings), or ``None`` when the response status is not 200.

    Notes:
        Exceptions raised by ``requests.get`` (including timeouts) are not
        caught here. A listing that cannot be parsed is skipped and reported
        through ``warnings.warn`` instead of aborting the whole scrape.
    """
    url = "https://www.madlan.co.il/for-sale/" + city + "-ישראל"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        return None

    results_page = BeautifulSoup(response.content, 'html.parser')
    # Find all assets on the page
    assets = results_page.find_all('div', {'data-auto': 'listed-bulletin', 'data-auto-bulletin-type': 'bulletin'})

    assets_list = []
    for asset in assets:
        try:
            assets_list.append(_parse_asset(asset, city))
        except Exception as error:
            # Best effort: one malformed listing must not discard the others.
            warnings.warn(f"Skipping a listing that could not be parsed: {error!r}", stacklevel=2)

    # Explicit columns keep the frame's shape stable even when nothing was scraped.
    return pd.DataFrame(assets_list, columns=ASSET_COLUMNS)


In [35]:
# Scrape the for-sale listings of one city; the city name is passed with a hyphen between its words.
assets_details = scrape_madlan('כפר-סבא')
# Bare last expression so the notebook displays the result of the scrape.
assets_details

Unnamed: 0,City,type,room_number,Area,Street,number_in_street,city_area,price
0,כפר סבא,דירה,4.0,122.0,ביאליק,22.0,מרכז העיר דרום,3100000
1,כפר סבא,דירה,4.0,107.0,הכרמל,85.0,מרכז העיר צפון,3130000
2,כפר סבא,דירה,3.5,,נחשון,,מרכז העיר דרום,2300000
3,כפר סבא,דופלקס,5.5,,ארלוזורוב,,מרכז העיר דרום,4100000
4,כפר סבא,דירה,6.0,161.0,מנור אהוד,2.0,הזמר העברי,4850000
5,כפר סבא,דירה,4.0,100.0,דובה ויצחק שיינפיין,8.0,השכונה הירוקה,3850000
6,כפר סבא,דו משפחתי,7.0,250.0,הפארק,,,6650000
7,כפר סבא,קוטג',7.0,250.0,הפארק,,,6650000
8,כפר סבא,דירה,4.0,143.0,זהבי דוד,16.0,הזמר העברי,3650000
9,כפר סבא,דירה,4.0,123.0,נחשון,11.0,מרכז העיר דרום,3190000


In [36]:
# scrape_madlan returns None when the site does not answer with HTTP 200,
# so guard the export instead of failing with AttributeError on None.
if assets_details is None:
    print("No listings were scraped; 'assets_details.csv' was not written.")
else:
    # 'utf-8-sig' writes a byte-order mark at the start of the file.
    assets_details.to_csv('assets_details.csv', encoding='utf-8-sig', index=False)