In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re


In [2]:

def split_numbers_and_street(string):
    """Split an address fragment into text pieces and integer pieces.

    The string is cut at every run of decimal digits. Digit runs are returned
    as ``int``; the text around them is returned as ``str`` with surrounding
    whitespace removed. Pieces that are empty after stripping are dropped, so
    an empty or whitespace-only input yields ``[]``.

    Example: ``"Herzl 12"`` -> ``["Herzl", 12]``.

    Parameters
    ----------
    string : str
        Text to split, e.g. the street part of an address.

    Returns
    -------
    list
        The pieces in their original order, each either ``str`` or ``int``.
    """
    pieces = []
    for chunk in re.split(r'(\d+)', string):
        chunk = chunk.strip()
        if not chunk:
            continue
        # isdecimal() accepts exactly the characters the regex \d matches, and
        # int() can convert all of them. isdigit() is also true for characters
        # such as '²', which int() rejects with ValueError.
        pieces.append(int(chunk) if chunk.isdecimal() else chunk)
    return pieces

## Reading from a local file

In [4]:

def scrape_madlan(path='h1.html'):
    """Parse property listings from a locally saved HTML results page.

    Parameters
    ----------
    path : str, default 'h1.html'
        HTML file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per ``data-auto="property-details"`` element, with columns
        ``type``, ``room_number``, ``Area``, ``Street``, ``number_in_street``,
        ``city_area`` and ``price``. A field missing from a listing is
        ``None``. When no listing is found, ``'er'`` is printed and the frame
        is empty.
    """
    def field_text(card, marker):
        # Stripped text of the child <div> tagged with this data-auto marker,
        # or None when the listing has no such element.
        elem = card.find('div', {'data-auto': marker})
        return elem.text.strip() if elem else None

    with open(path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    apartments = soup.find_all('div', {'data-auto': 'property-details'})
    if not apartments:
        print('er')

    data = []
    for apartment in apartments:
        price = field_text(apartment, 'property-price')
        room_number = field_text(apartment, 'property-rooms')
        area = field_text(apartment, 'property-size')
        apt_type = field_text(apartment, 'property-class')

        street = number_in_street = city_area = None
        address_text = field_text(apartment, 'property-address')
        if address_text:
            address = address_text.split(',')
            # Trim text pieces and discard blanks, so an address with no
            # street part (e.g. ", neighbourhood") cannot raise IndexError.
            pieces = [
                p.strip() if isinstance(p, str) else p
                for p in split_numbers_and_street(address[0])
            ]
            pieces = [p for p in pieces if p != '']
            texts = [p for p in pieces if isinstance(p, str)]
            numbers = [p for p in pieces if isinstance(p, int)]
            if len(texts) == 1 and len(numbers) == 1:
                # Assign by type rather than position so that both
                # "Herzl 12" and "12 Herzl" are handled.
                street, number_in_street = texts[0], numbers[0]
            elif pieces:
                # Ambiguous layout: keep the leading piece as the street and
                # leave the house number unset.
                street = pieces[0]
            if len(address) == 2:
                city_area = address[1].strip()

        data.append([apt_type, room_number, area, street, number_in_street, city_area, price])

    df = pd.DataFrame(data, columns=['type', 'room_number', 'Area', 'Street', 'number_in_street', 'city_area', 'price'])
    return df
# Run the local-file scraper and keep the resulting listings table in `df`.
df = scrape_madlan()


## Reading from the live website

In [7]:

def scrape_madlan(base_url="https://www.madlan.co.il/for-sale/נוף-הגליל-ישראל", max_page=3, timeout=10):
    """Download result pages from the listing site and parse them into a table.

    Pages ``1..max_page`` are requested as ``{base_url}?page={n}``. Scraping
    stops early when a page has no listings (prints ``'er'``) or when the
    server answers with a status other than 200 (prints the status code).
    Rows collected before the stop are still returned.

    Parameters
    ----------
    base_url : str
        Search-results URL without the ``page`` query parameter.
    max_page : int, default 3
        Last page number to request.
    timeout : float, default 10
        Seconds to wait for the server before ``requests`` gives up, so that
        a stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per ``data-auto="property-details"`` element, with columns
        ``type``, ``room_number``, ``Area``, ``Street``, ``number_in_street``,
        ``city_area`` and ``price``. A field missing from a listing is
        ``None``.

    Raises
    ------
    requests.RequestException
        On network failures, including timeouts.
    """
    def field_text(card, marker):
        # Stripped text of the child <div> tagged with this data-auto marker,
        # or None when the listing has no such element.
        elem = card.find('div', {'data-auto': marker})
        return elem.text.strip() if elem else None

    # Browser-like User-Agent; built once because it is the same for every page.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    data = []
    for page in range(1, max_page + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print (f' erorr status code {response.status_code}')
            break

        # Decode using the encoding detected from the body, then parse. This
        # is done only for successful responses.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        apartments = soup.find_all('div', {'data-auto': 'property-details'})
        if not apartments:
            print('er')
            break

        for apartment in apartments:
            price = field_text(apartment, 'property-price')
            room_number = field_text(apartment, 'property-rooms')
            area = field_text(apartment, 'property-size')
            apt_type = field_text(apartment, 'property-class')

            street = number_in_street = city_area = None
            address_text = field_text(apartment, 'property-address')
            if address_text:
                address = address_text.split(',')
                # Trim text pieces and discard blanks, so an address with no
                # street part (e.g. ", neighbourhood") cannot raise IndexError.
                pieces = [
                    p.strip() if isinstance(p, str) else p
                    for p in split_numbers_and_street(address[0])
                ]
                pieces = [p for p in pieces if p != '']
                texts = [p for p in pieces if isinstance(p, str)]
                numbers = [p for p in pieces if isinstance(p, int)]
                if len(texts) == 1 and len(numbers) == 1:
                    # Assign by type rather than position so that both
                    # "Herzl 12" and "12 Herzl" are handled.
                    street, number_in_street = texts[0], numbers[0]
                elif pieces:
                    # Ambiguous layout: keep the leading piece as the street
                    # and leave the house number unset.
                    street = pieces[0]
                if len(address) == 2:
                    city_area = address[1].strip()

            data.append([apt_type, room_number, area, street, number_in_street, city_area, price])

    df = pd.DataFrame(data, columns=['type', 'room_number', 'Area', 'Street', 'number_in_street', 'city_area', 'price'])
    return df
# Run the online scraper; this rebinds `df`, replacing the table built from the local file.
df = scrape_madlan()


In [8]:
# Post-process the scraped table in one chain:
#   * house numbers become a nullable integer column (unparseable values -> <NA>),
#   * a constant CITY column is added (the scraped search URL is for Nof HaGalil),
#   * listings without a price are dropped and the index is renumbered.
df = (
    df
    .assign(
        number_in_street=lambda d: pd.to_numeric(d['number_in_street'], errors='coerce').astype('Int64'),
        CITY='נוף הגליל',
    )
    # A list of labels is accepted by every pandas version; a bare string is
    # only accepted by newer releases.
    .dropna(subset=['price'])
    .reset_index(drop=True)
)


In [9]:
# Display the cleaned listings table.
df

Unnamed: 0,type,room_number,Area,Street,number_in_street,city_area,price,CITY
0,דירה,3 חד׳,"70 מ""ר",דרך אריאל שרון,19,שלום,"₪1,030,000",נוף הגליל
1,דירה,4 חד׳,"91 מ""ר",הרב עובדיה יוסף,1,צפונית,"₪1,050,000",נוף הגליל
2,קוטג',6 חד׳,"151 מ""ר",ירדן,,זאב,"₪2,580,000מתיווך",נוף הגליל
3,דירה,2 חד׳,"45 מ""ר",מנחם אריאב,27,צפונית,"₪660,000",נוף הגליל
4,דירה,3 חד׳,"80 מ""ר",הרב עובדיה יוסף,1,צפונית,"₪1,100,000",נוף הגליל
...,...,...,...,...,...,...,...,...
73,דירת גן,4 חד׳,"218 מ""ר",ירדן,1,זאב,"₪1,750,000",נוף הגליל
74,דירה,4 חד׳,"106 מ""ר",חרוד,16,אשכול,"₪3,333,333",נוף הגליל
75,דירה,4 חד׳,"115 מ""ר",כרכום,13,הר יונה,"₪1,280,000",נוף הגליל
76,דירה,4 חד׳,"80 מ""ר",חרוד,20,דרומית,"₪950,000",נוף הגליל


In [None]:
# Export the table to an Excel workbook in the working directory.
# NOTE(review): with pandas defaults the row index is written as the first column — confirm that is wanted.
df.to_excel('result.xlsx')