# Project Part 1

##### Amit Elgazar              
##### Rebecca Elhaj 

In [134]:
!pip install beautifulsoup4 requests



In [135]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
import urllib.parse

In [136]:
urls = {
    'נוה שאנן': 'https://www.ad.co.il/nadlanrent?sp275=17413&sp276=17414&sp277=17472',
    'גבעת הרצל': 'https://www.ad.co.il/nadlanrent?sp275=17413&sp276=17414&sp277=17622',
    'רמת החייל': 'https://www.ad.co.il/nadlanrent?sp275=17413&sp276=17414&sp277=18340'
}

all_links = []

for name, url in urls.items():
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    current_links = []

    # שליפה של כל תגיות <a>
    for a_tag in soup.find_all("a", href=True):
        h2 = a_tag.find("h2", class_="card-title mb-0 mb-sm-1")

        if h2 and "תל אביב" in h2.get_text() and name in h2.get_text():
            href = a_tag['href']
            full_link = 'https://www.ad.co.il' + href
            current_links.append(full_link)

    print(f'\n{name} – נמצאו {len(current_links)} דירות:')
    for link in current_links:
        print(link)
    
    all_links = all_links + current_links

print(f'\nסה״כ דירות מכל השכונות: {len(all_links)}')



נוה שאנן – נמצאו 20 דירות:
https://www.ad.co.il/ad/16191187
https://www.ad.co.il/ad/16191233
https://www.ad.co.il/ad/16191333
https://www.ad.co.il/ad/16191129
https://www.ad.co.il/ad/16191144
https://www.ad.co.il/ad/16190911
https://www.ad.co.il/ad/16053810
https://www.ad.co.il/ad/15596991
https://www.ad.co.il/ad/14993059
https://www.ad.co.il/ad/16191242
https://www.ad.co.il/ad/16191446
https://www.ad.co.il/ad/13402199
https://www.ad.co.il/ad/13752134
https://www.ad.co.il/ad/13746633
https://www.ad.co.il/ad/13227494
https://www.ad.co.il/ad/13228344
https://www.ad.co.il/ad/12264270
https://www.ad.co.il/ad/12394265
https://www.ad.co.il/ad/12454660
https://www.ad.co.il/ad/6640566

גבעת הרצל – נמצאו 7 דירות:
https://www.ad.co.il/ad/16191350
https://www.ad.co.il/ad/15101633
https://www.ad.co.il/ad/16165490
https://www.ad.co.il/ad/15711674
https://www.ad.co.il/ad/14897671
https://www.ad.co.il/ad/14897673
https://www.ad.co.il/ad/13156526

רמת החייל – נמצאו 3 דירות:
https://www.ad.co.il/ad/15

In [137]:
def extract_apartment_data(url):
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')

    data = {
        'property_type': None,
        'neighborhood': None,
        'address': None,
        'room_num': None,
        'floor': None,
        'area': None,
        'garden_area': None,
        'num_of_payments': None,
        'monthly_arnona': None,
        'building_tax': None,
        'days_to_enter': None,
        'total_floors': None,
        'price': None,
        'description': None,
        'has_parking': 0,
        'has_stotsge': 0,
        'elevator': 0,
        'ac': 0,
        'handicap': 0,
        'has_bars': 0,
        'has_safe_room': 0,
        'has_balcon': 0,
        'is_furnished': 0,
        'is_renovated': 0,
        'num_of_images': None,
        'distance_from_center': None
    }

    try:
        # מחיר
        tags = soup.find_all("h2", class_="card-title")
        if len(tags) >= 2:
            price_tag = tags[1]
            price_text = price_tag.get_text(strip=True).replace(",", "").replace("₪", "").replace(" ", "")
            data['price'] = float(price_text)

        # טבלת פרטים
        table = soup.find("table", class_="table table-sm mb-4")
        rows = table.find_all("tr") if table else []

        hebrew_to_english = {
            "פרטי הנכס": "property_type",
            "שכונה": "neighborhood",
            "כתובת": "address",
            "חדרים": "room_num",
            "קומה": "floor",
            "שטח בנוי": "area",
            "שטח גינה": "garden_area",
            "תשלומים בשנה": "num_of_payments",
            "ארנונה בחודש": "monthly_arnona",
            "ועד בית בחודש": "building_tax",
            "תאריך כניסה": "enter_date"
        }

        for row in rows:
            cells = row.find_all("td")
            if len(cells) == 2:
                heb_key = cells[0].get_text(strip=True)
                val = cells[1].get_text(strip=True)

                if heb_key in hebrew_to_english:
                    eng_key = hebrew_to_english[heb_key]

                    # קומות בבניין מתוך "קומה X מתוך Y"
                    if eng_key == "floor":
                        numbers = re.findall(r"\d+", val)
                        if len(numbers) >= 2:
                            data["total_floors"] = int(numbers[1])
                        if len(numbers) >= 1:
                            data["floor"] = int(numbers[0])

                    # תאריך כניסה → ימים
                    elif eng_key == "enter_date":
                        if "מייד" in val:
                            data["days_to_enter"] = 0
                        else:
                            match = re.search(r"\d+", val)
                            data["days_to_enter"] = int(match.group()) if match else None
                    # שדות מספריים רגילים
                    elif eng_key in ["room_num", "area", "num_of_payments",
                                     "monthly_arnona", "building_tax"]:
                        val_clean = re.sub(r"[^\d.]", "", val).strip()
                        try:
                            if val_clean == "":
                                data[eng_key] = None
                            elif eng_key == "room_num":
                                data[eng_key] = float(val_clean)
                            elif eng_key == "garden_area":
                                data[eng_key] = int(float(val_clean)) if val else 0
                            else:
                                data[eng_key] = int(float(val_clean))
                        except:
                            data[eng_key] = None
                    else:
                        data[eng_key] = pd.NA if not val else str(val)

        # שליפה חלופית מהאייקונים
        spans = soup.find_all("span", class_="ms-1")
        for span in spans:
            text = span.get_text(strip=True)

            if "חד" in text and data['room_num'] is None:
                val = re.sub(r"[^\d.]", "", text)
                data['room_num'] = float(val) if val else None

            elif "מ\"ר" in text and data['area'] is None:
                val = re.sub(r"[^\d.]", "", text)
                data['area'] = int(float(val)) if val else None

            elif "קומה" in text and data['floor'] is None:
                val = re.sub(r"[^\d]", "", text)
                data['floor'] = int(val) if val else None

        # תיאור הדירה
        description_p = soup.find("p", class_="text-word-break")
        if description_p:
            data['description'] = description_p.get_text(strip=True)

        # מאפיינים בינאריים לפי איקון
        binary_features = {
            "חניה": "has_parking",
            "מחסן": "has_stotsge",
            "מעלית": "elevator",
            "מזגן": "ac",
            "נגישות": "handicap",
            "סורגים": "has_bars",
            "ממ\"ד": "has_safe_room",
            "מרפסת": "has_balcon",
            "מרוהטת": "is_furnished",
            "משופצת": "is_renovated"
        }

        features_divs = soup.find_all("div", class_="card-icon col-6 d-inline")
        for div in features_divs:
            label_span = div.find("span")
            icon = div.find("i", class_="fa-check")

            if label_span:
                text = label_span.get_text(strip=True)
                text = text.replace("\u200f", "").replace(" ", "")
                if text in binary_features:
                    data[binary_features[text]] = 1 if icon else 0

        # תמונות
        figures = soup.find_all("figure", itemtype="http://schema.org/ImageObject")
        data['num_of_images'] = len(figures)

    except Exception as e:
        print(f"שגיאה כללית בעיבוד {url}: {e}")
        return None

    return data


In [138]:
# הרצת איסוף הדירות
all_data = []

for i, link in enumerate(all_links):
    print(f"✅ דירה {i+1} מתוך {len(all_links)}: {link}")
    info = extract_apartment_data(link)
    if info:
        all_data.append(info)
    time.sleep(1)


✅ דירה 1 מתוך 30: https://www.ad.co.il/ad/16191187
✅ דירה 2 מתוך 30: https://www.ad.co.il/ad/16191233
✅ דירה 3 מתוך 30: https://www.ad.co.il/ad/16191333
✅ דירה 4 מתוך 30: https://www.ad.co.il/ad/16191129
✅ דירה 5 מתוך 30: https://www.ad.co.il/ad/16191144
✅ דירה 6 מתוך 30: https://www.ad.co.il/ad/16190911
✅ דירה 7 מתוך 30: https://www.ad.co.il/ad/16053810
✅ דירה 8 מתוך 30: https://www.ad.co.il/ad/15596991
✅ דירה 9 מתוך 30: https://www.ad.co.il/ad/14993059
✅ דירה 10 מתוך 30: https://www.ad.co.il/ad/16191242
✅ דירה 11 מתוך 30: https://www.ad.co.il/ad/16191446
✅ דירה 12 מתוך 30: https://www.ad.co.il/ad/13402199
✅ דירה 13 מתוך 30: https://www.ad.co.il/ad/13752134
✅ דירה 14 מתוך 30: https://www.ad.co.il/ad/13746633
✅ דירה 15 מתוך 30: https://www.ad.co.il/ad/13227494
✅ דירה 16 מתוך 30: https://www.ad.co.il/ad/13228344
✅ דירה 17 מתוך 30: https://www.ad.co.il/ad/12264270
✅ דירה 18 מתוך 30: https://www.ad.co.il/ad/12394265
✅ דירה 19 מתוך 30: https://www.ad.co.il/ad/12454660
✅ דירה 20 מתוך 30: ht

In [139]:
# מפתח API:
API_KEY = "Enter API key here"

# כתובת המרכז של תל אביב – כיכר דיזינגוף
DESTINATION = "Dizengoff Square, Tel Aviv"


In [140]:
def get_distance_from_center(address, neighborhood):
    base_url = "https://maps.googleapis.com/maps/api/distancematrix/json?"

    def fetch_distance(origin):
        params = {
            "origins": origin,
            "destinations": DESTINATION,
            "key": API_KEY,
            "units": "metric",
            "language": "he"
        }
        try:
            response = requests.get(base_url + urllib.parse.urlencode(params))
            data = response.json()
            return data['rows'][0]['elements'][0]['distance']['value']  
        except Exception as e:
            return None

    #  לפי כתובת מלאה
    distance = fetch_distance(address)

    # אם לא עבד, לפי שכונה
    if distance is None:
        distance = fetch_distance(neighborhood)

    return distance


In [141]:
column_order = [
    'property_type', 'neighborhood', 'address', 'room_num', 'floor',
    'area', 'garden_area', 'days_to_enter', 'num_of_payments',
    'monthly_arnona', 'building_tax', 'total_floors', 'description',
    'has_parking', 'has_stotsge', 'elevator', 'ac', 'handicap',
    'has_bars', 'has_safe_room', 'has_balcon', 'is_furnished', 'is_renovated',
    'price', 'num_of_images', 'distance_from_center'
]

# יצירת טבלת DataFrame
df = pd.DataFrame(all_data)[column_order]

In [142]:

# הוספת המרחק לעמודה חדשה
# df['distance_from_center'] = df['address'].apply(get_distance_from_center)
df['distance_from_center'] = df.apply(
    lambda row: get_distance_from_center(row['address'], row['neighborhood']),
    axis=1
)


In [143]:
# המרה לעמודות טקסט לטיפוס string של pandas
string_columns = [
    'property_type', 'neighborhood', 'address', 'description'
]
df[string_columns] = df[string_columns].astype('string')

# המרה לעמודות מספר שלם לטיפוס Int64 שתומך גם ב-NA
int_columns = [
    'floor', 'garden_area', 'num_of_payments',
    'monthly_arnona', 'building_tax', 'days_to_enter', 'total_floors'
]
df[int_columns] = df[int_columns].astype('Int64')
df['garden_area'] = df['garden_area'].fillna(0).astype('Int64')
df['total_floors'] = df['total_floors'].fillna(0).astype('Int64')
df['floor'] = df['floor'].fillna(0).astype('Int64')
df['distance_from_center'] = df['distance_from_center'].astype('float64')


In [144]:
# שמירת הקובץ  
df.to_csv("Project_Part_1.csv", index=False, encoding='utf-8-sig')

print("✅ הקובץ נשמר בהצלחה!")

✅ הקובץ נשמר בהצלחה!


In [145]:
print(df.dtypes)

property_type           string[python]
neighborhood            string[python]
address                 string[python]
room_num                       float64
floor                            Int64
area                             int64
garden_area                      Int64
days_to_enter                    Int64
num_of_payments                  Int64
monthly_arnona                   Int64
building_tax                     Int64
total_floors                     Int64
description             string[python]
has_parking                      int64
has_stotsge                      int64
elevator                         int64
ac                               int64
handicap                         int64
has_bars                         int64
has_safe_room                    int64
has_balcon                       int64
is_furnished                     int64
is_renovated                     int64
price                          float64
num_of_images                    int64
distance_from_center     