## Project: Rental Apartment Analysis Tel Aviv - Jaffa  
### Neighborhood 1 - The Big Block  
### Neighborhood 2 - Central Jaffa, West of Jerusalem Boulevard  
### Presented by:  
### Amichay Nager -316225986 
### Tair Mimon-322240615

#### Import libraries

In [1]:
!pip install googlemaps
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
from datetime import datetime
import getpass
import numpy as np 
import googlemaps



#### Scrape the advertisement details to build the dataset

In [2]:

# Listing URL that the scraper pages through, and the header sent with every request.
BASE_URL = "https://www.ad.co.il/nadlanrent?sp275=17413&sp277=18262"
headers = {"User-Agent": "Mozilla/5.0"}

# (Hebrew label shown on the ad page, DataFrame column name) pairs.
# Kept as an ordered tuple so the resulting dict preserves this exact order.
_LABEL_COLUMN_PAIRS = (
    ("פרטי הנכס", "property_type"),
    ("אזור", "Region"),
    ("עיר", "City"),
    ("שכונה", "neighborhood"),
    ("כתובת", "address"),
    ("קומה", "floor"),
    ("שטח בנוי", "area"),
    ("תאריך כניסה", "days_to_enter"),
    ("חדרים", "room_num"),
    ("תשלומים בשנה", "num_of_payments"),
    ("ארנונה בחודש", "monthly_arnona"),
    ("ועד בית בחודש", "building_tax"),
    ("על עמודים", "on_pillars"),
    ("מרוהטת", "is_furnished"),
    ("מזגן", "ac"),
    ("חניה", "has_parking"),
    ('ממ"ד', "has_safe_room"),
    ("מרפסת", "has_balcon"),
    ("נגישות", "handicap"),
    ("סורגים", "has_bars"),
    ("מעלית", "elevator"),
    ("מחסן", "has_storage"),
    ("מרפסת שמש", "sun_balcony"),
    ("משופצת", "is_renovated"),
    ("שטח גינה", "garden_area"),
)

# Lookup used to translate the Hebrew labels into column names.
column_mapping = dict(_LABEL_COLUMN_PAIRS)

# Crawl the listing pages one by one, open every ad's detail page and collect
# one dict per ad located in one of the two target neighborhoods.
REQUEST_TIMEOUT = 15  # seconds to wait for the site before giving up on a request
TARGET_NEIGHBORHOODS = ("מרכז יפו מערבית לשדרות ירושלים", "הגוש הגדול")

page = 1
properties = []
previous_count = 0

while True:
    print(f" אוספים נתונים מעמוד {page}...")
    url = f"{BASE_URL}&pageindex={page}" if page > 1 else BASE_URL
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # A network failure is treated like a bad status code: stop collecting.
        response = None

    if response is None or response.status_code != 200:
        print(" שגיאה בטעינת הדף. מפסיקים את האיסוף.")
        break

    soup = BeautifulSoup(response.text, "html.parser")
    ads = soup.find_all("div", class_="card-block")

    # Stop when the page has no ads or as many ads as the previous page.
    # NOTE(review): two consecutive pages of equal size also end the crawl -
    # confirm this matches how the site behaves past its last page.
    if not ads or len(ads) == previous_count:
        print(" אין עוד נתונים - סיימנו לאסוף!")
        break

    for ad in ads:
        data = {"num_of_images": int(ad.get("data-images", 0))}
        card_body = ad.find("div", class_="card-body p-md-3")
        link_tag = card_body.find("a") if card_body else None

        if not link_tag or "href" not in link_tag.attrs:
            print(" לא נמצא קישור לדירה. מדלג...")
            continue

        link = "https://www.ad.co.il" + link_tag["href"]
        try:
            detail_response = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            detail_response = None

        if detail_response is None or detail_response.status_code != 200:
            print(f" שגיאה בטעינת הדף הפנימי: {link}")
            continue

        detail_soup = BeautifulSoup(detail_response.text, "html.parser")

        # Two-cell rows of the details table: label -> value.
        # A page without the table yields no rows and is skipped below because
        # it has no neighborhood.
        details_table = detail_soup.find("table", class_="table table-sm mb-4")
        rows = details_table.find_all("tr") if details_table else []
        for row in rows:
            cells = row.find_all("td")
            if len(cells) == 2:
                key = cells[0].text.strip()
                value = cells[1].text.strip()
                if key in column_mapping:
                    data[column_mapping[key]] = value

        # Keep only ads from the neighborhoods under study.
        if data.get("neighborhood") not in TARGET_NEIGHBORHOODS:
            continue

        # Keep apartments that have no address, marking the address as missing.
        if "address" not in data:
            data["address"] = np.nan

        # Price: read from the second "card-title" heading of the header row.
        price_box = detail_soup.find("div", class_="d-flex justify-content-between")
        titles = price_box.find_all("h2", class_="card-title") if price_box else []
        if len(titles) > 1:
            price_text = titles[1].text.replace("₪", "").replace(",", "").strip()
            try:
                data["price"] = float(price_text)
            except ValueError:
                # Non-numeric price text: record the price as missing.
                data["price"] = np.nan

        # Boolean features: an icon carrying the "disabled" class means 0, otherwise 1.
        for feature in detail_soup.find_all("div", class_="card-icon"):
            label = feature.find("span")
            if label is None:
                continue
            hebrew_name = label.text.strip()
            english_name = column_mapping.get(hebrew_name, hebrew_name)
            data[english_name] = 0 if "disabled" in feature["class"] else 1

        # Description: text of the first "p-3" block with whitespace collapsed.
        description_box = detail_soup.find("div", class_="p-3")
        if description_box is not None:
            data["description"] = " ".join(description_box.text.split())
        else:
            data["description"] = np.nan

        properties.append(data)

    previous_count = len(ads)
    page += 1
    time.sleep(1)  # pause between listing pages

print(f" סיימנו! נשמרו {len(properties)} דירות.")

# Fixed column order of the final table; columns that were not scraped
# (e.g. total_floors, distance_from_center) start out empty and are filled later.
columns = [
    "property_type", "neighborhood", "address", "room_num", "floor", "area",
    "garden_area", "days_to_enter", "num_of_payments", "monthly_arnona",
    "building_tax", "total_floors", "description", "has_parking", "has_storage",
    "elevator", "ac", "handicap", "has_bars", "has_safe_room", "has_balcon",
    "is_furnished", "is_renovated", "price", "num_of_images",
    "distance_from_center",
]

df = pd.DataFrame(properties, columns=columns)

# Floor data processing.
# Replace the word for "ground" wherever it appears in the text, not only when
# it is the whole cell: otherwise "קרקע מתוך 4" would reach the regex unchanged
# and the total floor count (4) would be captured as the apartment's floor.
df['floor'] = df['floor'].str.replace('קרקע', '0', regex=False)
# "<floor> מתוך <total>" -> two numeric columns; the total part is optional.
df[['floor', 'total_floors']] = df['floor'].str.extract(r'(\d+)\s*(?:מתוך\s*(\d+))?').astype('float')
df['floor'] = df['floor'].astype('Int64')
df['total_floors'] = df['total_floors'].astype('Int64')

# Current date
# NOTE(review): not used anywhere in this cell - days_to_enter keeps the raw
# entry-date text apart from the "immediate" case handled below.
today = datetime.today()


def _normalize_days_to_enter(value):
    """Map immediate entry to 0, empty/missing values to pd.NA, keep the rest."""
    if value == 'מיידית':
        return 0
    if pd.isna(value) or value == '':
        return pd.NA
    return value


# Convert "immediate" entry to 0
df['days_to_enter'] = df['days_to_enter'].apply(_normalize_days_to_enter)

# Any property type outside the allowed list is relabelled as the general category.
allowed_types = ["דירה", "דירת גן", "בית פרטי/ קוטג'", "גג/פנטהאוז", "מגרשים", "דופלקס", "תיירות ונופש", "כללי"]
df["property_type"] = df["property_type"].where(df["property_type"].isin(allowed_types), "כללי")

# Display all columns when a DataFrame is printed
pd.set_option('display.max_columns', None)


 אוספים נתונים מעמוד 1...
 אוספים נתונים מעמוד 2...
 אוספים נתונים מעמוד 3...
 אין עוד נתונים - סיימנו לאסוף!
 סיימנו! נשמרו 29 דירות.


#### Compute each apartment's driving distance from the city center (Google Routes API) and add it to the data

In [3]:
def add_distance_from_center(df, api_key, center_address="כיכר דיזנגוף, תל אביב",
                             request_delay=0.2, timeout=10):
    """Add a ``distance_from_center`` column holding the driving distance in meters.

    For every row the ``address`` is looked up first; when it is missing or the
    lookup yields no route, the ``neighborhood`` is used instead. Rows for
    which neither lookup succeeds get NaN.

    Args:
        df: DataFrame with an ``address`` and/or ``neighborhood`` column.
        api_key: Google API key sent in the ``X-Goog-Api-Key`` header.
        center_address: Destination address the distance is measured to.
        request_delay: Seconds to pause after every API request.
        timeout: Seconds to wait for a response before treating the request
            as failed.

    Returns:
        The same ``df`` object, modified in place, with the float column
        ``distance_from_center`` added.
    """
    import time

    import pandas as pd

    routes_url = "https://routes.googleapis.com/directions/v2:computeRoutes"

    def format_address(addr):
        """Return a trimmed address that mentions Tel Aviv, or None if unusable."""
        if not isinstance(addr, str) or not addr.strip():
            return None
        addr = addr.strip()
        if "תל אביב" not in addr:
            addr += ", תל אביב"
        return addr

    def get_distance(origin, destination, api_key):
        """Return the first route's distance in meters as a float, or None on failure."""
        # Imported here so that frames without any usable address never need
        # the HTTP client.
        import requests

        headers = {
            'Content-Type': 'application/json',
            'X-Goog-Api-Key': api_key,
            'X-Goog-FieldMask': 'routes.distanceMeters'
        }
        body = {
            "origin": {"address": origin},
            "destination": {"address": destination},
            "travelMode": "DRIVE",
            "routingPreference": "TRAFFIC_AWARE"
        }

        try:
            response = requests.post(routes_url, headers=headers, json=body, timeout=timeout)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None

        try:
            routes = response.json().get('routes')
        except (ValueError, AttributeError):
            # Body is not JSON, or not a JSON object.
            return None
        if not routes:
            return None

        meters = routes[0].get('distanceMeters')
        return float(meters) if meters is not None else None

    distances = []
    for _, row in df.iterrows():
        distance = None

        address = format_address(row.get("address", ""))
        if address:
            distance = get_distance(address, center_address, api_key)
            # Throttle after every request. Previously the pause was only
            # reached when both lookups had failed.
            time.sleep(request_delay)
            if distance is None:
                print(f"שגיאה: לא נמצאה תוצאה עבור הכתובת {address}")

        if distance is None:
            neighborhood = format_address(row.get("neighborhood", ""))
            if neighborhood:
                distance = get_distance(neighborhood, center_address, api_key)
                time.sleep(request_delay)
                if distance is None:
                    print(f"שגיאה: לא נמצאה תוצאה גם עבור השכונה {neighborhood}")

        distances.append(distance)

    # None entries become NaN; the explicit dtype keeps the column float even
    # when every lookup failed or the frame is empty.
    df["distance_from_center"] = pd.Series(distances, index=df.index, dtype="float64")

    return df



In [4]:
# Ask for the Google API key without echoing it to the notebook.
# An explicit prompt replaces getpass's default "Password: ", and surrounding
# whitespace from a pasted key is removed before it is sent in a request header.
API_KEY = getpass.getpass("Google Routes API key: ").strip()
df = add_distance_from_center(df, API_KEY)

········


#### Clean up the column data types

In [5]:
# Feature-flag columns that are stored as int32
bool_columns = [
    'has_parking', 'has_storage', 'elevator', 'ac', 'handicap',
    'has_bars', 'has_safe_room', 'has_balcon', 'is_furnished',
    'is_renovated'
]

# Restrict the list to the flags that are present in the table
existing_bool_columns = [name for name in bool_columns if name in df.columns]

# A missing flag counts as 0 before the cast
for flag in existing_bool_columns:
    without_gaps = df[flag].fillna(0)
    df[flag] = without_gaps.astype('int32')

# Columns that are stored as nullable Int64
columns_to_convert = [
     'garden_area' , 'num_of_payments',
     'building_tax','num_of_images'
]

# Values that cannot be parsed as numbers become 0 before the cast
for name in columns_to_convert:
    if name not in df.columns:
        continue
    as_numbers = pd.to_numeric(df[name], errors='coerce')
    df[name] = as_numbers.fillna(0).astype('Int64')

# room_num becomes float; unparseable values turn into NaN
df['room_num'] = pd.to_numeric(df['room_num'], errors='coerce')

# Fill any remaining gaps in garden_area, when the column exists
if 'garden_area' in df.columns:
    df['garden_area'] = df['garden_area'].fillna(0)

# Free-text columns use the pandas string dtype
text_columns = ['property_type', 'neighborhood', 'address', 'description']
df[text_columns] = df[text_columns].astype('string')

In [6]:
# Display the data (the notebook renders the cell's last expression)
df


Unnamed: 0,property_type,neighborhood,address,room_num,floor,area,garden_area,days_to_enter,num_of_payments,monthly_arnona,building_tax,total_floors,description,has_parking,has_storage,elevator,ac,handicap,has_bars,has_safe_room,has_balcon,is_furnished,is_renovated,price,num_of_images,distance_from_center
0,דירה,הגוש הגדול,יחזקאל שטרייכמן 2,5.0,3,118,0,,12,,0,9,"יחזקאל שטרייכמן 2 13,000 ₪ הגוש הגדול, תל אביב...",1,1,1,1,1,0,1,1,0,0,13000.0,6,4826.0
1,דירה,הגוש הגדול,אמיר גלבוע 9,3.0,6,80,0,,12,,0,11,"אמיר גלבוע 9 10,950 ₪ הגוש הגדול, תל אביב יפו ...",1,1,1,1,1,0,1,1,1,0,10950.0,8,4761.0
2,דירה,הגוש הגדול,מרק שאגאל 10,3.0,4,85,0,0.0,12,600.0,420,6,"מרק שאגאל 10 10,800 ₪ הגוש הגדול, תל אביב יפו ...",1,1,1,1,0,0,1,1,0,1,10800.0,4,4829.0
3,דירה,הגוש הגדול,ארתור רובינשטיין 8,3.0,5,104,6,,12,,0,10,"ארתור רובינשטיין 8 9,500 ₪ הגוש הגדול, תל אביב...",1,0,1,1,1,0,1,1,0,1,9500.0,10,4769.0
4,דירה,הגוש הגדול,אייזק שטרן 17,4.5,2,125,0,,12,990.0,575,12,"אייזק שטרן 17 12,000 ₪ הגוש הגדול, תל אביב יפו...",1,1,1,1,1,0,1,1,0,0,12000.0,10,4336.0
5,דירה,הגוש הגדול,שדרות לוי אשכול 69,3.0,5,102,12,,12,700.0,490,12,"שדרות לוי אשכול 69 9,000 ₪ הגוש הגדול, תל אביב...",1,0,1,1,1,0,1,1,0,0,9000.0,9,4810.0
6,דירה,הגוש הגדול,שדרות לוי אשכול 63,4.5,2,125,10,,12,950.0,725,5,"שדרות לוי אשכול 63 12,500 ₪ הגוש הגדול, תל אבי...",1,1,1,1,1,1,1,1,1,0,12500.0,10,4511.0
7,דירה,הגוש הגדול,שטרייכמן יחזקאל 18,4.0,2,130,0,0.0,12,1500.0,600,7,"שטרייכמן יחזקאל 18 12,600 ₪ הגוש הגדול, תל אבי...",1,1,1,1,0,1,1,1,1,1,12600.0,4,4470.0
8,כללי,הגוש הגדול,ארתור רובינשטיין,5.0,9,250,0,0.0,12,,0,10,"ארתור רובינשטיין 25,500 ₪ הגוש הגדול, תל אביב ...",1,1,1,1,1,0,1,1,0,1,25500.0,14,4724.0
9,דירה,הגוש הגדול,,4.5,1,112,0,,12,,0,9,"הגוש הגדול 12,500 ₪ הגוש הגדול, תל אביב יפו 4....",1,1,1,1,1,0,1,1,0,0,12500.0,2,4778.0


In [7]:
# List all column names
df.columns

Index(['property_type', 'neighborhood', 'address', 'room_num', 'floor', 'area',
       'garden_area', 'days_to_enter', 'num_of_payments', 'monthly_arnona',
       'building_tax', 'total_floors', 'description', 'has_parking',
       'has_storage', 'elevator', 'ac', 'handicap', 'has_bars',
       'has_safe_room', 'has_balcon', 'is_furnished', 'is_renovated', 'price',
       'num_of_images', 'distance_from_center'],
      dtype='object')

In [8]:
# Show the dtype of each column
df.dtypes

property_type            string
neighborhood             string
address                  string
room_num                float64
floor                     Int64
area                     object
garden_area               Int64
days_to_enter            object
num_of_payments           Int64
monthly_arnona           object
building_tax              Int64
total_floors              Int64
description              string
has_parking               int32
has_storage               int32
elevator                  int32
ac                        int32
handicap                  int32
has_bars                  int32
has_safe_room             int32
has_balcon                int32
is_furnished              int32
is_renovated              int32
price                   float64
num_of_images             Int64
distance_from_center    float64
dtype: object

#### Save the data to a CSV file


In [9]:
# Save the data to CSV without the row index. The 'utf-8-sig' encoding writes
# a byte-order mark at the start of the file - presumably so spreadsheet tools
# detect the encoding of the Hebrew text (TODO confirm).
df.to_csv('apartments_details.csv', index=False, encoding='utf-8-sig')