In [26]:
from bs4 import BeautifulSoup
import requests
import tqdm


URL = "https://www.mubawab.ma/en/cc/real-estate-for-rent"

# Fetch the listings landing page once. The timeout (seconds) keeps a stalled
# connection from hanging the kernel, and raise_for_status() makes a 4xx/5xx
# response fail here instead of being parsed as if it were a listings page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

In [95]:
import logging
from time import sleep
from random import uniform
import re
import pandas as pd
from urllib.parse import unquote

def get_links(url, max_pages=1):
    """
    Scrapes property links from mubawab.ma and handles pagination.

    Pages are requested as ``{url}:p:{page}``, starting at page 1. Pagination
    stops early when a page yields no matching listings or when a request
    fails; links collected up to that point are still returned.

    Args:
        url (str): The base URL containing all the property listings.
        max_pages (int, optional): Number of pages to scrape. Defaults to 1.

    Returns:
        list: URLs of all the specific property pages to be scraped.
    """
    prop_links = []
    page = 1

    while page <= max_pages:
        page_url = f"{url}:p:{page}"

        try:
            # Without a timeout a stalled connection would block forever; a
            # timeout surfaces as a RequestException and is handled below.
            response = requests.get(page_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            listings = soup.find_all('div', class_="listingBox sPremium")
            if not listings:
                logging.info(f"No listings found on page {page}. Stopping pagination.")
                break

            for listing in listings:
                # The property URL is carried in the listing's `linkref` attribute.
                link = listing.get('linkref')
                if link:
                    prop_links.append(link)

            page += 1
            # Random pause between page requests.
            sleep(uniform(1, 3))

        except requests.exceptions.HTTPError as http_err:
            logging.error(f"HTTP error on page {page}: {http_err}")
            break
        except requests.RequestException as req_err:
            logging.error(f"Request error on page {page}: {req_err}")
            break

    return prop_links

def get_details(links):
    """
    Scrapes the important features of each property.

    Each link is fetched and parsed independently. A property whose page
    cannot be fetched or parsed is logged and skipped, so the result may
    contain fewer rows than ``links``.

    Args:
        links (list[str]): The URLs of each property to be scraped.

    Returns:
        DataFrame: A pandas DataFrame with one row per successfully scraped
        property and the columns title, description, property_type, city,
        area, size, rooms, bedrooms, bathrooms, price, features, condition,
        age, Orientation, flooring, floor_number, number_of_floors, lat,
        lon and url.
    """
    full_list = []

    for link in tqdm.tqdm(links, desc="Scraping property details"):
        try:
            # Bound the wait so one stalled connection cannot hang the run,
            # and fail on 4xx/5xx instead of parsing an error page.
            response = requests.get(link, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # These three headings are required: if any is missing, the
            # attribute access raises and the property is skipped below.
            raw_price = soup.find('h3', class_='orangeTit').text.strip()
            raw_area_text = soup.find('h3', class_='greyTit').text.strip()
            raw_title = soup.find('h1', class_='searchTitle').text.strip()

            price = clean_integer(raw_price)
            title = clean_text(raw_title)

            area, city = parse_area_and_city(raw_area_text)

            text_content = None
            description_div = soup.find('div', class_='wordBreak')
            if description_div:
                text_content = description_div.get_text(separator=" ").strip()
            else:
                # Try backup option if structure varies: take the first
                # paragraph longer than 50 characters.
                for p in soup.find_all('p'):
                    if len(p.text.strip()) > 50:  # Avoid short/label-like p tags
                        text_content = p.get_text(separator=" ").strip()
                        break

            # Label -> value pairs from the main features block.
            label_value = {}
            features_block = soup.find("div", class_="adFeatures")
            if features_block:
                for content in features_block.select("div.adMainFeatureContent"):
                    label_tag = content.find("p", class_="adMainFeatureContentLabel")
                    value_tag = content.find("p", class_="adMainFeatureContentValue")
                    if not label_tag or not value_tag:
                        continue

                    label = clean_att(label_tag.get_text())
                    value = clean_att(value_tag.get_text())
                    label_value[label] = value

            # Missing labels simply come back as None.
            prop_type = label_value.get("Type of property")
            condition = label_value.get("Condition")
            age = label_value.get("Age")
            orientation = label_value.get("Orientation")
            flooring = label_value.get("Flooring")

            # For apartments vs villas
            floor_number = label_value.get("Floor number")
            number_of_floors = label_value.get("Number of floors")

            size = rooms = bedrooms = bathrooms = None
            # This is extra info like size, number of rooms etc.
            details = soup.find_all('div', class_='adDetailFeature')

            for detail in details:
                span = detail.find('span')
                if span is None:
                    # Nothing to read from this entry; skip it rather than
                    # losing the whole property to an AttributeError.
                    continue

                text = detail.text
                value = span.text.strip()

                # Check for size (since it's the first one with 'm²')
                if 'm²' in text:
                    size = clean_integer(value)

                # Check for number of rooms (called pieces on site);
                # 'Piece' also matches 'Pieces'.
                if 'Piece' in text:
                    rooms = clean_integer(value)

                # Check for number of bedrooms; the match is case-sensitive,
                # so 'Room' does not match inside 'Bathroom'.
                if 'Room' in text:
                    bedrooms = clean_integer(value)

                # Check for number of bathrooms
                if 'Bathroom' in text:
                    bathrooms = clean_integer(value)

            # If there are no rooms extracted, the function searches the description
            if rooms is None and text_content:
                rooms = clean_rooms(text_content)

            features = soup.find_all('p', class_='fSize11 centered')
            feature_list = [clean_text(feature.text) for feature in features]
            feature_str = ', '.join(filter(None, feature_list))

            lat, lon = extract_coordinates(soup)

            property_details = {
                'title': title,
                'description': text_content,
                'property_type': prop_type,
                'city': city,
                'area': area,
                'size': size,
                'rooms': rooms,
                'bedrooms': bedrooms,
                'bathrooms': bathrooms,
                'price': price,
                'features': feature_str,
                'condition': condition,
                # Pass the raw value: a missing age stays None instead of
                # being turned into the string 'None'.
                'age': clean_age(age),
                'Orientation': orientation,
                'flooring': flooring,
                'floor_number': floor_number,
                'number_of_floors': number_of_floors,
                'lat': lat,
                'lon': lon,
                'url': link,
            }

            full_list.append(property_details)
            # Random pause between property requests.
            sleep(uniform(1, 3))

        except Exception as e:
            # Best effort: log the failure and move on to the next property.
            logging.error(f'Error fetching property data from {link}: {e}')

    return pd.DataFrame(full_list)

def clean_integer(number_str):
    """
    Turns a numeric-looking string into an int by dropping every character
    that is not a digit and parsing what is left.

    All remaining digits are concatenated, so '7,500 DH' becomes 7500.

    Args:
        number_str (str): Raw text that contains a number.

    Returns:
        int or None: The parsed integer, or None when the input is empty or
        contains no digits at all.
    """
    if number_str:
        # Remove all non-digit characters
        digits_only = re.sub(r'[^\d]', '', number_str)
        try:
            return int(digits_only)
        except ValueError:
            # Nothing but non-digits in the input.
            return None
    return None
    
def clean_text(text):
    """
    Strips surrounding whitespace from ``text``.

    Returns:
        str or None: The stripped text, or None when ``text`` is empty or None.
    """
    if not text:
        return None
    return text.strip()

def clean_att(s: str) -> str:
    """
    Collapses every run of whitespace in ``s`` to a single space and removes
    leading and trailing whitespace.
    """
    # split() with no argument already discards leading/trailing whitespace.
    words = s.split()
    return " ".join(words)

def clean_age(age_str):
    """
    Normalizes an age label to a 'min-max' range.

    A range is produced only when the label mentions 'years' (in any case)
    and contains exactly two numbers.

    Args:
        age_str (str): The age string to clean.

    Returns:
        str or None: The age range in 'min-max' format, or None if the
        conditions are not met.
    """
    if age_str and 'years' in age_str.lower():
        found = re.findall(r'\d+', age_str)
        if len(found) == 2:
            # Going through int() drops any leading zeros.
            low, high = (int(n) for n in found)
            return f"{low}-{high}"
    # Empty input, no 'years', or not exactly two numbers.
    return None
        
def clean_rooms(description):
    """
    Finds a room count in free text.

    Looks, case-insensitively, for a number followed by 'room' or 'rooms',
    optionally with one word in between (e.g. '3 lovely rooms').

    Args:
        description (str): The text to search.

    Returns:
        int or None: The first such number, or None when nothing matches.
    """
    found = re.search(r'(\d+)\s*(?:\w+\s)?rooms?\b', description, re.IGNORECASE)
    return int(found.group(1)) if found else None

def clean_condition(cond_str):
    """
    Normalizes a listing's condition label to a short category.

    Args:
        cond_str (str): The raw condition label.

    Returns:
        str or None: 'Good' for 'Good condition', 'Old' for 'Due for reform',
        'New' for 'New'; None for empty input or any other label.
    """
    if not cond_str:
        return None

    if cond_str == 'Good condition':
        return 'Good'
    if cond_str == 'Due for reform':
        # This branch used to compare (`==`) where an assignment was meant,
        # so the raw label was returned unchanged instead of 'Old'.
        return 'Old'
    if cond_str == 'New':
        return cond_str
    return None
    
def parse_area_and_city(raw_area_text):
    """
    Splits a location label of the form '<area> in <city>' into its parts.

    The split happens on the last whitespace-delimited 'in' (matched
    case-insensitively). When there is no such separator, the whole stripped
    label is treated as the city.

    Args:
        raw_area_text (str): The raw location label.

    Returns:
        tuple: (area, city). Both are None for empty input; area is None
        when no ' in ' separator is present.
    """
    if not raw_area_text:
        logging.warning("raw_area_text is None or empty.")
        return None, None

    raw_area_text = raw_area_text.strip()
    match = re.search(r'^(.*)\s+in\s+(.*)$', raw_area_text, re.IGNORECASE)

    if match is None:
        # No separator: the label is just the city.
        area, city = None, raw_area_text
        logging.debug(f"No 'in' found. Set area to None and city to '{city}' from raw_area_text: '{raw_area_text}'")
        return area, city

    area, city = (part.strip() for part in match.groups())
    logging.debug(f"Parsed area: '{area}', city: '{city}' from raw_area_text: '{raw_area_text}'")
    return area, city

def extract_coordinates(soup):
    """
    Extracts latitude and longitude from a Waze link embedded in a script tag.

    Scans the page's <script> tags for a ``waze.com/ul?ll=<lat>,<lon>`` URL
    (the pair may be percent-encoded) and returns the first pair that parses
    as two floats.

    Args:
        soup (BeautifulSoup): The parsed property page.

    Returns:
        tuple: (lat, lon) as floats, or (None, None) when no script contains
        a parseable Waze coordinate pair.
    """
    for script in soup.find_all("script"):
        content = script.string
        if not content or "waze.com/ul" not in content:
            continue

        match = re.search(r"waze\.com/ul\?ll=([^&]+)", content)
        if not match:
            continue

        # The comma between lat and lon may be percent-encoded (%2C).
        parts = unquote(match.group(1)).split(",")
        if len(parts) != 2:
            continue

        try:
            return float(parts[0]), float(parts[1])
        except ValueError:
            # Not numeric; keep looking in the remaining scripts.
            continue

    # No usable coordinates: report that explicitly rather than raising, so
    # the caller can still keep the rest of the property's data.
    return None, None

In [96]:
# Collect the property links from the first results page only, then scrape
# every linked property page into a DataFrame.
props = get_links(URL, max_pages=1)
dets = get_details(props)

Scraping property details: 100%|██████████| 8/8 [00:19<00:00,  2.47s/it]


In [97]:
# Display the scraped properties DataFrame.
dets

Unnamed: 0,title,description,property_type,city,area,size,rooms,bedrooms,bathrooms,price,features,condition,age,Orientation,flooring,floor_number,number_of_floors,lat,lon,url
0,Apartments for rent in Anfa. 2 lovely rooms. R...,"Apartments for rent. Price 1,000 DH. 7 living ...",Apartment,Casablanca,Ferme Bretonne (Hay Arraha),50,4,1,1,7500500,"Terrace, 8 m², Garage, 1 Space, Elevator, Sea ...",New,,South,Marble,4th,,33.596176,-7.657471,https://www.mubawab.ma/en/a/8259319/apartments...
1,Apartment for rent in Ain Diab. 3 lovely rooms...,"Nice apartment for rent. Price 24,000 DH. 12 l...",Apartment,Casablanca,Ain Diab,205,12,3,3,22000,"Garden, Terrace, Garage, Elevator, Sea views, ...",New,5-10,West,Marble,6th,,33.579843,-7.690143,https://www.mubawab.ma/en/a/8242902/apartment-...
2,Rent this apartment in Casablanca Finance City...,"Rent your dream apartment. Price 13,000 DH. 5 ...",Apartment,Casablanca,Casablanca Finance City,106,5,2,2,12500,"Garden, Terrace, 30 m², Garage, 1 Space, Eleva...",Good condition,5-10,East,Marble,6th,,33.563095,-7.660584,https://www.mubawab.ma/en/a/8243598/rent-this-...
3,Apartment for rent in Branes 1. Area of 88 m²....,"Ideally located apartment for rent. Price 6,00...",Apartment,Tanger,Branes 1,88,2,2,1,6000,"Garden, Terrace, Garage, Elevator, Concierge, ...",New,,East,,9th,,35.76582,-5.831727,https://www.mubawab.ma/en/a/8257927/apartment-...
4,Apartment for rent in Bourgogne Ouest. Total a...,"Ideal apartment for rent. Price 7,500 DH. 3 li...",Apartment,Casablanca,Bourgogne Ouest,64,3,2,1,7500,"Garden, 64 m², Terrace, 64 m², Elevator, Conci...",Good condition,10-20,,Tiled,2nd,,33.582734,-7.675838,https://www.mubawab.ma/en/a/8267762/apartment-...
5,High quality villa for rent in Riyad. 10 rooms...,High quality amenities with this villa for ren...,Villa,Rabat,Riyad,600,10,4,3,34000,"Garden, Garage, 2 Spaces, Fireplace, Air condi...",Good condition,1-5,South,Marble,,1.0,33.981612,-6.864653,https://www.mubawab.ma/en/a/8267682/high-quali...
6,Great apartment for rent in Racine. 1 Room. St...,Great amenities with this apartment for rent. ...,Apartment,Casablanca,Racine,40,1,1,1,6000,"Terrace, Elevator, Concierge, Box room, Furnis...",Good condition,5-10,West,Marble,4th,,33.592458,-7.648158,https://www.mubawab.ma/en/a/8265675/great-apar...
7,Lovely apartment for rent in Centre Ville. 6 l...,"Beautifull apartment for rent. Price 6,500 DH....",Apartment,Mohammédia,Quartier du Parc,96,6,2,2,6500,"Garden, Terrace, 3 m², Garage, 1 Space, Elevat...",New,1-5,East,Marble,2nd,,33.705307,-7.392885,https://www.mubawab.ma/en/a/8265576/lovely-apa...


In [98]:

# Build one entry per "adFeature" block found in `soup`: either "label" or
# "label||value" when the block carries a second paragraph.
# NOTE(review): this reads the notebook-level `soup`. The only top-level
# assignment visible in this notebook is the listings page fetched in the
# first cell; confirm that is the page this cell is meant to inspect.
features_clean = []

for feat_div in soup.find_all("div", class_="adFeature"):
    # The first matching <p> is read as the label; a second one, when
    # present, is read as its value.
    ps = feat_div.find_all("p", class_="fSize11 centered")
    if not ps:
        continue

    label = clean_text(ps[0].get_text())          # "Garden"
    value = clean_text(ps[1].get_text()) if len(ps) > 1 else None  # "150 m²"

    if value:
        features_clean.append(f"{label}||{value}")  # keep value if present
    else:
        features_clean.append(label)

# TODO: store the features as a list or JSON rather than a single
# comma-joined string, if possible.


In [99]:
# Display the collected feature entries.
features_clean

['Garden',
 'Terrace',
 'Garage',
 'Elevator',
 'Sea views',
 'Satellite dish',
 'Fireplace',
 'Air conditioning',
 'Heating',
 'Security system',
 'Equipped kitchen',
 'Oven']