In [149]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import sleep
import re
from tqdm import tqdm
from random import uniform

#-------------------------PREPARATION-----------------------------#

def prepare_url(payment, location):
    
    base_url = f'https://www.mubawab.ma/en/ct/{location}/real-estate-for-{payment}'

    return base_url

def get_links(url, max_pages=20):
    prop_links = []
    
    page = 1  # Start from the first page
    while page <= max_pages:
        print(f'Scraping links from page {page}...')
        page_url = url + f':p:{page}'
        
        try:
            response = requests.get(page_url)
            response.raise_for_status()  # Raise an error for bad status codes
            
            soup = BeautifulSoup(response.content, 'html.parser')
            listings = soup.find_all('h2', class_='listingTit')

            # If no listings are found, break the loop
            if not listings:
                print(f"No listings found on page {page}. Stopping pagination.")
                break

            for listing in listings:
                try:
                    link_tag = listing.find('a')
                    if link_tag and 'href' in link_tag.attrs:
                        link = link_tag['href']
                        prop_links.append(link)
                    else:
                        print(f"Link not found in listing: {listing}")
                
                except AttributeError as e:
                    print(f"Error finding a link: {e}")

            # Check if there is a "Next" page
            next_page = soup.find('a', class_='arrowDot')
            if not next_page:
                print("No 'Next' page found. Stopping pagination.")
                break
        
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred on page {page}: {http_err}")
            break  # Stop the loop if we hit an HTTP error (like 404)
        
        except requests.RequestException as req_e:
            print(f"Request error on page {page}: {req_e}")
            break  # Stop the loop for other request issues

        print('Sleeping for a bit...')
        sleep(uniform(1, 3))  # Random sleep between 1 and 3 seconds

        # Increment page counter if there is a next page
        page += 1

    return prop_links

#-------------------------FEATURES--------------------------------#

def get_details(links):
    full_list = []
    for counter, link in enumerate(tqdm(links, desc="Fetching property details")):
        try:
            counter += 1
            response = requests.get(link)
            soup = BeautifulSoup(response.content, 'html.parser')

            price = soup.find('h3', class_='orangeTit').text.strip()
            
            area_text = soup.find('h3', class_='greyTit').text.strip()
            pattern = r'^(.*)\sin\s(.*)$'

            # Use re.search to match both the area and the city
            match = re.search(pattern, area_text)

            if match:
                area = match.group(1).strip()  # First group: Area
                city = match.group(2).strip()  # Second group: City
            
            title = soup.find('h1', class_='searchTitle').text.strip()
            
            description = soup.find_all('p', class_='adMainFeatureContentValue')

            description_titles = ['Property Type', 'Condition', 'Age', 'Floor', 'Orientation', 'Floor']
            descriptor_list = [desc.text.strip() for desc in description]

            desc_dict = dict(zip(description_titles, descriptor_list))
            
            size, rooms, bedrooms, bathrooms = None, None, None, None

            details = soup.find_all('div', class_='adDetailFeature')

            for detail in details:
                # Check for size (since it's the first one with 'm²')
                if 'm²' in detail.text:
                    size = detail.find('span').text.strip().replace('m²', '').strip()
                
                # Check for number of pieces
                if 'Pieces' in detail.text:
                    rooms = detail.find('span').text.strip().replace('Pieces', '').strip()
                
                # Check for number of rooms
                if 'Rooms' in detail.text:
                    bedrooms = detail.find('span').text.strip().replace('Rooms', '').strip()
                
                # Check for number of bathrooms
                if 'Bathrooms' in detail.text:
                    bathrooms = detail.find('span').text.strip().replace('Bathrooms', '').strip()
                    
            features = soup.find_all('span', class_='fSize11 centered')
            feature_list = [feature.text.strip() for feature in features]   
            feature_str = ', '.join(feature_list)
                     
            property_details = {
                                'Title': title,
                                'City' : city, 
                                'Area': area, 
                                'Size': size, 
                                'Rooms': rooms, 
                                'Bedrooms': bedrooms, 
                                'Bathrooms': bathrooms, 
                                'Price': price,
                                'Features': feature_str
                                }
            
            all_details = {**property_details, **desc_dict}
            
            full_list.append(all_details)
            
            sleep(uniform(1, 3))
            
        except requests.RequestException as req_e:
            print(f'Error fetching property data: {req_e}')
        except AttributeError as attr_e:
            print(f'Missing element in the page: {attr_e}')
            
    return pd.DataFrame(full_list)

In [150]:
cities = ['casablanca', 'marrakech', 'rabat', 'tanger', 'agadir', 'fès', 'kénitra', 'mohammédia', 'salé', 'bouskoura', 'temara', 'essaouira', 'meknes']

def scrape_property_data(payment, city):
    base_url = prepare_url(payment, city)
    prop_links = get_links(base_url, max_pages=10)
    df = get_details(prop_links)
    
    return df

all_dfs = []

for city in cities:
    print(f'Scraping data for {city.title()}')
    city_df = scrape_property_data('rent', city)
    all_dfs.append(city_df)
    
all_property_data = pd.concat(all_dfs, ignore_index=True)
    

Scraping data for Casablanca
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Al Boustane</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Rio Beach </h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Anfa Blue Living Villas</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Dyour Tamaris</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">DIAZ Residence</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Douirti Invest</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">DIAZ Residence</h2>
Sleeping for a bit...
Scraping links from page 8...
Link not found in listing: <h2 class="listingTit">Missimi Office</h2>
Sleeping for a b

Fetching property details: 100%|██████████| 330/330 [15:01<00:00,  2.73s/it]


Scraping data for Marrakech
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Menzah Annakhil Tranche 1</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Lotissement Jnane El Kheir 1, 2 et 3 à Benguerir</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Signature Luxury Villa 2</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Abouab Al Falah</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Menzah Annakhil Tranche 2</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Signature Luxury Villa 2</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">Oasis Atlas Resort - Villas</h2>
Sleeping for a bit...
Scraping links from page 8..

Fetching property details: 100%|██████████| 330/330 [14:56<00:00,  2.72s/it]


Scraping data for Rabat
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Résidence fermée de villas isolées : 18M Avenue</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Residence Yassmine Temara</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Residence Yassmine Temara</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Résidence fermée de villas isolées : 18M Avenue</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Residence Yassmine Temara</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">Residence Yassmine Temara</h2>
Sleeping for a bit...
Scraping link

Fetching property details: 100%|██████████| 330/330 [14:56<00:00,  2.72s/it]


Scraping data for Tanger
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Aquarelle Residence Marina Bay -B</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Aquarelle Residences Marina Bay</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Les Jardins de Mesnana Golf</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Aquarelle Residence Marina Bay -B</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Aquarelle Residences Marina Bay</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Les Jardins de Mesnana Golf</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">Aquarelle Residences Marina Bay</h2>
Sleeping for a bit...
Scrapi

Fetching property details: 100%|██████████| 330/330 [15:18<00:00,  2.78s/it]


Scraping data for Agadir
Scraping links from page 1...
Sleeping for a bit...
Scraping links from page 2...
Sleeping for a bit...
Scraping links from page 3...
Sleeping for a bit...
Scraping links from page 4...
Sleeping for a bit...
Scraping links from page 5...
Sleeping for a bit...
Scraping links from page 6...
Sleeping for a bit...
Scraping links from page 7...
Sleeping for a bit...
Scraping links from page 8...
Sleeping for a bit...
Scraping links from page 9...
Sleeping for a bit...
Scraping links from page 10...
Sleeping for a bit...


Fetching property details: 100%|██████████| 330/330 [15:28<00:00,  2.81s/it]


Scraping data for Fès
Scraping links from page 1...
Sleeping for a bit...
Scraping links from page 2...
Sleeping for a bit...
Scraping links from page 3...
Sleeping for a bit...
Scraping links from page 4...
Sleeping for a bit...
Scraping links from page 5...
No 'Next' page found. Stopping pagination.


Fetching property details: 100%|██████████| 163/163 [07:22<00:00,  2.72s/it]


Scraping data for Kénitra
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Residence Yassmine Temara</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Residence Yassmine Temara</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Al Baraka</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Residence Yassmine Temara</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Al Baraka</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 8...
No 'Next' page found. Stopping pagination.


Fetching property details: 100%|██████████| 262/262 [11:58<00:00,  2.74s/it]


Scraping data for Mohammédia
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">LARIMAR residence</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Marina Sidi Rehal</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Gardenia Zenata Park</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Anaé Garden &amp; Sea</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Les Beaux Rivages</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Natura</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">Gardenia Zenata Park</h2>
Sleeping for a bit...
Scraping links from page 8...
Link not found in listing: <h2 class="listingTit">Chraibi Real Esta

Fetching property details: 100%|██████████| 322/322 [14:08<00:00,  2.63s/it]


Scraping data for Salé
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Résidence fermée de villas isolées : 18M Avenue</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Résidence fermée de villas isolées : 18M Avenue</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Résidence fermée de villas isolées : 18M Avenue</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Al Baraka</h2>
Sleeping for a bit...
Scraping links from page 7...
No 'Next' page found. Stopping pagination.


Fetching property details: 100%|██████████| 210/210 [09:34<00:00,  2.74s/it]


Scraping data for Bouskoura
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Ahl Loghlam - Moyen standing-Tit Mellil</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Gardenia Zenata Park</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Missimi Living</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Missimi Living</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Les Jardins de Ain Sebâa</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">DIAZ Residence</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">Kanzi Subdivision</h2>
Sleeping for a bit...
Scraping links from page 8...
Link not found in listing: <h2 class="listin

Fetching property details: 100%|██████████| 330/330 [14:56<00:00,  2.72s/it]


Scraping data for Temara
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Al Baraka</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Résidence fermée de villas isolées : 18M Avenue</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Résidence fermée de villas isolées : 18M Avenue</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">Al Baraka</h2>
Sleeping for a bit...
Scraping links from page 7...
Link not found in listing: <h2 class="listingTit">Aday litaamir</h2>
Sleeping for a bit...
Scraping links from page 8...
Link not found in listing: <h2 class="

Fetching property details: 100%|██████████| 283/283 [12:36<00:00,  2.67s/it]


Scraping data for Essaouira
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Residence ALMARWA</h2>
Sleeping for a bit...
Scraping links from page 2...
Link not found in listing: <h2 class="listingTit">Abouab Al Falah</h2>
Sleeping for a bit...
Scraping links from page 3...
Link not found in listing: <h2 class="listingTit">Abouab Al Falah</h2>
Sleeping for a bit...
Scraping links from page 4...
Link not found in listing: <h2 class="listingTit">Abouab Al Falah</h2>
Sleeping for a bit...
Scraping links from page 5...
Link not found in listing: <h2 class="listingTit">Residence ALMARWA</h2>
Sleeping for a bit...
Scraping links from page 6...
Link not found in listing: <h2 class="listingTit">SHEMS AL MADINA - LAND LOTS R 1</h2>
Sleeping for a bit...
Scraping links from page 7...
No 'Next' page found. Stopping pagination.


Fetching property details: 100%|██████████| 209/209 [09:31<00:00,  2.74s/it]


Scraping data for Meknes
Scraping links from page 1...
Sleeping for a bit...
Scraping links from page 2...
Sleeping for a bit...
Scraping links from page 3...
Sleeping for a bit...
Scraping links from page 4...
Sleeping for a bit...
Scraping links from page 5...
No 'Next' page found. Stopping pagination.


Fetching property details: 100%|██████████| 165/165 [07:25<00:00,  2.70s/it]


In [151]:
all_property_data.to_pickle('property_df.pkl')

In [152]:
all_property_data.to_csv('property_data.csv')

In [164]:
all_property_data.Price.str.replace("[^0-9]", "", regex=True)

ValueError: invalid literal for int() with base 10: ''