In [24]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import sleep
import re
from tqdm import tqdm
from random import uniform

#-------------------------PREPARATION-----------------------------#

def prepare_url(payment, location):
    
    base_url = f'https://www.mubawab.ma/en/ct/{location}/real-estate-for-{payment}'

    return base_url

def get_links(url, max_pages=20):
    prop_links = []
    
    page = 1  # Start from the first page
    while page <= max_pages:
        print(f'Scraping links from page {page}...')
        page_url = url + f':p:{page}'
        
        try:
            response = requests.get(page_url)
            response.raise_for_status()  # Raise an error for bad status codes
            
            soup = BeautifulSoup(response.content, 'html.parser')
            listings = soup.find_all('h2', class_='listingTit')

            # If no listings are found, break the loop
            if not listings:
                print(f"No listings found on page {page}. Stopping pagination.")
                break

            for listing in listings:
                try:
                    link_tag = listing.find('a')
                    if link_tag and 'href' in link_tag.attrs:
                        link = link_tag['href']
                        prop_links.append(link)
                    else:
                        print(f"Link not found in listing: {listing}")
                
                except AttributeError as e:
                    print(f"Error finding a link: {e}")

            # Check if there is a "Next" page
            next_page = soup.find('a', class_='arrowDot')
            if not next_page:
                print("No 'Next' page found. Stopping pagination.")
                break
        
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred on page {page}: {http_err}")
            break  # Stop the loop if we hit an HTTP error (like 404)
        
        except requests.RequestException as req_e:
            print(f"Request error on page {page}: {req_e}")
            break  # Stop the loop for other request issues

        print('Sleeping for a bit...')
        sleep(uniform(1, 3))  # Random sleep between 1 and 3 seconds

        # Increment page counter if there is a next page
        page += 1

    return prop_links

#-------------------------FEATURES--------------------------------#

def get_details(links):
    full_list = []
    for counter, link in enumerate(tqdm(links, desc="Fetching property details")):
        try:
            counter += 1
            response = requests.get(link)
            soup = BeautifulSoup(response.content, 'html.parser')

            price = soup.find('h3', class_='orangeTit').text.strip()
            
            area_text = soup.find('h3', class_='greyTit').text.strip()
            pattern = r'^(.*)\sin\s(.*)$'

            # Use re.search to match both the area and the city
            match = re.search(pattern, area_text)

            if match:
                area = match.group(1).strip()  # First group: Area
                city = match.group(2).strip()  # Second group: City
            
            title = soup.find('h1', class_='searchTitle').text.strip()
            
            div_block = soup.find('div', class_='blockProp')
            if div_block:
                p_tag = div_block.find('p')
                if p_tag:
                    text_content = p_tag.get_text(separator=" ").strip()
            
            description = soup.find_all('p', class_='adMainFeatureContentValue')

            description_titles = ['Property Type', 'Condition', 'Age', 'Floor']
            descriptor_list = [desc.text.strip() for desc in description]

            desc_dict = dict(zip(description_titles, descriptor_list))
            
            size, rooms, bedrooms, bathrooms = None, None, None, None

            details = soup.find_all('div', class_='adDetailFeature')

            for detail in details:
                # Check for size (since it's the first one with 'm²')
                if 'm²' in detail.text:
                    size = detail.find('span').text.strip().replace('m²', '').strip()
                
                # Check for number of pieces
                if 'Pieces' in detail.text:
                    rooms = detail.find('span').text.strip().replace('Pieces', '').strip()
                
                # Check for number of rooms
                if 'Rooms' in detail.text:
                    bedrooms = detail.find('span').text.strip().replace('Rooms', '').strip()
                
                # Check for number of bathrooms
                if 'Bathrooms' in detail.text:
                    bathrooms = detail.find('span').text.strip().replace('Bathrooms', '').strip()
                    
            features = soup.find_all('span', class_='fSize11 centered')
            feature_list = [feature.text.strip() for feature in features]   
            feature_str = ', '.join(feature_list)
                     
            property_details = {
                                'Title': title,
                                'Description': text_content,
                                'City' : city, 
                                'Area': area, 
                                'Size': size, 
                                'Rooms': rooms, 
                                'Bedrooms': bedrooms, 
                                'Bathrooms': bathrooms, 
                                'Price': price,
                                'Features': feature_str
                                }
            
            all_details = {**property_details, **desc_dict}
            
            full_list.append(all_details)
            
            sleep(uniform(1, 3))
            
        except requests.RequestException as req_e:
            print(f'Error fetching property data: {req_e}')
        except AttributeError as attr_e:
            print(f'Missing element in the page: {attr_e}')
            
    return pd.DataFrame(full_list)

In [25]:
cities = ['casablanca']

def scrape_property_data(payment, city):
    base_url = prepare_url(payment, city)
    prop_links = get_links(base_url, max_pages=1)
    df = get_details(prop_links)
    
    return df

all_dfs = []

for city in cities:
    print(f'Scraping data for {city.title()}')
    city_df = scrape_property_data('rent', city)
    all_dfs.append(city_df)
    
all_property_data = pd.concat(all_dfs, ignore_index=True)
    

Scraping data for Casablanca
Scraping links from page 1...
Link not found in listing: <h2 class="listingTit">Kanzi Subdivision</h2>
Sleeping for a bit...


Fetching property details: 100%|██████████| 33/33 [01:20<00:00,  2.44s/it]


In [26]:
all_property_data

Unnamed: 0,Title,Description,City,Area,Size,Rooms,Bedrooms,Bathrooms,Price,Features,Property Type,Condition,Age,Floor,Orientation
0,Apartment for rent in Ain Diab Extension. 4 ro...,Great amenities with this apartment for rent. ...,Casablanca,Ain Diab Extension,170,4.0,3.0,3.0,"17,000 DH\n\t\t\t\t\t\t\t\t\t\t\t\t\n\n\t\t\t\...","Garden, Terrace, Garage, Elevator, Sea views, ...",Apartment,New,Less than 1 year,Marble,South
1,Great apartment for rent in Gauthier. Large ar...,Find out about this apartment for rent. Price ...,Casablanca,Gauthier,51,2.0,,,"8,500 DH","Terrace, Garage, Elevator, Concierge, Furnishe...",Apartment,New,3rd,,
2,Apartment for rent in Racine. 4 comfortable ro...,"Rent your dream apartment. Price 15,000 DH. 4 ...",Casablanca,Racine,205,4.0,3.0,3.0,"15,000 DH",,Apartment,Good condition,,,
3,Great apartment for rent in Racine. 3 lovely r...,Find out about this apartment for rent. Price ...,Casablanca,Racine,120,4.0,2.0,2.0,"12,500 DH","Terrace, Garage, Elevator, Concierge, Exterior...",Apartment,New,1-5 years old,Marble,West
4,Very nice apartment for rent in Mers Sultan. 1...,Do not miss out on this apartment for rent. Pr...,Casablanca,Mers Sultan,50,,,,"4,500 DH","Terrace, Garage, Elevator, Furnished, Exterior...",Apartment,Good condition,,,
5,Apartment for rent in Gauthier. 2 lovely rooms...,"Beautifull apartment for rent. Price 9,000 DH....",Casablanca,Gauthier,75,2.0,2.0,2.0,"9,000 DH","Garage, Elevator, Furnished, Air conditioning,...",Apartment,Good condition,,,
6,Find an apartment for rent in Gauthier. 3 love...,"Ideal apartment for rent. Price 14,000 DH. 3 r...",Casablanca,Gauthier,250,3.0,3.0,,"14,000 DH","Garage, Fireplace",Apartment,Good condition,,,
7,Very nice apartment for rent in Bourgogne Oues...,Find out about this apartment for rent. Price ...,Casablanca,Bourgogne Ouest,75,2.0,2.0,2.0,"7,500 DH","Garden, Terrace, Garage, Elevator, Fireplace, ...",Apartment,Good condition,,,
8,Find an apartment for rent in Beauséjour. Area...,Great amenities with this apartment for rent. ...,Casablanca,Beauséjour,127,3.0,2.0,2.0,"10,000 DH","Terrace, Garage, Elevator, Concierge, Box room...",Apartment,New,,,
9,Apartment for rent in Racine. Small area 213 m...,"Beautifull apartment for rent. Price 20,000 DH...",Casablanca,Racine,213,5.0,4.0,3.0,"20,000 DH","Terrace, Garage, Elevator, Concierge, Box room...",Apartment,New,5th,,


In [152]:
all_property_data.to_csv('property_data.csv')

In [164]:
all_property_data.Price.str.replace("[^0-9]", "", regex=True)

ValueError: invalid literal for int() with base 10: ''

In [46]:
from scraper import prepare_url, get_links, get_details
import logging
import pandas as pd

def main():
    try:
        url = prepare_url('rent', 'casablanca')
        links = get_links(url, max_pages=1)
        if links:
            properties = get_details(links[:5])
            df = pd.DataFrame(properties)
            df.to_csv('properties.csv')
        else:
            logging.info("No new properties found.")
    except Exception as e:
        logging.error(f'An error occurred: {e}')

if __name__ == '__main__':
    main()

ERROR:root:An error occurred: get_details() missing 1 required positional argument: 'cursor'


In [43]:
pd.read_csv('properties.csv').iloc[0]['url']

'https://www.mubawab.ma/en/a/7517328/flat-for-rent-in-oasis-2-rooms-with-lift-and-terrace-'