In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import random
import re
from fake_useragent import UserAgent
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import os

# Configuration: scraping target, dataset size limits, output locations and
# request pacing used by the functions below.
BASE_URL = "https://www.mubawab.ma"
# First results page; scrape_page appends ":p:<n>" to it for pages after the first.
SEARCH_URL = f"{BASE_URL}/fr/ct/tanger/immobilier-a-vendre"
# main() truncates the collected listings to at most MAX_LISTINGS ...
MAX_LISTINGS = 200
# ... and stops requesting pages as soon as MIN_LISTINGS have been collected.
MIN_LISTINGS = 150
OUTPUT_FILE = "mubawab_properties_150_to_200_rows.csv"
# Directory that receives the PNG plots.
PLOT_DIR = "eda_plots"
# NOTE: random.uniform is evaluated once, when this line runs, so DELAY is a
# single fixed base value for the whole run (main() scales it by page number);
# it is not re-drawn for every request.
DELAY = random.uniform(5, 10)
# Per-request timeout passed to requests.get, in seconds.
TIMEOUT = 20

# User-agent generator; get_headers() reads ua.random for each set of headers.
ua = UserAgent()

def get_headers():
    """Build the HTTP headers for one scraping request.

    A fresh ``User-Agent`` is read from ``ua.random`` on every call; all
    other headers are constant.

    Returns:
        dict: header name -> header value, suitable for ``requests.get``.
    """
    return {
        'User-Agent': ua.random,
        'Accept-Language': 'fr-FR,fr;q=0.9',
        'Referer': BASE_URL,
        # The catch-all media range must be spelled "*/*"; a bare "/" is not
        # a valid media range.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # "br" is intentionally not advertised: Brotli bodies are only decoded
        # when an optional Brotli package is installed, otherwise
        # response.text would contain undecoded bytes.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'DNT': '1'
    }

def extract_price(text):
    """Extract the first numeric price from a free-form price string.

    All whitespace (including no-break and narrow no-break spaces) and
    commas are treated as thousands separators and removed. A number that
    contains more than one dot (e.g. ``"1.500.000"``) is treated as using
    dots as thousands separators; a single dot is kept as a decimal point.

    Args:
        text: Raw price text, e.g. ``"1 500 000 DH"``. May be ``None`` or
            empty.

    Returns:
        float | None: The parsed price, or ``None`` when ``text`` is empty
        or contains no digits.
    """
    if not text:
        return None

    # \s is Unicode-aware for str patterns, so this also strips \xa0 and
    # \u202f, which French-formatted numbers commonly use between groups.
    compact = re.sub(r'[\s,]', '', text)

    match = re.search(r'\d+(?:\.\d+)*', compact)
    if not match:
        return None

    number = match.group(0)
    if number.count('.') > 1:
        # Several dots cannot be a decimal number: they are group separators.
        number = number.replace('.', '')
    return float(number)

def parse_listing(listing):
    """Turn one listing card (a BeautifulSoup element) into a flat record.

    Extraction is best-effort: every field falls back to ``None`` when the
    corresponding markup or text pattern is not found.

    Args:
        listing: Parsed HTML element for a single listing card.

    Returns:
        dict | None: A record with the keys ``title``, ``price``,
        ``bedrooms``, ``surface``, ``location``, ``url``, ``type`` and
        ``scraped_at``; ``None`` when parsing raised or when neither a title
        nor a price could be extracted.
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urljoin

    result = {
        'title': None,
        'price': None,
        'bedrooms': None,
        'surface': None,
        'location': None,
        'url': None,
        'type': None,
        'scraped_at': pd.Timestamp.now()
    }

    try:
        title_elem = listing.find('h2', class_=re.compile('listingTit|title'))
        if not title_elem:
            title_elem = listing.find('h2') or listing.find(class_=re.compile('title|heading'))
        result['title'] = title_elem.get_text(strip=True) if title_elem else None

        price_elem = listing.find(class_=re.compile('priceTag|price|cost'))
        if price_elem:
            price_text = price_elem.get_text(strip=True)
        else:
            # find(string=...) yields a text node rather than a tag, so it is
            # converted with str() instead of relying on tag-only methods.
            price_string = listing.find(string=re.compile('DH|MAD|€|Dhs'))
            price_text = str(price_string).strip() if price_string else ""
        result['price'] = extract_price(price_text)

        link = listing.find('a', href=True)
        if link and link['href']:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones correctly whether or not they start with "/".
            result['url'] = urljoin(BASE_URL, link['href'])

        card_text = listing.get_text(' ')

        # Prefer an explicit bedroom count ("chambres"); only fall back to
        # the room count ("pièces", with or without the accent) when no
        # bedroom count is present.
        bedrooms = re.search(r'(\d+)\s*chambres?', card_text, re.IGNORECASE)
        if not bedrooms:
            bedrooms = re.search(r'(\d+)\s*pi[eè]ces?', card_text, re.IGNORECASE)
        result['bedrooms'] = int(bedrooms.group(1)) if bedrooms else None

        surface = re.search(r'(\d+)\s*(?:m²|m2|m\s*²)', card_text, re.IGNORECASE)
        result['surface'] = int(surface.group(1)) if surface else None

        if result['title'] and 'à' in result['title']:
            # Heuristic: whatever follows the last "à" in the title is taken
            # as the location.
            result['location'] = result['title'].split('à')[-1].strip()
        else:
            location_elem = listing.find(class_=re.compile('location|ville|place'))
            result['location'] = location_elem.get_text(strip=True) if location_elem else None

        if result['title']:
            title_lower = result['title'].lower()
            if 'appartement' in title_lower:
                result['type'] = 'Appartement'
            elif 'maison' in title_lower or 'villa' in title_lower:
                result['type'] = 'Maison'
            else:
                result['type'] = 'Autre'

    except Exception as e:
        # Deliberate best-effort: one malformed card must not abort the page.
        print(f"Error parsing listing: {e}")
        return None

    if result['title'] or result['price']:
        return result
    return None

def scrape_page(page_num):
    """Download one search-results page and parse its listing cards.

    Args:
        page_num: 1-based page index. Page 1 uses ``SEARCH_URL`` as is;
            later pages append ``:p:<page_num>``.

    Returns:
        list[dict] | None: The records returned by ``parse_listing`` for the
        cards that could be parsed (possibly empty), or ``None`` when the
        request failed, returned a non-200 status, or any other error
        occurred.
    """
    try:
        url = f"{SEARCH_URL}:p:{page_num}" if page_num > 1 else SEARCH_URL
        print(f"Requesting page {page_num}...")

        response = requests.get(url, headers=get_headers(), timeout=TIMEOUT)

        if response.status_code != 200:
            print(f"Request failed with status {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Progressively looser selectors: try the next one only when the
        # previous one found nothing.
        listings = soup.find_all('li', class_=re.compile('listingBox|propertyBox|listing'))
        if not listings:
            listings = soup.select('div.listingBox, div.propertyBox, section.property')
        if not listings:
            listings = soup.find_all(class_=re.compile('listing|property'))

        print(f"Found {len(listings)} listings on page {page_num}")

        # Parse each card exactly once; parsing twice would double the work
        # and print every parse error twice.
        parsed = []
        for card in listings:
            record = parse_listing(card)
            if record is not None:
                parsed.append(record)
        return parsed

    except requests.exceptions.RequestException as e:
        print(f"Request error on page {page_num}: {e}")
        return None
    except Exception as e:
        print(f"Error scraping page {page_num}: {e}")
        return None

def create_data_pipeline():
    """Build the (unfitted) preprocessing step for the price model.

    Numeric columns (``bedrooms``, ``surface``) are median-imputed and then
    standardised. Categorical columns (``location``, ``type``) have missing
    values replaced by the constant ``'missing'`` and are then one-hot
    encoded, ignoring categories not seen during fitting.

    Returns:
        ColumnTransformer: transformer with a ``'num'`` and a ``'cat'`` branch.
    """
    scale_numeric = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    encode_categorical = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    branches = [
        ('num', scale_numeric, ['bedrooms', 'surface']),
        ('cat', encode_categorical, ['location', 'type']),
    ]
    return ColumnTransformer(transformers=branches)

def handle_outliers(df, column, factor=2.0):
    """Drop rows whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``
    (inclusive). Rows where ``column`` is missing are kept: a missing value
    is not an outlier, and callers may still want to impute it or drop it
    explicitly later.

    Args:
        df: Input frame; it is not modified.
        column: Name of the numeric column to filter on.
        factor: Multiple of the IQR used for the fences. Defaults to 2.0.

    Returns:
        pandas.DataFrame: The rows of ``df`` that are inside the fences or
        have a missing value in ``column``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1

    # between() is inclusive on both ends and evaluates to False for NaN,
    # so missing values are re-admitted explicitly.
    within_fences = values.between(q1 - factor * spread, q3 + factor * spread)
    return df[within_fences | values.isna()]

def perform_eda(df):
    """Write the exploratory plots for ``df`` as PNG files into ``PLOT_DIR``.

    Six figures are produced: price histogram, price boxplot, price vs.
    surface scatter, correlation heatmap of the numeric columns, and price
    boxplots grouped by location and by type. ``PLOT_DIR`` is created when
    it does not exist.

    Args:
        df: Listings frame with the columns ``price``, ``surface``,
            ``bedrooms``, ``location`` and ``type``.
    """
    os.makedirs(PLOT_DIR, exist_ok=True)

    def _save_and_close(filename):
        # Persist the current figure, then release it.
        plt.savefig(f"{PLOT_DIR}/{filename}")
        plt.close()

    known_prices = df['price'].dropna()

    # Price distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(known_prices, kde=True)
    plt.title('Distribution des Prix')
    plt.xlabel('Prix (DH)')
    plt.ylabel('Fréquence')
    _save_and_close("price_distribution.png")

    # Price boxplot
    plt.figure(figsize=(10, 6))
    sns.boxplot(y=known_prices)
    plt.title('Boxplot des Prix')
    plt.ylabel('Prix (DH)')
    _save_and_close("price_boxplot.png")

    # Price against surface, coloured by type and sized by bedroom count
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='surface', y='price', hue='type', size='bedrooms', data=df)
    plt.title('Prix vs Surface')
    plt.xlabel('Surface (m²)')
    plt.ylabel('Prix (DH)')
    _save_and_close("price_vs_surface.png")

    # Correlation between the numeric columns
    plt.figure(figsize=(10, 8))
    correlations = df[['price', 'bedrooms', 'surface']].corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Matrice de Corrélation')
    _save_and_close("correlation_matrix.png")

    # Price by location
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='location', y='price', data=df)
    plt.title('Prix par Localisation')
    plt.xticks(rotation=45)
    _save_and_close("price_by_location.png")

    # Price by property type
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='type', y='price', data=df)
    plt.title('Prix par Type de Bien')
    plt.xlabel('Type')
    plt.ylabel('Prix (DH)')
    _save_and_close("price_by_type.png")

def train_model(X, y):
    """Fit and evaluate a linear-regression price model.

    The data is split 80/20 (fixed ``random_state=42``), a pipeline made of
    ``create_data_pipeline()`` followed by ``LinearRegression`` is fitted on
    the training part, and R², RMSE and MAE on the test part are printed.
    The regression coefficients are printed and saved as a bar plot in
    ``PLOT_DIR``.

    Args:
        X: Feature frame with the columns ``bedrooms``, ``surface``,
            ``location`` and ``type``.
        y: Target prices aligned with ``X``.

    Returns:
        Pipeline: the fitted preprocessing + regression pipeline.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model_pipeline = Pipeline([
        ('preprocessor', create_data_pipeline()),
        ('regressor', LinearRegression())
    ])

    model_pipeline.fit(X_train, y_train)

    y_pred = model_pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    print("\nModel Evaluation:")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.2f} DH")
    print(f"MAE: {mae:.2f} DH")

    preprocessor = model_pipeline.named_steps['preprocessor']
    regressor = model_pipeline.named_steps['regressor']
    # These lists must mirror the column order used in create_data_pipeline:
    # numeric columns first, then the one-hot expanded categorical columns.
    numeric_features = ['bedrooms', 'surface']
    categorical_features = ['location', 'type']
    feature_names = (numeric_features +
                     preprocessor.named_transformers_['cat']
                     .named_steps['onehot']
                     .get_feature_names_out(categorical_features).tolist())

    coefficients = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': regressor.coef_
    })
    print("\nFeature Importance (Regression Coefficients):")
    print(coefficients.sort_values(by='Coefficient', key=abs, ascending=False))

    # Do not rely on another function having created the plot directory.
    os.makedirs(PLOT_DIR, exist_ok=True)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Coefficient', y='Feature', data=coefficients)
    plt.title('Importance des variables (coefficients)')
    plt.xlabel('Coefficient')
    plt.ylabel('Feature')
    plt.savefig(f"{PLOT_DIR}/feature_importance_linear.png")
    plt.close()

    return model_pipeline

def main():
    """Run the whole workflow: scrape, clean, save, plot and model.

    Steps:
        1. Scrape result pages until ``MIN_LISTINGS`` records are collected,
           or give up once more than 10 pages were tried without reaching
           that number.
        2. Cap the data at ``MAX_LISTINGS``, remove duplicate URLs and price /
           surface outliers, and write the result to ``OUTPUT_FILE``.
        3. Produce the EDA plots and fit the regression model on the rows
           that have a price.
    """
    # train_test_split cannot split fewer than 2 rows; a few more are
    # required here so that the reported test metrics are not degenerate.
    min_model_rows = 10

    all_data = []
    page_num = 1

    while len(all_data) < MAX_LISTINGS:
        page_data = scrape_page(page_num)
        if page_data:
            all_data.extend(page_data)
            print(f"Page {page_num} scraped successfully. Total listings: {len(all_data)}")

            if len(all_data) >= MIN_LISTINGS:
                print(f"Reached target of {len(all_data)} listings (min {MIN_LISTINGS}). Stopping.")
                break
        else:
            print(f"Failed to scrape page {page_num}")

        # Decide whether to give up before sleeping, so the run does not
        # wait for a request that will never be sent.
        if page_num > 10 and len(all_data) < MIN_LISTINGS:
            print("Stopping - likely blocked or no more listings")
            break

        # Back off a little more on every page.
        wait_time = DELAY * (1 + page_num/10)
        print(f"Waiting {wait_time:.1f} seconds...")
        time.sleep(wait_time)

        page_num += 1

    if not all_data:
        print("FAILED - No data collected.")
        return

    if len(all_data) > MAX_LISTINGS:
        all_data = all_data[:MAX_LISTINGS]

    df = pd.DataFrame(all_data)
    print(f"Initial dataset size: {len(df)}")

    # Only rows that actually have a URL can be duplicates of each other;
    # rows without a URL must not be collapsed into a single row.
    repeated_url = df['url'].notna() & df.duplicated(subset=['url'], keep='first')
    df = df[~repeated_url]
    print(f"After removing duplicates: {len(df)}")

    df = df[df['title'].notna() | df['price'].notna()]
    print(f"After removing rows with missing title and price: {len(df)}")

    df = handle_outliers(df, 'price')
    print(f"After removing price outliers: {len(df)}")

    df = handle_outliers(df, 'surface')
    print(f"After removing surface outliers: {len(df)}")

    if df.empty:
        print("FAILED - No listings left after cleaning.")
        return

    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSuccess! Saved {len(df)} listings to {OUTPUT_FILE}")

    perform_eda(df)
    print(f"EDA plots saved in {PLOT_DIR}")

    features = ['bedrooms', 'surface', 'location', 'type']
    X = df[features]
    # The model needs a target: keep only rows with a known price.
    y = df['price'].dropna()
    X = X.loc[y.index]
    print(f"Dataset for modeling: {len(X)} rows")

    if len(X) < min_model_rows:
        print(f"Skipping modeling - need at least {min_model_rows} rows with a price.")
        return

    train_model(X, y)

    print("\nAnalysis complete!")

if __name__ == "__main__":
    main()

Requesting page 1...
Found 35 listings on page 1
Page 1 scraped successfully. Total listings: 34
Waiting 6.7 seconds...
Requesting page 2...
Found 5 listings on page 2
Page 2 scraped successfully. Total listings: 37
Waiting 7.3 seconds...
Requesting page 3...
Found 35 listings on page 3
Page 3 scraped successfully. Total listings: 71
Waiting 7.9 seconds...
Requesting page 4...
Found 5 listings on page 4
Page 4 scraped successfully. Total listings: 74
Waiting 8.6 seconds...
Requesting page 5...
Found 35 listings on page 5
Page 5 scraped successfully. Total listings: 108
Waiting 9.2 seconds...
Requesting page 6...
Found 5 listings on page 6
Page 6 scraped successfully. Total listings: 111
Waiting 9.8 seconds...
Requesting page 7...
Found 5 listings on page 7
Page 7 scraped successfully. Total listings: 114
Waiting 10.4 seconds...
Requesting page 8...
Found 35 listings on page 8
Page 8 scraped successfully. Total listings: 148
Waiting 11.0 seconds...
Requesting page 9...
Found 5 listings 