#### 1. Imports & Setup

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm.notebook import tqdm
import re
from pathlib import Path
from datetime import datetime

# Output folder for scraped CSV files, one level above the working directory.
# Created if missing; an already-existing folder is left as it is.
DATA_DIR = Path("..") / "data"
DATA_DIR.mkdir(exist_ok=True)

# HTTP headers attached to page requests: a desktop-Chrome User-Agent string
# and an English Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_page(url, delay=3.0, timeout=20):
    """Download `url` and return it parsed as a BeautifulSoup document.

    Sleeps for `delay` seconds first, then issues a GET with the module-level
    HEADERS and the given `timeout`. Any exception raised along the way
    (including a non-2xx status via raise_for_status) is printed and turned
    into a `None` return value instead of propagating.

    Args:
        url: Address of the page to fetch.
        delay: Seconds to wait before sending the request.
        timeout: Timeout in seconds passed to `requests.get`.

    Returns:
        The parsed BeautifulSoup object, or None if anything failed.
    """
    parsed = None
    try:
        # Throttle before every request.
        time.sleep(delay)
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        response.raise_for_status()
        parsed = BeautifulSoup(response.text, "html.parser")
    except Exception as err:
        print(f"Error fetching {url}: {err}")
    return parsed

#### 2. Configuration (category IDs checked in 2026 – IDs 13–15 returned 404 in the recorded run)

In [2]:
BASE_URL = "https://con.2merkato.com"

# (display name, numeric id used in the /prices/cat/<id> path)
# NOTE(review): in the run recorded further down this notebook, ids 13, 14
# and 15 each came back as HTTP 404 — re-verify them against the site.
_CATEGORY_SPECS = (
    ("Finishing", 9),
    ("Sanitary", 10),
    ("Electrical", 11),
    ("Tiles & Ceramics", 12),
    ("Wood & Carpentry", 13),
    ("Metal Works", 14),
    ("Glass & Glazing", 15),
    ("Roofing & Ceiling", 6),
)

# One dict per category; the listing URL is derived from the id.
CATEGORIES = [
    {"name": label, "cat_id": cid, "url": f"{BASE_URL}/prices/cat/{cid}"}
    for label, cid in _CATEGORY_SPECS
]

# Upper bound on table rows taken from a single category page.
MAX_ITEMS_PER_CATEGORY = 100
# Seconds to wait before each page request.
DELAY_BETWEEN_REQUESTS = 4.0

# Echo the plan so the reader can check it before the scrape starts.
print("Scraping plan:")
for cat in CATEGORIES:
    print(f"  • {cat['name']} → {cat['url']}")
print(f"Max items per category: {MAX_ITEMS_PER_CATEGORY}")
print(f"Total potential items: up to {MAX_ITEMS_PER_CATEGORY * len(CATEGORIES)}")

Scraping plan:
  • Finishing → https://con.2merkato.com/prices/cat/9
  • Sanitary → https://con.2merkato.com/prices/cat/10
  • Electrical → https://con.2merkato.com/prices/cat/11
  • Tiles & Ceramics → https://con.2merkato.com/prices/cat/12
  • Wood & Carpentry → https://con.2merkato.com/prices/cat/13
  • Metal Works → https://con.2merkato.com/prices/cat/14
  • Glass & Glazing → https://con.2merkato.com/prices/cat/15
  • Roofing & Ceiling → https://con.2merkato.com/prices/cat/6
Max items per category: 100
Total potential items: up to 800


#### 3. Scrape function for 2merkato table structure

In [3]:
def _parse_price(price_text):
    """Split a price cell such as "1,202.00 Brper m2" into (amount, unit).

    Returns (None, None) when the text has no "Br" marker or the part before
    it is not a number.
    """
    amount_text, marker, remainder = price_text.partition("Br")
    if not marker:
        return None, None
    try:
        price_etb = float(amount_text.replace(",", "").strip())
    except ValueError:
        return None, None
    # partition() keeps everything after the first "Br", so a unit that itself
    # contains "Br" is not truncated.
    remainder = remainder.strip()
    unit_match = re.search(r"per\s*(\w+)", remainder, re.IGNORECASE)
    return price_etb, (unit_match.group(1) if unit_match else remainder)


def _parse_row(cells, cat_name):
    """Build one item record from the <td> cells of a table row (needs >= 5 cells)."""
    name_cell = cells[0]
    name_link = name_cell.find("a")
    name_source = name_link if name_link is not None else name_cell
    material_name = name_source.get_text(strip=True)

    # .get() instead of ["href"]: an <a> without an href must not drop the row.
    href = name_link.get("href", "") if name_link is not None else ""
    detail_url = BASE_URL + href if href.startswith("/") else ""

    price_text = cells[1].get_text(strip=True)
    price_etb, unit = _parse_price(price_text)

    return {
        "material_name": material_name,
        "price_etb": price_etb,
        "price_text": price_text,
        "unit": unit,
        "last_checked": cells[4].get_text(strip=True),
        "detail_url": detail_url,
        "category": cat_name,
        "source": "2merkato.com",
        "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }


def scrape_2merkato_category(cat_name, cat_url, max_items=None, delay=None):
    """Scrape the first table on a 2merkato category page into a list of dicts.

    Args:
        cat_name: Category label; used for progress output and stored in the
            "category" field of every record.
        cat_url: URL of the category listing page.
        max_items: Maximum number of table rows to process. Defaults to the
            notebook-level MAX_ITEMS_PER_CATEGORY.
        delay: Seconds passed to fetch_page as its pre-request delay. Defaults
            to the notebook-level DELAY_BETWEEN_REQUESTS.

    Returns:
        A list of record dicts (material_name, price_etb, price_text, unit,
        last_checked, detail_url, category, source, scraped_at). Empty when
        the page cannot be fetched or contains no <table>.
    """
    if max_items is None:
        max_items = MAX_ITEMS_PER_CATEGORY
    if delay is None:
        delay = DELAY_BETWEEN_REQUESTS

    print(f"\nScraping {cat_name} from 2merkato.com")
    print(f"URL: {cat_url}")

    soup = fetch_page(cat_url, delay=delay)
    if soup is None:
        return []

    table = soup.find("table")
    if table is None:
        print("No table found on page.")
        return []

    rows = table.find_all("tr")[1:]  # first <tr> is treated as the header
    cat_items = []

    for row in tqdm(rows[:max_items], desc=cat_name):
        cells = row.find_all("td")
        if len(cells) < 5:
            # Name is read from cell 0, price from cell 1, date from cell 4.
            continue
        try:
            cat_items.append(_parse_row(cells, cat_name))
        except Exception as exc:
            # Best effort: report the bad row and keep going, but let
            # KeyboardInterrupt / SystemExit through (a bare except would not).
            print(f"Skipping unparseable row in {cat_name}: {exc}")

    print(f"→ Collected {len(cat_items)} items from {cat_name}")
    return cat_items

#### 4. Run scraping

In [4]:
all_items = []

# NOTE(review): this loop mirrors scrape_2merkato_category defined earlier in
# the notebook; consider calling that function here so only one copy of the
# row-parsing logic has to be maintained.
for cat in tqdm(CATEGORIES, desc="Categories"):
    cat_name = cat["name"]
    cat_url = cat["url"]

    print(f"\nScraping {cat_name}...")

    # fetch_page sleeps for `delay` seconds before every request, so it is the
    # single throttle between categories (no extra hard-coded sleep needed).
    soup = fetch_page(cat_url, delay=DELAY_BETWEEN_REQUESTS)
    if soup is None:
        print(f"  → Failed to load {cat_name}")
        continue

    table = soup.find("table")
    if table is None:
        print(f"  → No table found for {cat_name}")
        continue

    rows = table.find_all("tr")[1:]  # skip header
    cat_items = []

    for row in tqdm(rows[:MAX_ITEMS_PER_CATEGORY], desc=cat_name, leave=False):
        cells = row.find_all("td")
        if len(cells) < 5:
            # Name is read from cell 0, price from cell 1, date from cell 4.
            continue
        try:
            name_cell = cells[0]
            name_link = name_cell.find("a")
            name_source = name_link if name_link is not None else name_cell
            material_name = name_source.get_text(strip=True)

            # .get() instead of ["href"]: an <a> without an href must not
            # drop the whole row.
            # NOTE(review): the sample rows displayed in the recorded run all
            # have an empty detail_url — hrefs may be absolute or missing;
            # confirm against the live page.
            href = name_link.get("href", "") if name_link is not None else ""
            detail_url = BASE_URL + href if href.startswith("/") else ""

            price_text = cells[1].get_text(strip=True)
            price_etb = None
            unit = None
            # partition() keeps everything after the first "Br", so a unit
            # that itself contains "Br" is not truncated.
            amount_text, marker, remainder = price_text.partition("Br")
            if marker:
                try:
                    price_etb = float(amount_text.replace(",", "").strip())
                except ValueError:
                    # Non-numeric amount: keep price_text, leave price/unit empty.
                    pass
                else:
                    remainder = remainder.strip()
                    unit_match = re.search(r"per\s*(\w+)", remainder, re.IGNORECASE)
                    unit = unit_match.group(1) if unit_match else remainder

            cat_items.append({
                "material_name": material_name,
                "price_etb": price_etb,
                "price_text": price_text,
                "unit": unit,
                "last_checked": cells[4].get_text(strip=True),
                "detail_url": detail_url,
                "category": cat_name,
                "source": "2merkato.com",
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })
        except Exception as exc:
            # Best effort: report the bad row and keep going, but let
            # KeyboardInterrupt / SystemExit through (a bare except would not).
            print(f"  → Skipping unparseable row in {cat_name}: {exc}")
            continue

    print(f"  → Collected {len(cat_items)} items from {cat_name}")
    all_items.extend(cat_items)

# Save results
if all_items:
    df = pd.DataFrame(all_items)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    output_file = DATA_DIR / f"ethio_interior_raw_2merkato_{timestamp}.csv"
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(output_file, index=False, encoding="utf-8-sig")
    print("\n=== Final Results ===")
    print(f"Saved {len(df)} items total to {output_file}")

    display(df.head(12))
    print("\nCategory distribution:")
    print(df["category"].value_counts())
    print("\nPrice stats (ETB):")
    print(df["price_etb"].describe())
else:
    print("\nNo items collected from any category. Check connection or site structure.")

Categories:   0%|          | 0/8 [00:00<?, ?it/s]


Scraping Finishing...


Finishing:   0%|          | 0/22 [00:00<?, ?it/s]

  → Collected 21 items from Finishing

Scraping Sanitary...


Sanitary:   0%|          | 0/50 [00:00<?, ?it/s]

  → Collected 49 items from Sanitary

Scraping Electrical...


Electrical:   0%|          | 0/64 [00:00<?, ?it/s]

  → Collected 63 items from Electrical

Scraping Tiles & Ceramics...


Tiles & Ceramics:   0%|          | 0/95 [00:00<?, ?it/s]

  → Collected 94 items from Tiles & Ceramics

Scraping Wood & Carpentry...
Error fetching https://con.2merkato.com/prices/cat/13: 404 Client Error: Not Found for url: https://con.2merkato.com/prices/cat/13
  → Failed to load Wood & Carpentry

Scraping Metal Works...
Error fetching https://con.2merkato.com/prices/cat/14: 404 Client Error: Not Found for url: https://con.2merkato.com/prices/cat/14
  → Failed to load Metal Works

Scraping Glass & Glazing...
Error fetching https://con.2merkato.com/prices/cat/15: 404 Client Error: Not Found for url: https://con.2merkato.com/prices/cat/15
  → Failed to load Glass & Glazing

Scraping Roofing & Ceiling...


Roofing & Ceiling:   0%|          | 0/73 [00:00<?, ?it/s]

  → Collected 72 items from Roofing & Ceiling

=== Final Results ===
Saved 299 items total to ..\data\ethio_interior_raw_2merkato_20260219_1644.csv


Unnamed: 0,material_name,price_etb,price_text,unit,last_checked,detail_url,category,source,scraped_at
0,Clear Glass - 3mm thick,1202.0,"1,202.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
1,Clear Glass - 4mm thick,1537.0,"1,537.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
2,Clear Glass - 5mm thick,1767.0,"1,767.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
3,Clear Glass - 6mm thick,2495.0,"2,495.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
4,Figured Glass - 4mm thick,1776.0,"1,776.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
5,Clear and Colored Glass: 5mm,1900.0,"1,900.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
6,Clear and Colored Glass: 6mm,1350.0,"1,350.00 Brper m2",m2,"Dec 1, 2023",,Finishing,2merkato.com,2026-02-19 16:44:00
7,Figured Glass - 5mm thick,1919.0,"1,919.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
8,Frosted Glass - 3mm thick,1967.0,"1,967.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00
9,Frosted Glass - 4mm thick,1120.0,"1,120.00 Brper m2",m2,"May 29, 2024",,Finishing,2merkato.com,2026-02-19 16:44:00



Category distribution:
category
Tiles & Ceramics     94
Roofing & Ceiling    72
Electrical           63
Sanitary             49
Finishing            21
Name: count, dtype: int64

Price stats (ETB):
count       299.000000
mean       3559.063880
std       14449.810958
min          15.000000
25%         350.000000
50%         815.000000
75%        1794.000000
max      146500.000000
Name: price_etb, dtype: float64
