##### Multi-Category Robust Scraper

- Fetch categories from main page
- Target: Concrete Work, Finishing, Roofing, Painting (relevant to interiors/pricing platform)
- Reuse improved parser 
- Combine data + add category column
- Save: per-category CSVs + a date-stamped master file (master_prices_raw_YYYYMMDD.csv)

#### Imports & Reuse Functions

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from pathlib import Path
from datetime import datetime

# Browser-like request headers sent with every page fetch.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
_ACCEPT = (
    "text/html,application/xhtml+xml,application/xml;q=0.9,"
    "image/webp,*/*;q=0.8"
)
HEADERS = {"User-Agent": _USER_AGENT, "Accept": _ACCEPT}

# Landing page that lists the price categories.
BASE_URL = "https://con.2merkato.com/prices"

# Output folder for the CSV exports; created up front if it does not exist.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

def fetch_page(url):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The request is sent with the module-level ``HEADERS`` and a 15 second
    timeout. Any failure (connection problem, HTTP error status, parsing
    error) is reported on stdout and turned into a ``None`` return value, so
    callers must check the result before using it.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=15)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        resp.raise_for_status()
        parsed = BeautifulSoup(resp.text, 'html.parser')
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
        parsed = None
    return parsed

# Reuse parser function (enhanced with category param)
def parse_category_table(soup, category_name, category_url):
    """Extract the price rows of one category page into a DataFrame.

    Args:
        soup: Parsed category page (as returned by ``fetch_page``), or
            ``None`` when the fetch failed.
        category_name: Label stored in the ``category`` column of every row;
            also used in the progress messages.
        category_url: URL of the category page. Not used by the parser; kept
            in the signature so existing callers keep working.

    Returns:
        A DataFrame with the columns ``category``, ``material``,
        ``price_text``, ``price_etb``, ``unit``, ``last_checked``,
        ``detail_url`` and ``scraped_at``. An empty DataFrame is returned
        when there is no page, no table, or no usable row.
    """
    if soup is None:
        return pd.DataFrame()

    table = soup.find('table')
    if table is None:
        print(f"No table found in {category_name}")
        return pd.DataFrame()

    # The amount may come with or without decimals, and the unit may be
    # missing; the site renders both "Brper m" and "Br per m".
    price_pattern = re.compile(r'(\d[\d,]*(?:\.\d+)?)\s*Br\s*(?:per\s*)?(\w+)?')
    site_root = 'https://con.2merkato.com'
    # One timestamp per page so all rows of a scrape share the same value.
    scraped_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    items = []
    for row in table.find_all('tr')[1:]:  # first row is the header
        cells = row.find_all('td')
        # The last-checked date is read from the fifth cell.
        if len(cells) < 5:
            continue

        name_link = cells[0].find('a')
        if name_link is None:
            continue

        # .get() instead of ['href']: an anchor without href must not abort
        # the whole page with a KeyError.
        href = name_link.get('href')
        if href is None:
            detail_url = None
        elif href.startswith('/'):
            detail_url = site_root + href
        else:
            detail_url = href

        price_text = cells[1].get_text(strip=True)
        price_match = price_pattern.search(price_text)
        if price_match:
            # Drop thousands separators before converting ("1,335.00").
            price_etb = float(price_match.group(1).replace(',', ''))
            unit = price_match.group(2)
        else:
            price_etb = None
            unit = None

        items.append({
            'category': category_name,
            'material': name_link.get_text(strip=True),
            'price_text': price_text,
            'price_etb': price_etb,
            'unit': unit,
            'last_checked': cells[4].get_text(strip=True),
            'detail_url': detail_url,
            'scraped_at': scraped_at,
        })

    df = pd.DataFrame(items)
    print(f"Extracted {len(df)} items from {category_name}")
    return df

#### Get Categories & Loop

##### Fetch & Select Categories
Categories are read from the main page; we then scrape only the ones relevant to the pricing platform.

In [2]:
# --- 1. Discover the category links on the landing page ---------------------
soup_main = fetch_page(BASE_URL)
categories = []
seen_urls = set()

if soup_main:
    # Category links sit inside heading / bold elements on the rendered page.
    for header in soup_main.find_all(['h5', 'h4', 'h3', 'strong']):
        link = header.find('a')
        # .get() avoids a KeyError on anchors that carry no href attribute.
        href = link.get('href') if link else None
        if not href or '/prices/cat/' not in href:
            continue
        url = 'https://con.2merkato.com' + href if href.startswith('/') else href
        # A link nested in several matched elements (e.g. <h5><strong><a>)
        # would otherwise be listed, and later scraped, more than once.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        categories.append({'name': link.get_text(strip=True), 'url': url})

print(f"Found {len(categories)} categories")
# Explicit columns keep the frame well-formed even when nothing was found.
df_categories = pd.DataFrame(categories, columns=['name', 'url'])
display(df_categories)

# --- 2. Scrape only the categories relevant to the pricing platform ---------
target_categories = [
    "Concrete Work",
    "Finishing",
    "Roofing",
    "Painting"
]

# Names are matched exactly, so report any target the page did not offer
# instead of skipping it silently.
found_names = {entry['name'] for entry in categories}
missing_targets = [name for name in target_categories if name not in found_names]
if missing_targets:
    print(f"Target categories not found on main page: {missing_targets}")

all_data = []
for cat in df_categories.itertuples():
    if cat.name not in target_categories:
        continue
    print(f"\nScraping {cat.name}...")
    soup_cat = fetch_page(cat.url)
    df_cat = parse_category_table(soup_cat, cat.name, cat.url)
    if not df_cat.empty:
        all_data.append(df_cat)
    time.sleep(3)  # polite delay between categories

# --- 3. Combine and export --------------------------------------------------
if all_data:
    master_df = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal extracted: {len(master_df)} items across categories")
    display(master_df.head(10))

    # Date-stamped master file
    today = datetime.now().strftime('%Y%m%d')
    master_df.to_csv(DATA_DIR / f"master_prices_raw_{today}.csv", index=False)
    print(f"Saved master to data/master_prices_raw_{today}.csv")

    # Per-category files
    for df_cat in all_data:
        # Collapse every run of non-word characters, not just spaces: a name
        # such as "Block Work/ Walling" would otherwise inject a path
        # separator into the file name.
        cat_name_safe = re.sub(r'\W+', '_', df_cat['category'].iloc[0]).strip('_').lower()
        df_cat.to_csv(DATA_DIR / f"{cat_name_safe}_prices_raw_{today}.csv", index=False)

Found 12 categories


Unnamed: 0,name,url
0,Sanitary,https://con.2merkato.com/prices/cat/12
1,Electrical,https://con.2merkato.com/prices/cat/11
2,Roofing,https://con.2merkato.com/prices/cat/6
3,Finishing,https://con.2merkato.com/prices/cat/8
4,Concrete Work,https://con.2merkato.com/prices/cat/2
5,Carpentry and Joinery,https://con.2merkato.com/prices/cat/5
6,Metal Work,https://con.2merkato.com/prices/cat/7
7,Block Work/ Walling,https://con.2merkato.com/prices/cat/4
8,Glazing,https://con.2merkato.com/prices/cat/9
9,Excavation and Earth Works,https://con.2merkato.com/prices/cat/1



Scraping Roofing...
Extracted 72 items from Roofing

Scraping Finishing...
Extracted 21 items from Finishing

Scraping Concrete Work...
Extracted 35 items from Concrete Work

Scraping Painting...
Extracted 49 items from Painting

Total extracted: 177 items across categories


Unnamed: 0,category,material,price_text,price_etb,unit,last_checked,detail_url,scraped_at
0,Roofing,Gutter-G-28 Galvanized Steel Recta. Downpipe D...,420.00 Brper m,420.0,m,"May 29, 2024",https://con.2merkato.com/prices/material/6/81,2026-02-18 10:51:34
1,Roofing,Gutter-G-28 Galvanized Steel Recta. Downpipe D...,575.00 Brper m,575.0,m,"May 29, 2024",https://con.2merkato.com/prices/material/6/82,2026-02-18 10:51:34
2,Roofing,Gutter-G-28 Galvanized Iron Sheet - Gutter of ...,420.00 Brper m,420.0,m,"May 29, 2024",https://con.2merkato.com/prices/material/6/84,2026-02-18 10:51:34
3,Roofing,Gutter-G-28 Galvanized Iron Sheet - Gutter of ...,889.00 Brper m,889.0,m,"Mar 1, 2024",https://con.2merkato.com/prices/material/6/85,2026-02-18 10:51:34
4,Roofing,Gutter-G-28 Galvanized Iron Sheet - Gutter of ...,744.00 Brper m,744.0,m,"May 29, 2024",https://con.2merkato.com/prices/material/6/86,2026-02-18 10:51:34
5,Roofing,Gutter-G-28 Galvanized Iron Sheet - Gutter of ...,993.00 Brper m,993.0,m,"May 29, 2024",https://con.2merkato.com/prices/material/6/87,2026-02-18 10:51:34
6,Roofing,15 cm Wide Faciaboard,600.00 Brper m,600.0,m,"May 29, 2024",https://con.2merkato.com/prices/material/6/88,2026-02-18 10:51:34
7,Roofing,20 cm. Wide Faciaboard,650.00 Brper m,650.0,m,"May 29, 2024",https://con.2merkato.com/prices/material/6/89,2026-02-18 10:51:34
8,Roofing,G28 Corrugated Galvanized Iron Sheet - Akaki,"1,335.00 Brper pcs",1335.0,pcs,"Mar 1, 2024",https://con.2merkato.com/prices/material/6/90,2026-02-18 10:51:34
9,Roofing,G28 Corrugated Galvanized Iron Sheet - KOSPI,"1,250.00 Brper pcs",1250.0,pcs,"Mar 1, 2024",https://con.2merkato.com/prices/material/6/91,2026-02-18 10:51:34


Saved master to data/master_prices_raw_20260218.csv


#### Summary & Insights
- Total items: 177 across 4 categories
- Strong coverage: Roofing (72), Painting (49), Concrete (35), Finishing (21)
- Data quality: Numeric prices, units extracted, timestamps added
- Note: Many last_checked dates are from around May 2024; consider flagging entries older than one year as outdated in the cleaning phase
- Portfolio value: Demonstrates multi-page scraping, data aggregation, polite delays

In [3]:
# Summary of the combined data; skipped when the scraping cell produced no
# master_df (e.g. every fetch failed).
if 'master_df' in globals():
    print("Master DataFrame Info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() only added a stray "None" line to the output.
    master_df.info()

    # Count per category
    print("\nItems per category:")
    print(master_df['category'].value_counts())

    # Outdated items: last_checked before 2025. Unparseable dates become NaT
    # and are therefore never counted as outdated.
    cutoff = pd.Timestamp('2025-01-01')
    master_df['last_checked_dt'] = pd.to_datetime(master_df['last_checked'], errors='coerce')
    outdated = master_df[master_df['last_checked_dt'] < cutoff]
    # Guard the percentage against an empty frame.
    outdated_pct = len(outdated) / len(master_df) * 100 if len(master_df) else 0.0
    print(f"\nOutdated items (<2025): {len(outdated)} / {len(master_df)} ({outdated_pct:.1f}%)")

    # Save with the extra datetime column
    master_df.to_csv(DATA_DIR / "master_prices_raw_with_flags.csv", index=False)
    print("Saved with datetime & potential outdated flag prep")

Master DataFrame Info:
<class 'pandas.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   category      177 non-null    str    
 1   material      177 non-null    str    
 2   price_text    177 non-null    str    
 3   price_etb     177 non-null    float64
 4   unit          177 non-null    str    
 5   last_checked  177 non-null    str    
 6   detail_url    177 non-null    str    
 7   scraped_at    177 non-null    str    
dtypes: float64(1), str(7)
memory usage: 11.2 KB
None

Items per category:
category
Roofing          72
Painting         49
Concrete Work    35
Finishing        21
Name: count, dtype: int64

Outdated items (<2025): 74 / 177 (41.8%)
Saved with datetime & potential outdated flag prep
