In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
import time

def scrape_wg_gesucht(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    listings = []
    
    # Find all listing cards
    cards = soup.find_all('div', class_='card_body')
    
    for card in cards:
        listing = {}
        
        # 1. Title
        title_tag = card.find('h3', class_='truncate_title')
        listing['title'] = title_tag.get_text(strip=True) if title_tag else "N/A"
        
        # 2. Status and location information
        status_location_tag = card.find('div', class_='col-xs-11')
        if status_location_tag:
            status_location_text = status_location_tag.get_text(strip=True)
            parts = [part.strip() for part in status_location_text.split('|')]
            
            # 2. Status
            listing['status'] = parts[0] if len(parts) > 0 else "N/A"
            
            # 3. Location in city
            listing['location_in_city'] = parts[1] if len(parts) > 1 else "N/A"
            
            # 4. Location street
            listing['location_street'] = parts[2] if len(parts) > 2 else "N/A"
        
        # 5. Price, 6. Size, 7-8. Duration
        middle_row = card.find('div', class_='middle')
        if middle_row:
            cols = middle_row.find_all('div', class_=re.compile('col-xs-'))
            
            # 5. Price
            if len(cols) > 0:
                price_text = cols[0].get_text(strip=True)
                listing['price'] = price_text if price_text else "N/A"
            
            # 7-8. Duration
            if len(cols) > 1:
                duration_text = cols[1].get_text(strip=True)
                if ' - ' in duration_text:
                    from_to = duration_text.split(' - ')
                    listing['duration_from'] = from_to[0] if from_to[0] else "N/A"
                    listing['duration_to'] = from_to[1] if len(from_to) > 1 and from_to[1] else "N/A"
                else:
                    listing['duration_from'] = duration_text
                    listing['duration_to'] = "N/A"
            
            # 6. Size
            if len(cols) > 2:
                size_text = cols[2].get_text(strip=True)
                listing['size'] = size_text if size_text else "N/A"
        
        # 9. Advertisement time and 10. Provider
        bottom_row = card.find('div', class_='bottom')
        if bottom_row:
            # 10. Provider name
            provider_tag = bottom_row.find('span', class_='ml5')
            listing['provider'] = provider_tag.get_text(strip=True) if provider_tag else "N/A"
            
            # Check for verified company
            verified_tag = bottom_row.find('a', class_='label_verified')
            if verified_tag:
                listing['provider'] += " (Verified company)"
            
            # 9. Advertisement time
            time_tag = bottom_row.find('span', style='color: #218700;')
            if time_tag:
                time_text = time_tag.get_text(strip=True)
                if 'Online:' in time_text:
                    time_ago_text = time_text.replace('Online:', '').strip()
                    listing['advertisement_time'] = calculate_minutes_ago(time_ago_text)
                else:
                    listing['advertisement_time'] = 0
            else:
                listing['advertisement_time'] = 0
        
        listings.append(listing)
    
    return listings

def calculate_minutes_ago(time_text):
    now = datetime.now()
    
    if 'minute' in time_text:
        minutes = int(re.search(r'\d+', time_text).group())
        return minutes
    elif 'hour' in time_text:
        hours = int(re.search(r'\d+', time_text).group())
        return hours * 60
    elif 'day' in time_text:
        days = int(re.search(r'\d+', time_text).group())
        return days * 24 * 60
    elif 'currently' in time_text.lower() or 'jetzt' in time_text.lower():
        return 0
    else:
        return 0

if __name__ == "__main__":
    url = "https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Berlin.8.1.1.0.html"
    listings = scrape_wg_gesucht(url)
    
    for idx, listing in enumerate(listings, 1):
        print(f"\nListing #{idx}:")
        print(f"Title: {listing.get('title', 'N/A')}")
        print(f"Status: {listing.get('status', 'N/A')}")
        print(f"Location in city: {listing.get('location_in_city', 'N/A')}")
        print(f"Location street: {listing.get('location_street', 'N/A')}")
        print(f"Price: {listing.get('price', 'N/A')}")
        print(f"Size: {listing.get('size', 'N/A')}")
        print(f"Available from: {listing.get('duration_from', 'N/A')}")
        print(f"Available to: {listing.get('duration_to', 'N/A')}")
        print(f"Advertisement time (minutes ago): {listing.get('advertisement_time', 'N/A')}")
        print(f"Provider: {listing.get('provider', 'N/A')}")


Listing #1:
Title: Modern designtes Studio-Apartment zu vermieten - Internationals welcome
Status: N/A
Location in city: N/A
Location street: N/A
Price: ab540 €
Size: N/A
Available from: kurzfristig frei
Available to: N/A
Advertisement time (minutes ago): N/A
Provider: N/A

Listing #2:
Title: Tolles Studio-Apartment im Studentenwohnheim zu vermieten
Status: N/A
Location in city: N/A
Location street: N/A
Price: ab500 €
Size: N/A
Available from: kurzfristig frei
Available to: N/A
Advertisement time (minutes ago): N/A
Provider: N/A

Listing #3:
Title: STACEY Coliving Mitte | www.stacey.de
Status: 1-Zimmer-Wohnung
Location in city: Berlin
                            
                            Mitte
Location street: Fischerinsel 14
Price: 895 €
Size: 16 m²
Available from: 07.04.2025
Available to: N/A
Advertisement time (minutes ago): 0
Provider: STACEY (Verified company)

Listing #4:
Title: STACEY Coliving Mitte | www.stacey.de
Status: 1-Zimmer-Wohnung
Location in city: Berlin
          

In [3]:
#a better model to save data into sql 
import sqlite3
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import time

def create_database():
    """Create the database and table if they don't exist"""
    conn = sqlite3.connect('webscrapping_results/house_and_price.db')
    c = conn.cursor()
    
    c.execute('''CREATE TABLE IF NOT EXISTS listings
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  title TEXT,
                  status TEXT,
                  location_in_city TEXT,
                  location_street TEXT,
                  price TEXT,
                  size TEXT,
                  duration_from TEXT,
                  duration_to TEXT,
                  advertisement_time_minutes INTEGER,
                  provider TEXT,
                  is_verified_company INTEGER,
                  scrape_timestamp TEXT)''')
    
    conn.commit()
    conn.close()

def scrape_listings():
    """Scrape listings from WG-Gesucht website"""
    url = "https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Berlin.8.1.1.0.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    listings = []
    
    # Find all listing cards
    cards = soup.find_all('div', class_='card_body')
    
    for card in cards:
        try:
            # Extract title
            title_tag = card.find('h3', class_='truncate_title')
            title = title_tag.get_text(strip=True) if title_tag else "N/A"
            
            # Extract status and location info
            status_location = card.find('div', class_='col-xs-11').get_text(strip=True) if card.find('div', class_='col-xs-11') else "N/A"
            parts = [part.strip() for part in status_location.split('|')]
            
            status = parts[0] if len(parts) > 0 else "N/A"
            location_in_city = parts[1] if len(parts) > 1 else "N/A"
            location_street = parts[2] if len(parts) > 2 else "N/A"
            
            # Extract price, dates, and size
            middle_row = card.find('div', class_='middle')
            if middle_row:
                cols = middle_row.find_all('div', class_='col-xs-3') + middle_row.find_all('div', class_='col-xs-5')
                price = cols[0].get_text(strip=True) if len(cols) > 0 else "N/A"
                dates = cols[1].get_text(strip=True) if len(cols) > 1 else "N/A"
                size = cols[2].get_text(strip=True) if len(cols) > 2 else "N/A"
                
                # Split dates into from and to
                if " - " in dates:
                    duration_from, duration_to = dates.split(" - ")
                else:
                    duration_from = dates
                    duration_to = "N/A"
            else:
                price = "N/A"
                size = "N/A"
                duration_from = "N/A"
                duration_to = "N/A"
            
            # Extract provider info and online time
            bottom_row = card.find('div', class_='bottom')
            if bottom_row:
                # Provider name
                provider_span = bottom_row.find('span', class_='ml5')
                provider = provider_span.get_text(strip=True) if provider_span else "N/A"
                
                # Check if verified company
                verified_tag = bottom_row.find('a', class_='label_verified')
                is_verified_company = 1 if verified_tag else 0
                
                # Online time
                online_span = bottom_row.find('span', style='color: #218700;')
                online_text = online_span.get_text(strip=True) if online_span else "N/A"
                
                # Calculate minutes since advertisement
                if "minute" in online_text.lower():
                    minutes = int(''.join(filter(str.isdigit, online_text)))
                elif "hour" in online_text.lower():
                    hours = int(''.join(filter(str.isdigit, online_text)))
                    minutes = hours * 60
                elif "currently" in online_text.lower() or "online" in online_text.lower():
                    minutes = 0
                else:
                    minutes = -1  # Unknown format
            else:
                provider = "N/A"
                is_verified_company = 0
                minutes = -1
            
            # Current timestamp for when we scraped this data
            scrape_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            
            listings.append({
                'title': title,
                'status': status,
                'location_in_city': location_in_city,
                'location_street': location_street,
                'price': price,
                'size': size,
                'duration_from': duration_from,
                'duration_to': duration_to,
                'advertisement_time_minutes': minutes,
                'provider': provider,
                'is_verified_company': is_verified_company,
                'scrape_timestamp': scrape_timestamp
            })
            
        except Exception as e:
            print(f"Error processing a listing: {e}")
            continue
    
    return listings

def save_to_database(listings):
    """Save scraped listings to the database"""
    conn = sqlite3.connect('webscrapping_results/house_and_price.db')
    c = conn.cursor()
    
    for listing in listings:
        # Check if this listing already exists (based on title, provider, and price)
        c.execute('''SELECT id FROM listings 
                     WHERE title = ? AND provider = ? AND price = ?''',
                 (listing['title'], listing['provider'], listing['price']))
        
        if not c.fetchone():  # Only insert if it's not already in the database
            c.execute('''INSERT INTO listings 
                         (title, status, location_in_city, location_street, price, size, 
                          duration_from, duration_to, advertisement_time_minutes, 
                          provider, is_verified_company, scrape_timestamp)
                         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                     (listing['title'], listing['status'], listing['location_in_city'],
                      listing['location_street'], listing['price'], listing['size'],
                      listing['duration_from'], listing['duration_to'], 
                      listing['advertisement_time_minutes'], listing['provider'],
                      listing['is_verified_company'], listing['scrape_timestamp']))
    
    conn.commit()
    conn.close()

def print_listings(listings):
    """Print the scraped listings for verification"""
    print("\nScraped Listings:")
    print("-" * 100)
    for i, listing in enumerate(listings, 1):
        print(f"Listing {i}:")
        print(f"Title: {listing['title']}")
        print(f"Status: {listing['status']}")
        print(f"Location in City: {listing['location_in_city']}")
        print(f"Location Street: {listing['location_street']}")
        print(f"Price: {listing['price']}")
        print(f"Size: {listing['size']}")
        print(f"Available From: {listing['duration_from']}")
        print(f"Available To: {listing['duration_to']}")
        print(f"Advertisement Time (minutes ago): {listing['advertisement_time_minutes']}")
        print(f"Provider: {listing['provider']}")
        print(f"Verified Company: {'Yes' if listing['is_verified_company'] else 'No'}")
        print(f"Scraped At: {listing['scrape_timestamp']}")
        print("-" * 100)

def main():
    # Create directory if it doesn't exist
    os.makedirs('webscrapping_results', exist_ok=True)
    
    # Create database if it doesn't exist
    create_database()
    
    # Scrape listings
    print("Scraping listings from WG-Gesucht...")
    listings = scrape_listings()
    
    if listings:
        # Print the scraped data
        print_listings(listings)
        
        # Save to database
        save_to_database(listings)
        print(f"\nSuccessfully saved {len(listings)} listings to the database.")
    else:
        print("No listings were scraped.")

if __name__ == "__main__":
    main()

Scraping listings from WG-Gesucht...

Scraped Listings:
----------------------------------------------------------------------------------------------------
Listing 1:
Title: Charmantes und helles 1-Zimmer-Apartment in beliebtem Viertel, für Singles oder Paare
Status: N/A
Location in City: N/A
Location Street: N/A
Price: N/A
Size: N/A
Available From: N/A
Available To: N/A
Advertisement Time (minutes ago): -1
Provider: N/A
Verified Company: No
Scraped At: 2025-04-07 00:16:59
----------------------------------------------------------------------------------------------------
Listing 2:
Title: Tolles Studio-Apartment im Studentenwohnheim zu vermieten
Status: N/A
Location in City: N/A
Location Street: N/A
Price: N/A
Size: N/A
Available From: N/A
Available To: N/A
Advertisement Time (minutes ago): -1
Provider: N/A
Verified Company: No
Scraped At: 2025-04-07 00:16:59
----------------------------------------------------------------------------------------------------
Listing 3:
Title: einzimm

In [7]:
#after checking the issue of some minor errors over the data
import sqlite3
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import re

def create_database():
    """Create the database and table if they don't exist"""
    conn = sqlite3.connect('webscrapping_results/house_and_price.db')
    c = conn.cursor()
    
    c.execute('''CREATE TABLE IF NOT EXISTS listings
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  title TEXT,
                  status TEXT,
                  location_in_city TEXT,
                  location_street TEXT,
                  price TEXT,
                  size TEXT,
                  duration_from TEXT,
                  duration_to TEXT,
                  advertisement_time_minutes INTEGER,
                  provider TEXT,
                  is_verified_company INTEGER,
                  scrape_timestamp TEXT)''')
    
    conn.commit()
    conn.close()

def extract_price_size_dates(middle_row):
    """Extract price, size, and dates from the middle row with better format handling"""
    price = "N/A"
    size = "N/A"
    duration_from = "N/A"
    duration_to = "N/A"
    
    if not middle_row:
        return price, size, duration_from, duration_to
    
    # Get all columns in the middle row
    cols = middle_row.find_all('div', recursive=False)
    
    # Typically there are 3 columns: price, dates, size
    if len(cols) >= 3:
        price = cols[0].get_text(strip=True)
        dates = cols[1].get_text(strip=True)
        size = cols[2].get_text(strip=True)
        
        # Handle cases where size and dates might be swapped
        if 'm²' in dates or any(c.isdigit() for c in dates.split()[0] if dates.split()):
            # Dates and size are swapped
            size, dates = dates, size
        
        # Split dates into from and to
        if " - " in dates:
            duration_from, duration_to = dates.split(" - ")
        else:
            duration_from = dates
            duration_to = "N/A"
    
    return price, size, duration_from, duration_to

def parse_online_time(online_text):
    """Parse the online time text into minutes"""
    if not online_text or online_text == "N/A":
        return -1
    
    online_text = online_text.lower()
    
    # Extract numbers from the text
    numbers = re.findall(r'\d+', online_text)
    if not numbers:
        return 0 if any(word in online_text for word in ['jetzt', 'online', 'current']) else -1
    
    num = int(numbers[0])
    
    if 'minute' in online_text:
        return num
    elif 'hour' in online_text or 'stunde' in online_text:
        return num * 60
    elif 'day' in online_text or 'tag' in online_text:
        return num * 1440
    else:
        return 0 if 'online' in online_text else -1

def scrape_listings():
    """Scrape listings from WG-Gesucht website"""
    url = "https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Berlin.8.1.1.0.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    listings = []
    
    # Find all listing cards
    cards = soup.find_all('div', class_='card_body')
    
    for card in cards:
        try:
            # Extract title
            title_tag = card.find('h3', class_='truncate_title')
            title = title_tag.get_text(strip=True) if title_tag else "N/A"
            
            # Extract status and location info
            status_location = card.find('div', class_='col-xs-11').get_text(strip=True) if card.find('div', class_='col-xs-11') else "N/A"
            parts = [part.strip() for part in status_location.split('|') if part.strip()]
            
            status = parts[0] if len(parts) > 0 else "N/A"
            location_in_city = parts[1] if len(parts) > 1 else "N/A"
            location_street = parts[2] if len(parts) > 2 else "N/A"
            
            # Extract price, size, and dates with improved parsing
            middle_row = card.find('div', class_='middle')
            price, size, duration_from, duration_to = extract_price_size_dates(middle_row)
            
            # Clean up size (remove any non-size text)
            if size != "N/A":
                size = ''.join([c for c in size if c.isdigit() or c in ['.', ',']])
                if ',' in size:
                    size = size.replace(',', '.')
                size = f"{size} m²"
            
            # Extract provider info and online time
            bottom_row = card.find('div', class_='bottom')
            if bottom_row:
                # Provider name
                provider_span = bottom_row.find('span', class_='ml5')
                provider = provider_span.get_text(strip=True) if provider_span else "N/A"
                
                # Check if verified company
                verified_tag = bottom_row.find('a', class_='label_verified')
                is_verified_company = 1 if verified_tag else 0
                
                # Online time
                online_span = bottom_row.find('span', style='color: #218700;') or \
                              bottom_row.find('span', class_='online_status')
                online_text = online_span.get_text(strip=True) if online_span else "N/A"
                minutes = parse_online_time(online_text)
            else:
                provider = "N/A"
                is_verified_company = 0
                minutes = -1
            
            # Current timestamp for when we scraped this data
            scrape_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            
            listings.append({
                'title': title,
                'status': status,
                'location_in_city': location_in_city,
                'location_street': location_street,
                'price': price,
                'size': size,
                'duration_from': duration_from,
                'duration_to': duration_to,
                'advertisement_time_minutes': minutes,
                'provider': provider,
                'is_verified_company': is_verified_company,
                'scrape_timestamp': scrape_timestamp
            })
            
        except Exception as e:
            print(f"Error processing a listing: {e}")
            continue
    
    return listings

def save_to_database(listings):
    """Save scraped listings to the database"""
    conn = sqlite3.connect('webscrapping_results/house_and_price.db')
    c = conn.cursor()
    
    for listing in listings:
        # Check if this listing already exists (based on title, provider, and price)
        c.execute('''SELECT id FROM listings 
                     WHERE title = ? AND provider = ? AND price = ?''',
                 (listing['title'], listing['provider'], listing['price']))
        
        if not c.fetchone():  # Only insert if it's not already in the database
            c.execute('''INSERT INTO listings 
                         (title, status, location_in_city, location_street, price, size, 
                          duration_from, duration_to, advertisement_time_minutes, 
                          provider, is_verified_company, scrape_timestamp)
                         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                     (listing['title'], listing['status'], listing['location_in_city'],
                      listing['location_street'], listing['price'], listing['size'],
                      listing['duration_from'], listing['duration_to'], 
                      listing['advertisement_time_minutes'], listing['provider'],
                      listing['is_verified_company'], listing['scrape_timestamp']))
    
    conn.commit()
    conn.close()

def print_listings(listings):
    """Print the scraped listings for verification"""
    print("\nScraped Listings:")
    print("-" * 100)
    for i, listing in enumerate(listings, 1):
        print(f"Listing {i}:")
        print(f"Title: {listing['title']}")
        print(f"Status: {listing['status']}")
        print(f"Location in City: {listing['location_in_city']}")
        print(f"Location Street: {listing['location_street']}")
        print(f"Price: {listing['price']}")
        print(f"Size: {listing['size']}")
        print(f"Available From: {listing['duration_from']}")
        print(f"Available To: {listing['duration_to']}")
        print(f"Advertisement Time (minutes ago): {listing['advertisement_time_minutes']}")
        print(f"Provider: {listing['provider']}")
        print(f"Verified Company: {'Yes' if listing['is_verified_company'] else 'No'}")
        print(f"Scraped At: {listing['scrape_timestamp']}")
        print("-" * 100)

def main():
    # Create directory if it doesn't exist
    os.makedirs('webscrapping_results', exist_ok=True)
    
    # Create database if it doesn't exist
    create_database()
    
    # Scrape listings
    print("Scraping listings from WG-Gesucht...")
    listings = scrape_listings()
    
    if listings:
        # Print the scraped data
        print_listings(listings)
        
        # Save to database
        save_to_database(listings)
        print(f"\nSuccessfully saved {len(listings)} listings to the database.")
    else:
        print("No listings were scraped.")

if __name__ == "__main__":
    main()

Scraping listings from WG-Gesucht...

Scraped Listings:
----------------------------------------------------------------------------------------------------
Listing 1:
Title: Modern designtes Studio-Apartment zu vermieten - Internationals welcome
Status: N/A
Location in City: N/A
Location Street: N/A
Price: N/A
Size: N/A
Available From: N/A
Available To: N/A
Advertisement Time (minutes ago): -1
Provider: N/A
Verified Company: No
Scraped At: 2025-04-07 00:23:19
----------------------------------------------------------------------------------------------------
Listing 2:
Title: Sonnige 1-Zimmer-Wohnung in zentraler Lage zu vermieten
Status: N/A
Location in City: N/A
Location Street: N/A
Price: N/A
Size: N/A
Available From: N/A
Available To: N/A
Advertisement Time (minutes ago): -1
Provider: N/A
Verified Company: No
Scraped At: 2025-04-07 00:23:19
----------------------------------------------------------------------------------------------------
Listing 3:
Title: Ein Zimmer apt
Status: 

In [13]:
import sqlite3
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import re

def create_database():
    """Create the database and table if they don't exist"""
    conn = sqlite3.connect('webscrapping_results/house_and_price.db')
    c = conn.cursor()
    
    c.execute('''CREATE TABLE IF NOT EXISTS listings
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  title TEXT,
                  status TEXT,
                  location_in_city TEXT,
                  location_street TEXT,
                  price TEXT,
                  size TEXT,
                  duration_from TEXT,
                  duration_to TEXT,
                  advertisement_time_minutes INTEGER,
                  provider TEXT,
                  is_verified_company INTEGER,
                  scrape_timestamp TEXT)''')
    
    conn.commit()
    conn.close()

def is_date(text):
    """Check if text matches DD.MM.YYYY date format"""
    return bool(re.match(r'\d{2}\.\d{2}\.\d{4}', text))

def is_size(text):
    """Check if text contains m² or is a number that likely represents size"""
    return 'm²' in text or (text.replace('.', '').isdigit() and float(text) < 200)

def extract_price_size_dates(middle_row):
    """Extract price, size, and dates from the middle row with precise format handling"""
    price = "N/A"
    size = "N/A"
    duration_from = "N/A"
    duration_to = "N/A"
    
    if not middle_row:
        return price, size, duration_from, duration_to
    
    # Get all columns in the middle row
    cols = middle_row.find_all('div', recursive=False)
    
    if len(cols) >= 3:
        price = cols[0].get_text(strip=True)
        
        # Process the middle and right columns
        middle_text = cols[1].get_text(strip=True)
        right_text = cols[2].get_text(strip=True)
        
        # Identify which is dates and which is size
        if is_date(middle_text) or ('-' in middle_text and any(is_date(p) for p in middle_text.split('-'))):
            # Middle column contains dates
            dates = middle_text
            size_candidate = right_text
        elif is_date(right_text) or ('-' in right_text and any(is_date(p) for p in right_text.split('-'))):
            # Right column contains dates
            dates = right_text
            size_candidate = middle_text
        else:
            # Fallback - assume middle is dates, right is size
            dates = middle_text
            size_candidate = right_text
        
        # Extract size
        size_match = re.search(r'(\d+[\.,]?\d*)\s*m²?', size_candidate, re.IGNORECASE)
        if size_match:
            size = size_match.group(1).replace(',', '.') + " m²"
        else:
            size = "N/A"
        
        # Extract dates
        if '-' in dates:
            date_parts = [p.strip() for p in dates.split('-') if p.strip()]
            if len(date_parts) == 2:
                duration_from, duration_to = date_parts
            else:
                duration_from = dates
                duration_to = "N/A"
        else:
            duration_from = dates
            duration_to = "N/A"
    
    return price, size, duration_from, duration_to

def parse_online_time(online_text):
    """Parse the online time text into minutes"""
    if not online_text or online_text == "N/A":
        return -1
    
    online_text = online_text.lower()
    
    # Extract numbers from the text
    numbers = re.findall(r'\d+', online_text)
    if not numbers:
        return 0 if any(word in online_text for word in ['jetzt', 'online', 'current']) else -1
    
    num = int(numbers[0])
    
    if 'minute' in online_text:
        return num
    elif 'hour' in online_text or 'stunde' in online_text:
        return num * 60
    elif 'day' in online_text or 'tag' in online_text:
        return num * 1440
    else:
        return 0 if 'online' in online_text else -1

def scrape_listings():
    """Scrape listings from WG-Gesucht website"""
    url = "https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Berlin.8.1.1.0.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    listings = []
    
    # Find all listing cards
    cards = soup.find_all('div', class_='card_body')
    
    for card in cards:
        try:
            # Extract title
            title_tag = card.find('h3', class_='truncate_title')
            title = title_tag.get_text(strip=True) if title_tag else "N/A"
            
            # Extract status and location info
            status_location = card.find('div', class_='col-xs-11').get_text(strip=True) if card.find('div', class_='col-xs-11') else "N/A"
            parts = [part.strip() for part in status_location.split('|') if part.strip()]
            
            status = parts[0] if len(parts) > 0 else "N/A"
            location_in_city = parts[1] if len(parts) > 1 else "N/A"
            location_street = parts[2] if len(parts) > 2 else "N/A"
            
            # Extract price, size, and dates with improved parsing
            middle_row = card.find('div', class_='middle')
            price, size, duration_from, duration_to = extract_price_size_dates(middle_row)
            
            # Clean up location street (remove line breaks and extra spaces)
            location_street = ' '.join(location_street.split())
            
            # Extract provider info and online time
            bottom_row = card.find('div', class_='bottom')
            if bottom_row:
                # Provider name
                provider_span = bottom_row.find('span', class_='ml5')
                provider = provider_span.get_text(strip=True) if provider_span else "N/A"
                
                # Check if verified company
                verified_tag = bottom_row.find('a', class_='label_verified')
                is_verified_company = 1 if verified_tag else 0
                
                # Online time
                online_span = bottom_row.find('span', style='color: #218700;') or \
                              bottom_row.find('span', class_='online_status')
                online_text = online_span.get_text(strip=True) if online_span else "N/A"
                minutes = parse_online_time(online_text)
            else:
                provider = "N/A"
                is_verified_company = 0
                minutes = -1
            
            # Current timestamp for when we scraped this data
            scrape_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            
            # Debug print for verification
            print("\nRaw Data:")
            print(f"Middle Row: {middle_row.get_text(strip=True) if middle_row else 'N/A'}")
            print(f"Price: {price}")
            print(f"Size Candidate: {size}")
            print(f"Dates Candidate: {duration_from} - {duration_to}")
            
            listings.append({
                'title': title,
                'status': status,
                'location_in_city': location_in_city,
                'location_street': location_street,
                'price': price,
                'size': size,
                'duration_from': duration_from,
                'duration_to': duration_to,
                'advertisement_time_minutes': minutes,
                'provider': provider,
                'is_verified_company': is_verified_company,
                'scrape_timestamp': scrape_timestamp
            })
            
        except Exception as e:
            print(f"Error processing a listing: {e}")
            continue
    
    return listings

def save_to_database(listings):
    """Save scraped listings to the database"""
    conn = sqlite3.connect('webscrapping_results/house_and_price.db')
    c = conn.cursor()
    
    for listing in listings:
        # Check if this listing already exists (based on title, provider, and price)
        c.execute('''SELECT id FROM listings 
                     WHERE title = ? AND provider = ? AND price = ?''',
                 (listing['title'], listing['provider'], listing['price']))
        
        if not c.fetchone():  # Only insert if it's not already in the database
            c.execute('''INSERT INTO listings 
                         (title, status, location_in_city, location_street, price, size, 
                          duration_from, duration_to, advertisement_time_minutes, 
                          provider, is_verified_company, scrape_timestamp)
                         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                     (listing['title'], listing['status'], listing['location_in_city'],
                      listing['location_street'], listing['price'], listing['size'],
                      listing['duration_from'], listing['duration_to'], 
                      listing['advertisement_time_minutes'], listing['provider'],
                      listing['is_verified_company'], listing['scrape_timestamp']))
    
    conn.commit()
    conn.close()

def print_listings(listings):
    """Print the scraped listings for verification"""
    print("\nFinal Processed Listings:")
    print("=" * 100)
    for i, listing in enumerate(listings, 1):
        print(f"\nListing {i}:")
        print(f"Title: {listing['title']}")
        print(f"Status: {listing['status']}")
        print(f"Location in City: {listing['location_in_city']}")
        print(f"Location Street: {listing['location_street']}")
        print(f"Price: {listing['price']}")
        print(f"Size: {listing['size']}")
        print(f"Available From: {listing['duration_from']}")
        print(f"Available To: {listing['duration_to']}")
        print(f"Advertisement Time (minutes ago): {listing['advertisement_time_minutes']}")
        print(f"Provider: {listing['provider']}")
        print(f"Verified Company: {'Yes' if listing['is_verified_company'] else 'No'}")
        print(f"Scraped At: {listing['scrape_timestamp']}")
        print("-" * 100)

def main():
    # Create directory if it doesn't exist
    os.makedirs('webscrapping_results', exist_ok=True)
    
    # Create database if it doesn't exist
    create_database()
    
    # Scrape listings
    print("Scraping listings from WG-Gesucht...")
    listings = scrape_listings()
    
    if listings:
        # Print the scraped data
        print_listings(listings)
        
        # Save to database
        save_to_database(listings)
        print(f"\nSuccessfully saved {len(listings)} listings to the database.")
    else:
        print("No listings were scraped.")

if __name__ == "__main__":
    main()

Scraping listings from WG-Gesucht...

Raw Data:
Middle Row: ab530 €sofort frei - auch monatsweise oder für kurze Zeit
Price: N/A
Size Candidate: N/A
Dates Candidate: N/A - N/A

Raw Data:
Middle Row: ab580 €auch monatsweise
Price: N/A
Size Candidate: N/A
Dates Candidate: N/A - N/A

Raw Data:
Middle Row: 340 €10.04.2025
                    
                                            - 06.05.202518 m²
Price: 340 €
Size Candidate: 18 m²
Dates Candidate: 10.04.2025 - 06.05.2025

Raw Data:
Middle Row: 500 €06.04.2025
                    
                                            - 24.06.202747 m²
Price: 500 €
Size Candidate: 47 m²
Dates Candidate: 06.04.2025 - 24.06.2027

Raw Data:
Middle Row: 895 €07.04.202544 m²
Price: 895 €
Size Candidate: 44 m²
Dates Candidate: 07.04.2025 - N/A

Raw Data:
Middle Row: 895 €07.04.202546 m²
Price: 895 €
Size Candidate: 46 m²
Dates Candidate: 07.04.2025 - N/A

Raw Data:
Middle Row: 1200 €01.05.2025
                    
                                    