In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin
import re
import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime

In [2]:
# Search-results URL for the selected sub-categories. It ends in `page=1`,
# which the scraping function rewrites for each page it visits.
base_url = (
    "https://www.gardrops.com/search"
    "?subCats=1408%2C1402%2C1403%2C1410%2C1405%2C1409%2C1404%2C1407%2C1413%2C1406%2C1411"
    "&page=1"
)

# Browser-like HTTP request headers (Turkish preferred, English as fallback).
# NOTE(review): no cell visible here passes these to `requests`.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7",
    "Connection": "keep-alive",
}

In [3]:
# (output field, regex applied to the raw page source, suffix appended to the capture).
# The patterns look for backslash-escaped JSON fragments (\"key\":\"value\") inside
# the page markup; the first match in the document wins.
_EMBEDDED_FIELD_PATTERNS = (
    ('Fiyat', r'\\"price\\":(\d+)', ' ₺'),
    ('Marka', r'\\"brandName\\":\\"([^"\\\\]+)\\"', ''),
    ('Kategori', r'\{\\"type\\":\\"CATEGORY\\",\\"key\\":\\"Kategori\\",\\"value\\":\\"([^"\\\\]+)\\"\}', ''),
    ('Kullanım Durumu', r'\{\\"type\\":\\"CONDITION\\",\\"key\\":\\"Kullanım Durumu\\",\\"value\\":\\"([^"\\\\]+)\\"\}', ''),
    ('Renk', r'\{\\"type\\":\\"COLOR\\",\\"key\\":\\"Renk\\",\\"value\\":\\"([^"\\\\]+)\\"\}', ''),
    ('İlan Tarihi', r'\{\\"type\\":\\"UPLOAD_DATE\\",\\"key\\":\\"Yüklenme Tarihi\\",\\"value\\":\\"([^"\\\\]+)\\"\}', ''),
    ('Görsel URL', r'\\"images\\":\[\{\\"small\\":\\"([^"\\\\]+)\\"', ''),
    ('Satıcı', r'\\"username\\":\\"([^"\\\\]+)\\"', ''),
    ('Beğeni Sayısı', r'\\"likeCount\\":(\d+)', ''),
    ('Ürün Adı', r'\\"title\\":\\"([^"\\\\]+)\\"', ''),
    ('Açıklama', r'\\"description\\":\\"([^"\\\\]+)\\"', ''),
)


def get_product_details(product_url, driver=None):
    """Load one product page and extract its listing attributes.

    Values are taken first from escaped JSON embedded in the page source
    (see ``_EMBEDDED_FIELD_PATTERNS``). The title, description and image
    then fall back to the ``og:title`` / ``og:description`` meta tags and
    the preloaded image ``<link>`` when the JSON did not provide them.

    Args:
        product_url: Absolute URL of the product page.
        driver: Optional, already-running Selenium WebDriver to reuse. When
            omitted, a headless Chrome instance is started for this call and
            quit before returning. A driver supplied by the caller is never
            quit here, so one browser can serve many product pages.

    Returns:
        A dict keyed by the Turkish column names (``'URL'``, ``'Ürün Adı'``,
        ``'Fiyat'``, ...); fields that could not be found stay ``''``.
        ``'Görüntülenme'`` is never populated by this function.
        Returns ``None`` if loading or parsing the page raises.
    """
    product_data = {
        'URL': product_url,
        'Ürün Adı': '',
        'Fiyat': '',
        'Marka': '',
        'Kullanım Durumu': '',
        'Renk': '',
        'Kategori': '',
        'İlan Tarihi': '',
        'Açıklama': '',
        'Satıcı': '',
        'Görsel URL': '',
        'Beğeni Sayısı': '',
        'Görüntülenme': ''
    }

    # Only a driver created here is ours to shut down.
    owns_driver = driver is None
    if owns_driver:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(product_url)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Fixed pause after <body> appears, before the source is read.
        time.sleep(3)

        # Take ONE snapshot so the regexes and the soup see the same markup.
        page_html = driver.page_source
        soup = BeautifulSoup(page_html, 'html.parser')

        for field, pattern, suffix in _EMBEDDED_FIELD_PATTERNS:
            match = re.search(pattern, page_html)
            if match:
                product_data[field] = match.group(1) + suffix

        # Fallbacks for fields the embedded JSON did not yield.
        if not product_data['Ürün Adı']:
            meta_title = soup.find('meta', property='og:title')
            if meta_title:
                product_data['Ürün Adı'] = meta_title.get('content', '')

        if not product_data['Açıklama']:
            meta_desc = soup.find('meta', property='og:description')
            if meta_desc:
                product_data['Açıklama'] = meta_desc.get('content', '')

        if not product_data['Görsel URL']:
            image_link = soup.find('link', rel='preload', attrs={'as': 'image'})
            if image_link:
                product_data['Görsel URL'] = image_link.get('href', '')

        return product_data

    except Exception as e:
        print(f"✗ Ürün detayı çekilemedi ({product_url}): {e}")
        return None

    finally:
        if owns_driver:
            driver.quit()

In [4]:
# Substrings that disqualify an href from being treated as a product link.
_NON_PRODUCT_MARKERS = (
    'premium', 'help', 'download', 'about',
    'kadin-x', 'erkek-x', 'cocuk-x', '-x-c',
    '-x-sc', 'cart', 'favorites', 'search',
)


def _build_page_url(base_url, page_num):
    """Return ``base_url`` with its ``page`` query parameter set to ``page_num``."""
    # Replace the whole number after `page=`; a plain str.replace('page=1', ...)
    # would turn `page=10` into `page=<n>0`. A callable replacement avoids any
    # ambiguity between the group reference and the digits that follow it.
    updated, replaced = re.subn(
        r'([?&]page=)\d+',
        lambda m: f"{m.group(1)}{page_num}",
        base_url,
    )
    if replaced:
        return updated
    # No page parameter present: append one instead of refetching the same URL.
    separator = '&' if '?' in base_url else '?'
    return f"{base_url}{separator}page={page_num}"


def _extract_product_links(soup):
    """Collect unique absolute product URLs from a parsed listing page, in document order."""
    links = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor.get('href', '')
        # Candidates are site-relative links that wrap an image.
        if not (href.startswith('/') and anchor.find('img')):
            continue
        lowered = href.lower()
        if any(marker in lowered for marker in _NON_PRODUCT_MARKERS):
            continue
        full_url = urljoin('https://www.gardrops.com', href)
        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)
    return links


def scrape_gardrops_with_selenium(base_url, max_pages=50, max_products_per_page=None):
    """Walk paginated search results and scrape every product found.

    For each page from 1 to ``max_pages`` the listing is opened in a headless
    Chrome session, scrolled to the bottom three times and back to the top,
    and parsed for product links. Each link is then passed to
    ``get_product_details``.

    Args:
        base_url: Search URL. Its ``page`` query parameter is set to the
            current page number (appended if the URL has none).
        max_pages: Number of result pages to visit, starting at page 1.
        max_products_per_page: If truthy, only the first N product links of
            each page are processed; otherwise all of them.

    Returns:
        A list of product dicts that have a non-empty ``'Ürün Adı'``. If an
        exception (other than an interrupt) aborts the crawl, it is printed
        with its traceback and the products gathered so far are returned.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    all_products = []

    try:
        # Visit every page
        for page_num in range(1, max_pages + 1):
            current_url = _build_page_url(base_url, page_num)

            print(f"Sayfa {page_num}/{max_pages} - URL: {current_url}")
            driver.get(current_url)

            # Fixed pause after navigation, before scrolling.
            time.sleep(8)

            print(" Sayfa kaydırılıyor")
            for _ in range(3):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

            # Scroll back to the top
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            product_links = _extract_product_links(soup)

            print(f"  {len(product_links)} ürün bulundu")

            if not product_links:
                print(f"Sayfa {page_num}'de ürün bulunamadı.")
                continue

            if max_products_per_page:
                product_links = product_links[:max_products_per_page]

            page_products = 0
            for idx, product_url in enumerate(product_links, 1):
                print(f"Ürün {idx}/{len(product_links)} işleniyor", end='\r')

                try:
                    product_data = get_product_details(product_url)
                    if product_data and product_data['Ürün Adı']:  # keep only products with a title
                        all_products.append(product_data)
                        page_products += 1
                    time.sleep(1)
                except Exception as e:
                    print(f"\n   ✗ Ürün atlandı: {str(e)[:50]}")
                    continue

            print(f"\n  Sayfa {page_num} tamamlandı: {page_products} ürün çekildi (Toplam: {len(all_products)})\n")

    except Exception as e:
        print(f"\n Hata: {e}")
        import traceback
        traceback.print_exc()

    finally:
        driver.quit()
        print("\nTarayıcı kapatıldı")

    return all_products

In [5]:
# Main scraping run — adjust the number of pages to fetch here.
base_url = "https://www.gardrops.com/search?subCats=1408%2C1402%2C1403%2C1410%2C1405%2C1409%2C1404%2C1407%2C1413%2C1406%2C1411&page=1"

all_products_50_pages = scrape_gardrops_with_selenium(
    base_url=base_url,
    max_pages=50,                # how many result pages to fetch
    max_products_per_page=None,  # None = every product on each page (or give a count: 5, 10, ...)
)

# Handle the empty outcome first; otherwise tabulate, save and preview.
if not all_products_50_pages:
    print("\n Hiç ürün çekilemedi!")
else:
    df_all_pages = pd.DataFrame(all_products_50_pages)

    # Save to Excel; the file name carries the product count and a timestamp.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"gardrops_cantalar_{len(all_products_50_pages)}_urun_{timestamp}.xlsx"
    df_all_pages.to_excel(filename, index=False, engine='openpyxl')

    print(f"\n BAŞARILI! {len(all_products_50_pages)} ürün '{filename}' dosyasına kaydedildi\n")
    print(" İLK 5 ÜRÜN:")
    first_rows_text = df_all_pages.head().to_string()
    print(first_rows_text)

Sayfa 1/50 - URL: https://www.gardrops.com/search?subCats=1408%2C1402%2C1403%2C1410%2C1405%2C1409%2C1404%2C1407%2C1413%2C1406%2C1411&page=1

Tarayıcı kapatıldı


KeyboardInterrupt: 