In [4]:
import csv
import random
import time
from pathlib import Path
from typing import List, Dict, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [5]:
# Root of the site being scraped; START_URL is the first listing page.
BASE_URL = "https://quotes.toscrape.com"
START_URL = BASE_URL + "/"  # first page

In [9]:
# HTTP / HTTPS session and basic settings

In [24]:
# One shared Session for all requests, carrying an identifying User-Agent header.
session = requests.Session()
session.headers["User-Agent"] = "Educational scraper"

In [25]:
DEFAULT_TIMEOUT: int = 10   # passed as `timeout=` to session.get in fetch()
MAX_RETRIES: int = 3        # number of attempts fetch() makes before giving up
BACKOFF_BASE: float = 1.5   # retry pause is BACKOFF_BASE ** attempt plus jitter


In [26]:
# Fetching a URL

In [46]:
def fetch(url : str) -> Optional[requests.Response]:
    """Download ``url`` through the shared session, retrying on failure.

    Up to ``MAX_RETRIES`` attempts are made. An attempt fails when the server
    answers with a status other than 200 or when ``requests`` raises a
    ``RequestException``; in both cases a message is printed and, if another
    attempt remains, the function pauses for ``BACKOFF_BASE ** attempt``
    seconds plus up to half a second of random jitter.

    Args:
        url: Address to request.

    Returns:
        The response of the first attempt that came back with HTTP 200, or
        ``None`` when every attempt failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, timeout=DEFAULT_TIMEOUT)
        except requests.RequestException as e:
            print(f"[HATA] {url} istek hatası : {e}")
        else:
            if response.status_code == 200:
                return response
            print(f"[UYARI] {url} -> HTTP {response.status_code}")

        # Back off only when another attempt follows; sleeping after the
        # last failure would just delay the give-up message.
        if attempt < MAX_RETRIES:
            backoff_time = BACKOFF_BASE ** attempt + random.uniform(0, 0.5)
            time.sleep(backoff_time)

    print("[PES ETTİM]")
    return None
                

In [47]:
# Quick manual check of fetch() against the site root; r is None if every retry failed.
r = fetch(BASE_URL)

In [48]:
# Dump the page's raw HTML. fetch() returns None when it gives up, so guard the access.
if r is not None:
    print(r.text)

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
    
    
</head>
<body>
    <div class="container">
        <div class="row header-box">
            <div class="col-md-8">
                <h1>
                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>
                </h1>
            </div>
            <div class="col-md-4">
                <p>
                
                    <a href="/login">Login</a>
                
                </p>
            </div>
        </div>
    

<div class="row">
    <div class="col-md-8">

    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
        <span>by <small class="auth

In [63]:
from typing import List, Dict
from bs4 import BeautifulSoup

def quotes_parser(html: str) -> List[Dict]:
    """Extract every quote found in a page of HTML.

    Each ``div.quote`` element yields one dict with three string keys:
    ``"text"`` (from ``span.text``), ``"author"`` (from ``small.author``) and
    ``"tags"`` (the ``div.tags a.tag`` texts joined with commas). A missing
    element contributes an empty string.

    Args:
        html: Markup of a quotes listing page.

    Returns:
        One dict per quote, in document order; an empty list if none match.
    """
    def _stripped(node) -> str:
        # select_one() gives None when nothing matched.
        return node.get_text(strip=True) if node else ""

    document = BeautifulSoup(html, "html.parser")
    return [
        {
            "text": _stripped(block.select_one("span.text")),
            "author": _stripped(block.select_one("small.author")),
            # Joining an empty selection already produces "".
            "tags": ",".join(
                tag.get_text(strip=True)
                for tag in block.select("div.tags a.tag")
            ),
        }
        for block in document.select("div.quote")
    ]


In [64]:
# Try the parser on the first page. fetch() returns None once its retries are
# exhausted, so fall back to an empty result instead of failing on `.text`.
response = fetch(BASE_URL)
results = quotes_parser(response.text) if response is not None else []
print(results)

[{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': 'change,deep-thoughts,thinking,world'}, {'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': 'abilities,choices'}, {'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein', 'tags': 'inspirational,life,live,miracle,miracles'}, {'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', 'author': 'Jane Austen', 'tags': 'aliteracy,books,classic,humor'}, {'text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", 'author': 'Marilyn Monroe', 'tags': 'be-yourself,inspirational'}, {'text': '“Try not to become a man of su

In [71]:
def find_next_page_url(html: str) -> Optional[str]:
    """Return the absolute URL of the next listing page, if the page links to one.

    The link is looked up with the selector ``li.next > a``. Its ``href`` is
    resolved against ``BASE_URL`` with ``urljoin`` rather than by string
    concatenation, so root-relative (``/page/2/``), slash-less (``page/2/``)
    and already-absolute hrefs all yield a well-formed URL.

    Args:
        html: Markup of a quotes listing page.

    Returns:
        The next page's URL, or ``None`` when there is no such link or it has
        no (or an empty) ``href``.
    """
    soup = BeautifulSoup(html, "html.parser")
    next_link = soup.select_one("li.next > a")
    href = next_link.get("href") if next_link else None
    if not href:
        return None
    return urljoin(BASE_URL, href)

In [81]:
def save_to_csv(rows: List[Dict], out_path: Path) -> None:
    """Append ``rows`` to a CSV file with columns ``text``, ``author``, ``tags``.

    The parent directory is created if needed. The header row is written when
    the file does not exist yet or exists but is empty, so a pre-created
    zero-byte file still ends up with a header. Existing content is never
    overwritten; rows are always appended.

    Args:
        rows: Dicts keyed by the three column names.
        out_path: Destination CSV file.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` if a row has a key that is
            not one of the three columns.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fieldnames = ["text", "author", "tags"]
    # An existing but empty file has no header yet either.
    write_header = not out_path.exists() or out_path.stat().st_size == 0

    with out_path.open("a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerows(rows)

In [82]:
def crawl_all_quotes(start_url: str=START_URL, out_csv: str="./data/quotes.csv") -> int:
    """Walk the paginated listing from ``start_url`` and save every quote to CSV.

    Each page is downloaded with ``fetch``, parsed with ``quotes_parser`` and
    its records appended to ``out_csv`` via ``save_to_csv``. The crawl follows
    the link returned by ``find_next_page_url`` until there is none, or stops
    early when a page cannot be fetched. Because records are appended,
    running the crawl again against the same file adds the rows a second time.

    Args:
        start_url: First page to fetch.
        out_csv: Path of the CSV file that receives the records.

    Returns:
        Total number of quote records written during this call.
    """
    current_url = start_url
    csv_path = Path(out_csv)
    page_no = 1
    total = 0

    print("[Veri kazıma başladı]")

    while current_url:
        print(f"[GET] {page_no} kazınıyor")
        resp = fetch(current_url)
        if resp is None:
            print("[Durdu]")
            break
        quotes = quotes_parser(resp.text)

        if quotes:
            save_to_csv(quotes, csv_path)
            # Count records, not pages, so the total reflects what was saved.
            total += len(quotes)
            print(f"[Ok] {page_no} : {len(quotes)} kayıt eklendi")
        else:
            print("[Uyarı] Bu sayfada quote yok")

        next_url = find_next_page_url(resp.text)
        if not next_url:
            print("Bitti")
            break

        # Politeness delay, needed only when another request follows.
        time.sleep(random.uniform(0.8, 1.6))
        current_url = next_url
        page_no += 1

    return total
        

In [83]:
# Entry point: run the full crawl with its default start URL and output path.
if __name__ == "__main__":
    crawl_all_quotes()

[Veri kazıma başladı]
[GET] 1 kazınıyor
[Ok] 1 : 10 kayıt eklendi
[GET] 2 kazınıyor
[Ok] 2 : 10 kayıt eklendi
[GET] 3 kazınıyor
[Ok] 3 : 10 kayıt eklendi
[GET] 4 kazınıyor
[Ok] 4 : 10 kayıt eklendi
[GET] 5 kazınıyor
[Ok] 5 : 10 kayıt eklendi
[GET] 6 kazınıyor
[Ok] 6 : 10 kayıt eklendi
[GET] 7 kazınıyor
[Ok] 7 : 10 kayıt eklendi
[GET] 8 kazınıyor
[Ok] 8 : 10 kayıt eklendi
[GET] 9 kazınıyor
[Ok] 9 : 10 kayıt eklendi
[GET] 10 kazınıyor
[Ok] 10 : 10 kayıt eklendi
Bitti
