In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import requests
import re
from bs4 import BeautifulSoup
import time
import urllib3
import certifi 
import seaborn as sns
import numpy as np

In [None]:
# Function: to scrape English speeches 

# The scraper below issues HTTPS requests with certificate verification
# disabled; suppress urllib3's InsecureRequestWarning so the output stays readable.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Request headers shared by every HTTP call (browser-like User-Agent).
headers = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/117.0 Safari/537.36",
        ]
    ),
}

def _clean_paragraphs(content_div):
    """Return the usable paragraph texts found in the ``<p>`` tags of *content_div*."""
    texts = (p.get_text(strip=True) for p in content_div.find_all("p"))
    # Drop fragments ending with a comma (greetings) and anything shorter
    # than four words; the word-count test also removes empty strings.
    return [t for t in texts if not t.endswith(",") and len(t.split()) >= 4]


def f_scrape_tr_speech(
    listing_path: str,
    total_pages: int = 18,
    pause: float = 1.0,
    *,
    verify_tls: bool = False,
) -> pd.DataFrame:
    """Scrape speeches from a paginated listing, one output row per paragraph.

    Reads the module-level globals ``base_url`` (site root) and ``headers``
    (HTTP request headers); both must be defined before this is called.

    Args:
        listing_path: Path of the listing page, appended to ``base_url``.
        total_pages: Number of listing pages to walk, starting at page 1.
        pause: Seconds to sleep after each article request.
        verify_tls: When true, verify HTTPS certificates against the certifi
            CA bundle. The default (False) keeps the previous behaviour of
            skipping certificate verification.

    Returns:
        A DataFrame with columns ``date``, ``link``, ``title`` and ``text``.
        Entries whose title mentions a visit or phone call, entries without
        a link, and articles without a recognisable content container or
        without any usable paragraph contribute no rows.

    Raises:
        requests.HTTPError: If a listing or article page returns an error
            status. Other exceptions raised by the HTTP request (for example
            timeouts) also propagate; records collected so far are lost.
    """
    from urllib.parse import urljoin

    # Passed explicitly on every request so the verification choice is
    # unambiguous: a CA-bundle path enables verification, False disables it.
    tls_verify = certifi.where() if verify_tls else False

    # regex to filter out phone calls / visits
    skip_title = re.compile(r"\bvisit\b|\bphone call(s)?\b", flags=re.IGNORECASE)

    all_records = []

    with requests.Session() as session:
        for page_num in range(1, total_pages + 1):
            url = base_url + listing_path if page_num == 1 else f"{base_url}{listing_path}?&page={page_num}"
            print(f"Scraping page {page_num} → {url}")

            resp = session.get(url, headers=headers, timeout=10, verify=tls_verify)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "html.parser")

            for dt in soup.find_all("dt", class_="date"):
                # A date entry without a following <dd>, or without a link
                # inside it, cannot be followed; skip it instead of crashing.
                dd = dt.find_next_sibling("dd")
                a = dd.find("a") if dd is not None else None
                href = a.get("href") if a is not None else None
                if not href:
                    continue

                title = a.get_text(strip=True)
                # skip titles mentioning visits or phone calls
                if skip_title.search(title):
                    continue

                date_str = dt.get_text(strip=True)
                # urljoin handles root-relative, relative and absolute hrefs.
                full_url = urljoin(base_url, href)

                # Fetch the detail page
                art_resp = session.get(full_url, headers=headers, timeout=10, verify=tls_verify)
                art_resp.raise_for_status()
                # Pause after every article request, including those that
                # end up contributing no rows.
                time.sleep(pause)

                art_soup = BeautifulSoup(art_resp.content, "html.parser")
                content_div = (
                    art_soup.find(id="divContentArea")
                    or art_soup.select_one("div.field--name-body")
                    or art_soup.find("article")
                )
                if not content_div:
                    continue

                all_records.extend(
                    {
                        "date": date_str,
                        "link": full_url,
                        "title": title,
                        "text": paragraph,
                    }
                    for paragraph in _clean_paragraphs(content_div)
                )

    return pd.DataFrame(all_records, columns=["date", "link", "title", "text"])


# Turkey: English-language speeches listing on tccb.gov.tr.
# `base_url` is read as a module-level global by f_scrape_tr_speech,
# so it has to be assigned before the call below.
base_url = "https://www.tccb.gov.tr"
df_tr_speech_en = f_scrape_tr_speech(
    listing_path="/en/receptayyiperdogan/speeches/",
)

# Optional: save the raw scrape to CSV (currently disabled).
# df_tr_speech_en.to_csv("../data/raw/tr_speech_en.csv", index=False, encoding="utf-8-sig")
