In this final project, I will attempt to measure the effects that variables such as duration and the legality of strikes have on both strike survival (using survival analysis) and strike success in wage/collective-bargaining-based strikes, using strike events extracted from newspaper articles. The project consists of three parts: web scraping, event extraction and detection, and survival analysis together with a logit analysis of strike success.

PART 1 - WEBSCRAPING

I will be scraping the contents, titles, links and dates of all articles in the Evrensel İşçi-Sendika category from the start of 2024 to December. This is a fairly standard web scraping script: it crawls through each category page and first scrapes the title, date and link of each article, then opens each article link and extracts the contents. One interesting detail is that I had to use the Playwright API for scraping, since it gets around Evrensel's firewall protection really well by "acting" as a real user. The downside is that web scraping in this manner takes a long time. After some light text cleaning, I export the results to a CSV file.

One small note: I had to delete the cell outputs since the file became too big.

In [None]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd


# Listing URL of the Evrensel "İşçi-Sendika" category; make_category_url()
# appends the page number to this base.
BASE_URL = "https://www.evrensel.net/kategori/2/isci-sendika"
# The crawl walks the category pages from START_PAGE down to END_PAGE
# (inclusive), i.e. in descending page-number order.
# NOTE(review): presumably higher page numbers hold older articles — confirm.
START_PAGE = 403
END_PAGE = 22 

# Size of the semaphore that limits how many article pages are open at once.
MAX_CONCURRENT_ARTICLES = 6
# Navigation timeout passed to page.goto() for a single article, in ms.
ARTICLE_TIMEOUT_MS = 30_000


def make_category_url(page_num: int) -> str:
    """Return the URL of category listing page number ``page_num``."""
    page_suffix = f"/s/{page_num}#haberler"
    return BASE_URL + page_suffix


async def scrape_category_page(page, url):
    """
    Scrape one Evrensel category listing page.

    Parameters
    ----------
    page
        Playwright page object used to navigate to ``url``.
    url : str
        Address of the category listing page.

    Returns
    -------
    list[dict]
        One dict per news item with the keys ``"title"``, ``"date"`` and
        ``"link"``. Items missing a title, a date or a link are skipped.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    await page.goto(url, wait_until="networkidle")

    news_items = []
    items = page.locator("div.kategoriHaberler > span")

    for i in range(await items.count()):
        item = items.nth(i)

        title_loc = item.locator("div.title")
        date_loc = item.locator("div.tarih")
        link_loc = item.locator("a")

        # Skip entries that lack any of the three pieces.
        if not (
            await title_loc.count()
            and await date_loc.count()
            and await link_loc.count()
        ):
            continue

        # `.first` avoids a Playwright strict-mode error when an item
        # contains more than one matching node (e.g. several <a> tags).
        title = (await title_loc.first.inner_text()).strip()
        date = (await date_loc.first.inner_text()).strip()

        link = await link_loc.first.get_attribute("href")
        if not link:
            continue
        # Resolve the href against the listing URL the way a browser would:
        # root-relative, path-relative and protocol-relative links all become
        # absolute, while already-absolute links are left untouched.
        link = urljoin(url, link)

        news_items.append({"title": title, "date": date, "link": link})

    return news_items



async def extract_article_text(page):
    """
    Extract the article body from an already-loaded article page.

    Only ``<p>`` elements inside ``div.news-content`` are read.

    Parameters
    ----------
    page
        Playwright page object showing the article.

    Returns
    -------
    str or None
        The non-empty paragraphs, stripped and joined by blank lines, or
        ``None`` when the page has no matching paragraph at all.
    """
    # One call fetches the text of every paragraph, instead of one
    # browser round trip per paragraph.
    raw_texts = await page.locator("div.news-content p").all_inner_texts()

    if not raw_texts:
        return None

    stripped = (text.strip() for text in raw_texts)
    return "\n\n".join(text for text in stripped if text)


async def scrape_article(context, url, semaphore):
    """
    Fetch a single article in its own tab and return its text.

    The semaphore is held for the whole lifetime of the tab, so it bounds
    how many article tabs are open simultaneously. Failures are reported in
    the result instead of being raised.

    Returns
    -------
    dict
        ``{"link", "content"}`` on success, or
        ``{"link", "content": None, "error"}`` when loading/extraction failed.
    """
    async with semaphore:
        tab = await context.new_page()
        try:
            await tab.goto(url, wait_until="domcontentloaded", timeout=ARTICLE_TIMEOUT_MS)
            await tab.wait_for_selector("div.news-content", timeout=10_000)
            body = await extract_article_text(tab)
        except Exception as exc:
            # Best effort: one broken article must not abort the whole crawl.
            outcome = {"link": url, "content": None, "error": str(exc)}
        else:
            outcome = {"link": url, "content": body}
        finally:
            # Always release the tab, whether or not scraping succeeded.
            await tab.close()

    return outcome



async def crawl_evrensel():
    """
    Crawl the category listing pages, then fetch the text of every article.

    Listing pages are visited from ``START_PAGE`` down to ``END_PAGE``
    (inclusive); the walk stops at the first page that yields no items.
    Articles are de-duplicated by link (first occurrence wins) and their
    contents are fetched concurrently, bounded by ``MAX_CONCURRENT_ARTICLES``.

    Returns
    -------
    pandas.DataFrame
        One row per unique article with the columns ``title``, ``date``,
        ``link`` and ``content``, plus ``error`` when at least one article
        failed to load. The frame is empty (but has its columns) when no
        article was found.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            all_articles = []
            for page_num in range(START_PAGE, END_PAGE - 1, -1):
                url = make_category_url(page_num)
                print(f"Scraping category page {page_num} → {url}")

                rows = await scrape_category_page(page, url)

                if not rows:
                    print(f"0 items on page {page_num}. Stopping early.")
                    break

                all_articles.extend(rows)

            # De-duplicate by link; dicts preserve insertion order, so the
            # first occurrence of each link is kept in its original position.
            unique_by_link = {}
            for row in all_articles:
                unique_by_link.setdefault(row["link"], row)
            all_articles = list(unique_by_link.values())

            semaphore = asyncio.Semaphore(MAX_CONCURRENT_ARTICLES)
            links = [row["link"] for row in all_articles]

            print(f"\nScraping contents for {len(links)} articles...\n")

            article_texts = await asyncio.gather(
                *(scrape_article(context, link, semaphore) for link in links)
            )
        finally:
            # Close the browser even when the crawl is interrupted by an error.
            await browser.close()

    # Explicit columns keep the merge key present even when nothing was
    # scraped; otherwise both frames would be column-less and the merge
    # would raise KeyError: 'link'.
    df_meta = pd.DataFrame(all_articles, columns=["title", "date", "link"])
    if article_texts:
        df_text = pd.DataFrame(article_texts)
    else:
        df_text = pd.DataFrame(columns=["link", "content"])

    return df_meta.merge(df_text, on="link", how="left")


# Top-level ``await``: this relies on the notebook's already-running event
# loop (a plain Python script would need asyncio.run(crawl_evrensel())).
df = await crawl_evrensel()
# Cell output: the first rows together with the total number of rows.
df.head(), len(df)


In [None]:

import html

from IPython.display import HTML, display

# Flatten multi-line date strings into a single line.
df["date"] = df["date"].str.replace("\n", " ", regex=False).str.strip()

# Build the preview on a copy so that presentation markup never ends up in
# the data that is later cleaned and exported.
preview = df.copy()

# The table is rendered with escape=False (needed for the highlight below),
# so escape the scraped text first; otherwise any "<...>" inside an article
# would be interpreted as HTML by the notebook.
for column in preview.columns:
    preview[column] = preview[column].map(
        lambda value: html.escape(value, quote=False) if isinstance(value, str) else value
    )

# Highlight the "Güncelleme" (update) marker in the preview only.
preview["date"] = preview["date"].str.replace(
    "Güncelleme",
    "<span style='color:red; font-weight:bold;'>Güncelleme</span>",
    regex=False,
)

# Show full cell contents and keep MathJax from reinterpreting the text.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.html.use_mathjax', False)

display(HTML(preview.to_html(escape=False)))

In [None]:
import re
def clean_date(text):
    """
    Normalise one scraped date string.

    Removes HTML tags, collapses all whitespace (including newlines) into
    single spaces, and puts exactly one "— " in front of "Güncelleme:".
    The function is idempotent, so applying it to an already-cleaned value
    leaves that value unchanged.

    Parameters
    ----------
    text
        Raw cell value. Non-string values (e.g. NaN/None) are returned as is.

    Returns
    -------
    The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Drop any HTML markup.
    text = re.sub(r"<.*?>", "", text)

    # Trim, then collapse every whitespace run (newlines included).
    text = re.sub(r"\s+", " ", text.strip())

    # Swallow dashes that are already in front of the marker so that
    # re-running the cell cannot produce "— — Güncelleme:".
    text = re.sub(r"(?:—\s*)*Güncelleme:", "— Güncelleme:", text)

    return text

# Run clean_date over every value of the "date" column.
df["date"] = df["date"].apply(clean_date)


# Write the table to CSV without the index column. The "utf-8-sig" encoding
# prepends a BOM — presumably so spreadsheet software detects UTF-8 and shows
# the Turkish characters correctly.
df.to_csv("evrensel_isci_sendika_2024_dec2025_clean_fin.csv", index=False, encoding="utf-8-sig")

# Cell output: preview of the first rows.
df.head()

In [None]:
import re
def clean_date(text):
    """
    Normalise one scraped date string.

    Removes HTML tags, collapses all whitespace (including newlines) into
    single spaces, and puts exactly one "— " in front of "Güncelleme:".
    The function is idempotent, so applying it to an already-cleaned value
    leaves that value unchanged.

    Parameters
    ----------
    text
        Raw cell value. Non-string values (e.g. NaN/None) are returned as is.

    Returns
    -------
    The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Drop any HTML markup.
    text = re.sub(r"<.*?>", "", text)

    # Trim, then collapse every whitespace run (newlines included).
    text = re.sub(r"\s+", " ", text.strip())

    # Swallow dashes that are already in front of the marker so that
    # re-running the cell cannot produce "— — Güncelleme:".
    text = re.sub(r"(?:—\s*)*Güncelleme:", "— Güncelleme:", text)

    return text

# NOTE(review): this cell appears to repeat the cell directly before it
# (same function, same output file) — consider keeping only one of them.
# Run clean_date over every value of the "date" column.
df["date"] = df["date"].apply(clean_date)


# Write the table to CSV without the index column, encoded as "utf-8-sig"
# (UTF-8 with a leading BOM).
df.to_csv("evrensel_isci_sendika_2024_dec2025_clean_fin.csv", index=False, encoding="utf-8-sig")

# Cell output: preview of the first rows.
df.head()

Go to file "detection_extraction_evrensel_FINFIN.ipynb" for the next part. 