<a href="https://colab.research.google.com/github/burakemretetik/news_signal/blob/main/ScrapeNews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrape the News Homepage

In [2]:
import requests
import time
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import re
import csv
import json

In [3]:
def get_website_html(url):
    """
    Fetches and returns the complete HTML content from a given URL.

    Args:
        url (str): The URL to fetch HTML from

    Returns:
        str: The HTML content of the page
    """
    # Set realistic browser headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7",
        "Referer": "https://www.haberturk.com/",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0"
    }

    try:
        print(f"Fetching HTML from {url}...")
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors

        print(f"Successfully retrieved HTML ({len(response.text)} characters)")
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the website: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [4]:
haberturk = get_website_html("https://www.haberturk.com/ekonomi/")
trthaber = get_website_html("https://www.trthaber.com/haber/ekonomi/")
cnnhaber = get_website_html("https://www.cnnturk.com/ekonomi-haberleri/")
bloomberght = get_website_html("https://www.bloomberght.com/haberler/turkiye-ekonomisi/")
bigpara = get_website_html("https://bigpara.hurriyet.com.tr/haberler/ekonomi-haberleri/")

Fetching HTML from https://www.haberturk.com/ekonomi/...
Successfully retrieved HTML (217346 characters)
Fetching HTML from https://www.trthaber.com/haber/ekonomi/...
Successfully retrieved HTML (103956 characters)
Fetching HTML from https://www.cnnturk.com/ekonomi-haberleri/...
Successfully retrieved HTML (224008 characters)
Fetching HTML from https://www.bloomberght.com/haberler/turkiye-ekonomisi/...
Successfully retrieved HTML (67537 characters)
Fetching HTML from https://bigpara.hurriyet.com.tr/haberler/ekonomi-haberleri/...
Successfully retrieved HTML (124055 characters)


## Haber Türk

In [5]:
# Parse the HTML content
soup = BeautifulSoup(haberturk, 'html.parser')

# Find all <a> tags
haberturk_links = soup.find_all('a', {'data-newscategory': 'Ekonomi'}, class_ = "block gtm-tracker" , href=True)

# Filter links that match the specific pattern
#filtered_links = [link for link in all_links if link['href'].startswith("/ekonomi/borsa/hisse")]

# Print the filtered links
for link in haberturk_links:
    print(link['href'])

/avrupa-buy-europe-3769666-ekonomi
/turk-is-yoksulluk-siniri-75-bin-lirayi-gecti-3769774-ekonomi
/turkiye-artik-yuksek-gelirli-3769733-ekonomi
/kripto-paraciya-kira-vergisi-soku-3769617-ekonomi
/tarihi-hat-37-milyon-yolcu-tasidi-3769763-ekonomi
/2024-buyume-rakamlari-aciklandi-3769637-ekonomi
/bakan-simsek-ten-buyume-degerlendirmesi-3769695-ekonomi
/ocakta-issizlik-geriledi-3769669-ekonomi
/tava-cigerin-fiyati-et-doneri-solladi-3769679-ekonomi
/coinbasee-karsi-acilan-davanin-dusuruldu-3769590-ekonomi
/kadin-isciyi-gece-calistirma-kurallari-3769462-ekonomi
/kamu-iscisinin-zam-talebi-aciklandi-3769461-ekonomi
/doviz-mevduatlarinda-15-yilin-en-hizli-artisi-3769434-ekonomi
/skype-mayis-da-kapanacak-3769887-teknoloji
/borsa-gunu-dususle-tamamladi-3769871-ekonomi
/piyasa-raporu-28-subat-2025-cuma-borsa-dolar-altin-ve-kripto-paralarda-son-durum-fai-3769865-ekonomi
/turkiye-nin-kadin-girisimcisi-yarismasi-nda-en-basarili-kadin-girisimciler-odullerini-aldi-3769846-ekonomi
/turkmen-gazi-yarin-tu

## TRT Haber

In [6]:
# Parse the HTML content
soup = BeautifulSoup(trthaber, 'html.parser')

# Find all <a> tags
all_links = soup.find_all('a', class_ = "site-url", href=True)

# Filter links that match the specific pattern
trthaber_links = [link for link in all_links if link['href'].startswith("https://www.trthaber.com/haber/ekonomi/") and link['href'].endswith(".html")]

for link in trthaber_links:
    print(link['href'])

https://www.trthaber.com/haber/ekonomi/ciftcilere-43-milyar-lira-destek-odemesi-bugun-yapilacak-899285.html
https://www.trthaber.com/haber/ekonomi/ciftcilere-43-milyar-lira-destek-odemesi-bugun-yapilacak-899285.html
https://www.trthaber.com/haber/ekonomi/ciftcilere-43-milyar-lira-destek-odemesi-bugun-yapilacak-899285.html
https://www.trthaber.com/haber/ekonomi/ciftcilere-43-milyar-lira-destek-odemesi-bugun-yapilacak-899285.html
https://www.trthaber.com/haber/ekonomi/ciftcilere-43-milyar-lira-destek-odemesi-bugun-yapilacak-899285.html
https://www.trthaber.com/haber/ekonomi/ciftcilere-43-milyar-lira-destek-odemesi-bugun-yapilacak-899285.html
https://www.trthaber.com/haber/ekonomi/cevdet-yilmaz-ic-talepte-saglikli-toparlanma-gozlenmektedir-899278.html
https://www.trthaber.com/haber/ekonomi/cevdet-yilmaz-ic-talepte-saglikli-toparlanma-gozlenmektedir-899278.html
https://www.trthaber.com/haber/ekonomi/cevdet-yilmaz-ic-talepte-saglikli-toparlanma-gozlenmektedir-899278.html
https://www.trthabe

## CNN Haber

In [7]:
# Parse the HTML content
soup = BeautifulSoup(cnnhaber, 'html.parser')

# Find all <a> tags
all_links = soup.find_all('a', class_="navigate", href=True)

# Filter links that match the specific pattern
cnnhaber_links = [link for link in all_links if link['href'].startswith("/ekonomi/")]

# Print the filtered links
for link in cnnhaber_links:
    print(link['href'])

/ekonomi/galeri/son-dakika-emekli-haberi-2024-yilinda-rakam-100-bine-dayandi-o-emekli-maaslari-iptal-ediliyor-hizmet-dokumunuzu-kontrol-edin-2247644
/ekonomi/turkmen-gazi-yarin-turkiyede-2247608
/ekonomi/bakan-isikhan-is-gucu-verilerini-degerlendirdi-2247601
/ekonomi/bakan-simsek-2024-yili-buyume-verilerini-degerlendirdi-iyilesen-guven-ekonomiye-olumlu-etkileyecek-2247593
/ekonomi/galeri/ev-sahipleri-dikkat-1-martta-yeni-donem-basliyor-kirada-33-bin-lira-istisnasi-2247551
/ekonomi/son-dakika-haberi-issizlik-verileri-son-12-yilin-en-dusuk-seviyesine-geriledi-2247538
/ekonomi/son-dakika-haberi-turkiye-ekonomisi-2024te-yuzde-3-2-buyudu-2247531
/ekonomi/borsa-istanbul-haftanin-son-islem-gunune-dususle-basladi-28-subat-2025-2247529
/ekonomi/kuresel-piyasalar-negatif-seyrediyor-trumpin-tarifeleri-ve-enflasyon-endiseleri-piyasalari-sarsiyor-2247524
/ekonomi/bakan-kacir-turkiyenin-ingiltere-ve-avrupaya-katacaklari-cok-fazla-2247493
/ekonomi/son-dakika-haberi-turkiye-ekonomisi-2024te-yuzde-3-2-

## Bloomberg HT

In [8]:
# Parse the HTML content
soup = BeautifulSoup(bloomberght, 'html.parser')

# Find all <a> tags
all_links = soup.find_all('a', href=True)

# Filter links ending with a number
bloomberght_links = [link['href'] for link in all_links if re.search(r'\d$', link['href'])]

# Print the filtered links
for link in bloomberght_links:
    print(link)

/cumhurbaskani-yardimcisi-yilmaz-londra-da-yabanci-yatirimcilarla-bulustu-3742819
/kira-geliri-elde-edenler-icin-beyan-suresi-yarin-basliyor-3742811
/spkden-4-sirketin-borclanma-araci-ihracina-onay-3742749
/cevdet-yilmaz-londra-da-uluslararasi-yatirimcilarla-bir-araya-geldi-3742740
/doviz-mevduatlarinda-1-5-yilin-en-hizli-artisi-3742705
/arac-muayenelerinde-ucretsiz-cagir-merkezi-geliyor-3742492
/yapi-kredi-yatirim-dan-2025-hisse-strateji-raporu-3742403
/arac-muayene-istasyonlarinin-ozellestirme-ihaleleri-basladi-3742402
/aselsan-bmc-ile-yeni-sozlesme-imzaladi-3742401
/yapi-kredi-dolar-cinsi-tahvil-ihrac-ediyor-3742361
/sektorel-guven-hizmet-ve-insaatta-azaldi-3742328
/ocak-ayinda-kredi-karti-sayisi-yuzde-9-artti-3742231
/turkiye-de-tasarruf-yapanlarin-ilk-tercihi-altin-3742227
/finansal-hizmetler-guven-endeksi-yukseldi-3742215
/cevdet-yilmaz-dan-merkez-bankasi-rezervlerine-iliskin-aciklama-3742162
/turkiye-deki-startup-yatirimlari-rekor-seviyeye-ulasti-3742136
/yabanci-para-mevduatlar

## Bigpara

In [9]:
# Parse the HTML content
soup = BeautifulSoup(bigpara, 'html.parser')

# Find all <a> tags
all_links = soup.find_all('a', {'data-query-param': "bpc", 'href': True})

# Filter links ending with a number
bigpara_links = [link for link in all_links if link['href'].startswith("/haberler/ekonomi-haberleri/")]

# Print the filtered links
for link in bigpara_links:
    print(link['href'])

/haberler/ekonomi-haberleri/guclu-buyuyen-turkcellden-guclu-yatirim_ID1608314/
/haberler/ekonomi-haberleri/cumhurbaskani-yardimcisi-yilmaz-toplumsal-refahi-artirmayi-hedefliyoruz_ID1608307/
/haberler/ekonomi-haberleri/bakan-bolat-ekonomi-18-ceyrektir-kesintisiz-buyuyor_ID1608302/
/haberler/ekonomi-haberleri/turkiye-oecdnin-en-hizli-buyuyen-iki-ekonomisinden-biri-oldu_ID1608300/
/haberler/ekonomi-haberleri/ito-baskani-avdagic-buyume-kompozisyonu-kalici-refaha-ulastiracak_ID1608298/
/haberler/ekonomi-haberleri/bakan-isikhan-issizlik-yuzde-8-4-olarak-gerceklesti_ID1608295/
/haberler/ekonomi-haberleri/otomobil-devi-stellantis-karini-acikladi_ID1608294/
/haberler/ekonomi-haberleri/bakan-simsekten-buyume-yorumu-temellerimizi-saglamlastirdik_ID1608291/
/haberler/ekonomi-haberleri/son-dakika-issizlik-rakamlari-belli-oldu_ID1608279/
/haberler/ekonomi-haberleri/son-dakika-turkiye-ekonomisi-yuzde-buyudu_ID1608278/
/haberler/ekonomi-haberleri/imf-baskanindan-ekonomi-icin-korkutan-aciklama_ID160826

# Send News to the LLM and Get a Response

In [48]:
# Parse the HTML content
soup = BeautifulSoup(bigpara, 'html.parser')

for ul_tag in soup.find_all('ul'):
    # Extract href
    href_tag = ul_tag.find('a', href=True)
    href = href_tag['href'] if href_tag else None  # Handle if href not found
    href = href[28:-11] if href_tag else None  # Handle if href not found

    # Extract date and time
    date_tag = ul_tag.find('li', class_='cell012')
    date = date_tag.text if date_tag else None  # Handle if date not found

    time_tag = ul_tag.find('li', class_='cell024')
    time = time_tag.text if time_tag else None  # Handle if time not found

    if href and date and time:  # Print only if all elements found
        news_data.append([href, date, time])  # Store as list for CSV writing

news_data

[['guclu-buyuyen-turkcellden-guclu-yatirim', '28.2.2025', '21:42'],
 ['cumhurbaskani-yardimcisi-yilmaz-toplumsal-refahi-artirmayi-hedefliyoruz',
  '28.2.2025',
  '14:43'],
 ['bakan-bolat-ekonomi-18-ceyrektir-kesintisiz-buyuyor', '28.2.2025', '13:45'],
 ['turkiye-oecdnin-en-hizli-buyuyen-iki-ekonomisinden-biri-oldu',
  '28.2.2025',
  '13:08'],
 ['ito-baskani-avdagic-buyume-kompozisyonu-kalici-refaha-ulastiracak',
  '28.2.2025',
  '12:50'],
 ['bakan-isikhan-issizlik-yuzde-8-4-olarak-gerceklesti', '28.2.2025', '12:11'],
 ['otomobil-devi-stellantis-karini-acikladi', '28.2.2025', '11:58'],
 ['bakan-simsekten-buyume-yorumu-temellerimizi-saglamlastirdik',
  '28.2.2025',
  '11:10'],
 ['dye-ne-kadar-2025-ne-zaman-verilir-fidye-kimlere-verilir-kimlere-verilmez-diyanet-fidye-ve-fitre-2025-bedeli',
  '28.2.2025',
  '17:12'],
 ['son-dakika-issizlik-rakamlari-belli-oldu', '28.2.2025', '10:00'],
 ['son-dakika-turkiye-ekonomisi-yuzde-buyudu', '28.2.2025', '10:00'],
 ['imf-baskanindan-ekonomi-icin-kork

In [11]:
# Assuming news_data is a list of lists like [['href', 'date', 'time'], ...]
df = pd.DataFrame(news_data, columns=['Title', 'Date', 'Time'])

# Convert 'Date' and 'Time' to datetime objects
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d.%m.%Y %H:%M')

# Get current time
now = datetime.now()

# Filter news from the last hour
recent_news = df[df['DateTime'] > now - timedelta(hours=5)]

# Drop the index
recent_news.reset_index(drop=True, inplace=True)

# Display the filtered news
recent_news

Unnamed: 0,Title,Date,Time,DateTime
0,guclu-buyuyen-turkcellden-guclu-yatirim,28.2.2025,21:42,2025-02-28 21:42:00
1,cumhurbaskani-yardimcisi-yilmaz-toplumsal-refa...,28.2.2025,14:43,2025-02-28 14:43:00
2,dye-ne-kadar-2025-ne-zaman-verilir-fidye-kimle...,28.2.2025,17:12,2025-02-28 17:12:00


In [41]:
recent_news = recent_news['Title'].tolist()
recent_news.append("anadolu-efes-190-milyon-liralık-yatırım-aldı") # Add an extra news to test if LLM responses to a signal
recent_news

['guclu-buyuyen-turkcellden-guclu-yatirim',
 'cumhurbaskani-yardimcisi-yilmaz-toplumsal-refahi-artirmayi-hedefliyoruz',
 'dye-ne-kadar-2025-ne-zaman-verilir-fidye-kimlere-verilir-kimlere-verilmez-diyanet-fidye-ve-fitre-2025-bedeli',
 'anadolu-efes-190-milyon-liralık-yatırım-aldı']

In [42]:
!wget https://raw.githubusercontent.com/burakemretetik/news_signal/master/bist_100_hisseleri.csv

bist_100 = pd.read_csv("bist_100_hisseleri.csv")

stock_list = bist_100["Hisse Adı"].tolist()

my_stocks = stock_list[:10]

--2025-02-28 20:06:33--  https://raw.githubusercontent.com/burakemretetik/news_signal/master/bist_100_hisseleri.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6215 (6.1K) [text/plain]
Saving to: ‘bist_100_hisseleri.csv.8’


2025-02-28 20:06:33 (54.1 MB/s) - ‘bist_100_hisseleri.csv.8’ saved [6215/6215]



In [43]:
my_stocks

['ADEL KALEMCİLİK TİCARET VE SANAYİ',
 'ANADOLU EFES BİRACILIK VE MALT SANAYİİ',
 'AG ANADOLU GRUBU HOLDİNG',
 'AGROTECH YÜKSEK TEKNOLOJİ VE YATIRIM',
 'AHLATCI DOĞAL GAZ DAĞITIM ENERJİ VE YATIRIM',
 'AKBANK',
 'AKÇANSA ÇİMENTO SANAYİ VE TİCARET',
 'AKFEN GAYRİMENKUL YATIRIM ORTAKLIĞI',
 'AKFEN YENİLENEBİLİR ENERJİ',
 'AKSA AKRİLİK KİMYA SANAYİİ']

In [44]:
prompt = f"""Analyze the news headlines below and identify **ONLY** headlines that **explicitly** mention one of my listed stocks.
----------
**Rules**:
1. **Direct Mention Required**: Include ONLY if the headline directly mentions the company (e.g., "COMPANY_NAME announces record profits").
2. **Exclude**:
   - Sector-wide trends.
   - Government policies.
   - Indirect references.
3. **No interpretations/speculations**: If unsure, omit.
----------
**Output Format**:
```json
{{
  "direct_news": [
    {{"hisse_adi": "COMPANY_NAME", "haber_basligi": "exact-headline-slug"}}
  ],
  "no_direct_news_found": true/false
}}
My Stocks: {my_stocks}

News Headlines: {recent_news}"""

In [38]:
!pip install openai
from openai import OpenAI

import os
os.environ["OPENAI_API_KEY"] = "<Your_API_KEY>"
client = OpenAI()



In [45]:
response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
  ]
)

print(response.choices[0].message.content)

```json
{
  "direct_news": [
    {"hisse_adi": "ANADOLU EFES BİRACILIK VE MALT SANAYİİ", "haber_basligi": "anadolu-efes-190-milyon-liralık-yatırım-aldı"}
  ],
  "no_direct_news_found": false
}
```
