# In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import json
import pandas as pd
import html

def get_article_links(category_url):
    """Collect candidate article URLs from a category page.

    An anchor is treated as an article link when its ``href`` contains a
    hyphen and at least one digit. Root-relative hrefs are resolved against
    ``https://www.klix.ba``.

    Args:
        category_url: URL of the category (section) page to scan.

    Returns:
        A sorted list of unique absolute http(s) URLs. Sorting keeps the
        order reproducible between runs (set iteration order is not).

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the request times out.
    """
    # Without a timeout requests may wait indefinitely on a stalled server.
    page = requests.get(category_url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    links = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']

        if "-" not in href or not any(char.isdigit() for char in href):
            continue

        if href.startswith("//"):
            # Protocol-relative URL: it already names a host, only add a scheme.
            full_url = "https:" + href
        elif href.startswith("/"):
            full_url = "https://www.klix.ba" + href
        else:
            full_url = href

        # Drop fragments, mailto:, javascript: and other hrefs that cannot be
        # downloaded; passing them to requests.get() would raise.
        if full_url.startswith(("http://", "https://")):
            links.add(full_url)

    return sorted(links)

# Category page whose article links are collected.
category_url = "https://www.klix.ba/biznis"
article_links = get_article_links(category_url)

# Print a header ("Collected links:") followed by one URL per line.
print("Prikupljeni linkovi:")
# NOTE: the loop variable `link` stays bound at module scope after this loop
# finishes, holding the last printed URL.
for link in article_links:
    print(link)

def _ld_name(node):
    """Return the ``name`` of a JSON-LD value, or 'N/A' when there is none.

    The value may be a single object, a list of objects (names are joined
    with ', ') or a plain string.
    """
    if isinstance(node, dict):
        name = node.get('name')
        return name if isinstance(name, str) and name else 'N/A'
    if isinstance(node, list):
        names = [name for name in map(_ld_name, node) if name != 'N/A']
        return ', '.join(names) if names else 'N/A'
    if isinstance(node, str) and node:
        return node
    return 'N/A'


def _format_date(raw):
    """Convert an ISO-8601 timestamp to 'dd.mm.YYYY.'; 'N/A' if unparseable."""
    if not isinstance(raw, str):
        return 'N/A'
    raw = raw.strip()
    if raw.endswith('Z'):
        # Spell the UTC suffix as an explicit offset so that fromisoformat()
        # accepts it on every Python 3.7+ version.
        raw = raw[:-1] + '+00:00'
    try:
        return datetime.fromisoformat(raw).strftime("%d.%m.%Y.")
    except ValueError:
        return 'N/A'


def _split_headline(full_naslov):
    """Split 'overline / title / subtitle' into a 3-tuple.

    Missing parts are 'N/A'. HTML entities are decoded in every part.
    """
    if not isinstance(full_naslov, str):
        return 'N/A', 'N/A', 'N/A'
    parts = [html.unescape(part.strip()) for part in full_naslov.split('/', 2)]
    if len(parts) == 3:
        return parts[0], parts[1], parts[2]
    if len(parts) == 2:
        return parts[0], parts[1], 'N/A'
    return 'N/A', parts[0], 'N/A'


def _parse_article(page_html, url):
    """Extract one article record (a dict) from the HTML of its page.

    Every field is computed from this page only; a field that cannot be
    found is 'N/A', except the link, which falls back to ``url``.
    """
    # Same parser as the link collector, so results do not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page_html, 'html.parser')

    # Category: first path segment of the category anchor's href.
    kat = 'N/A'
    kategorija = soup.find('a', class_="hover:no-underline hover:text-gray-500 dark:hover:text-white block")
    if kategorija:
        segments = kategorija.get('href', '').split('/')
        if len(segments) > 1:
            kat = segments[1]

    # Article metadata from the first JSON-LD script; missing or malformed
    # metadata leaves the dependent fields at 'N/A' instead of aborting.
    json_data = {}
    info = soup.find('script', type='application/ld+json')
    if info and info.string:
        try:
            loaded = json.loads(info.string)
        except json.JSONDecodeError:
            loaded = None
        if isinstance(loaded, dict):
            json_data = loaded

    nadnaslov, naslov, podnaslov = _split_headline(json_data.get('headline', 'N/A'))

    sadrzaj_parts = []

    # Lead paragraph
    excerpt_div = soup.find('div', id='excerpt')
    if excerpt_div:
        excerpt_span = excerpt_div.find('span', class_='lead')
        if excerpt_span:
            sadrzaj_parts.append(excerpt_span.get_text(strip=True))

    # Main body text
    main_text_div = soup.find('div', class_='break-words mt-3 text-lg lg:text-xl space-y-4 mb-3 leading-6 md:leading-8 dark:text-gray-200')
    if main_text_div:
        sadrzaj_parts.append(main_text_div.get_text(separator='\n', strip=True))

    # Closing part
    end_text_div = soup.find('div', class_='lg:max-w-2xl lg:mx-auto mt-3 text-lg md:text-xl mb-3 leading-6 md:leading-8 dark:text-gray-200')
    if end_text_div:
        sadrzaj_parts.append(end_text_div.get_text(separator='\n', strip=True))

    # Join all parts into a single text
    sadrzaj = '\n\n'.join(sadrzaj_parts) if sadrzaj_parts else 'N/A'

    # Prefer the canonical og:url; fall back to the URL that was requested.
    link_info = soup.find('meta', property='og:url')
    link = link_info.get('content', url) if link_info else url

    return {
        "Portal": _ld_name(json_data.get('publisher')),
        "Datum Objave": _format_date(json_data.get('datePublished')),
        "Rubrika": kat,
        "Nadnaslov": nadnaslov,
        "Naslov": naslov,
        "Podnaslov": podnaslov,
        "Link": link,
        "Autori": _ld_name(json_data.get('author')),
        "Sadržaj": sadrzaj,
    }


all_data = []

for url in article_links:
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        page = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # One unreachable or invalid URL must not discard the records that
        # were already collected; report it and move on.
        print(f"Preskačem {url}: {exc}")
        continue
    all_data.append(_parse_article(page.text, url))

# Build a table from the collected records in `all_data`.
df = pd.DataFrame(data=all_data)

# First 55 rows. NOTE(review): the result is neither assigned nor printed here.
df.head(n=55)

# Export as JSON Lines (one record per line, non-ASCII characters kept as-is).
df.to_json(
    'KlixScrap.json',
    force_ascii=False,
    lines=True,
    orient='records',
)

# Export as an Excel sheet without the index column.
df.to_excel(
    'KlixScrap.xlsx',
    index=False,
)
