In [1]:
from datetime import datetime
import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup, SoupStrainer
from io import BytesIO
from PyPDF2 import PdfReader
import requests
import csv

async def fetch_news(issuer):
    """Return the news ids linked from an issuer's symbol page on mse.mk.

    Downloads ``https://www.mse.mk/en/symbol/{issuer}``, parses only the
    ``<main id="main">`` element, and collects the links found inside the
    element whose id is ``seiNetIssuerLatestNews``.

    Args:
        issuer: Issuer symbol interpolated into the page URL.

    Returns:
        A list containing the last path segment of each link's ``href``, in
        document order. The list is empty when the page has no
        ``seiNetIssuerLatestNews`` element.
    """
    url = f"https://www.mse.mk/en/symbol/{issuer}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            page_html = await response.text()

    # The body is fully read above, so parsing does not need the connection.
    main_only = SoupStrainer("main", {"id": "main"})
    soup = BeautifulSoup(page_html, "lxml", parse_only=main_only)

    news_section = soup.select_one("#seiNetIssuerLatestNews")
    if news_section is None:
        return []

    news_ids = []
    # "a[href]" skips anchors that have no href attribute; for those,
    # .get("href") would return None and the subsequent .split() would raise.
    for anchor in news_section.select("a[href]"):
        # Strip a trailing "/" so such links do not produce an empty id.
        news_id = anchor["href"].rstrip("/").split("/")[-1]
        if news_id:
            news_ids.append(news_id)

    return news_ids

async def fetch_article(news_id):
    """Fetch a single document from the SEINET public API.

    Requests ``https://api.seinet.com.mk/public/documents/single/{news_id}``
    and reads the ``data`` object of the JSON response. For every attachment
    whose MIME type contains ``application/pdf`` the text is extracted via
    ``fetch_attachment``.

    Args:
        news_id: Document id interpolated into the API URL.

    Returns:
        ``None`` when the response has no (or an empty) ``data`` entry,
        otherwise a tuple ``(seinet_id, content, date, attachment_content)``:

        * ``seinet_id`` - the value of ``documentId``;
        * ``content`` - the value of ``content`` (``""`` if absent or null);
        * ``date`` - ``datetime.date`` parsed from ``publishedDate``, or
          ``None`` if that field is absent or empty;
        * ``attachment_content`` - list of text lines from all PDF attachments.
    """
    url = f"https://api.seinet.com.mk/public/documents/single/{news_id}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            json_data = await response.json()

    article = json_data.get("data")
    if not article:
        return None

    seinet_id = article.get("documentId")
    # `or ""` also covers a key that is present but null.
    content = article.get("content") or ""

    published = article.get("publishedDate")
    if published:
        # Drop everything after the first "." (fractional seconds) before parsing.
        date = datetime.strptime(
            published.split(".")[0], "%Y-%m-%dT%H:%M:%S"
        ).date()
    else:
        date = None

    attachment_content = []
    loop = asyncio.get_running_loop()

    for attachment in article.get("attachments") or []:
        mime_type = (attachment.get("attachmentType") or {}).get("mimeType") or ""
        if "application/pdf" not in mime_type:
            continue
        # fetch_attachment is synchronous (blocking HTTP + PDF parsing); run it
        # in the default executor so the event loop is not blocked meanwhile.
        pdf_text = await loop.run_in_executor(
            None, fetch_attachment, attachment["attachmentId"]
        )
        attachment_content.extend(pdf_text)

    return seinet_id, content, date, attachment_content

def fetch_attachment(attachment_id, timeout=30):
    """Download a PDF attachment and return its text as stripped lines.

    Requests
    ``https://api.seinet.com.mk/public/documents/attachment/{attachment_id}``
    synchronously and extracts the text of every page with ``PdfReader``.

    Args:
        attachment_id: Attachment id interpolated into the API URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one the request could wait indefinitely.

    Returns:
        A list of non-empty, whitespace-stripped text lines from all pages,
        in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://api.seinet.com.mk/public/documents/attachment/{attachment_id}"

    response = requests.get(url, timeout=timeout)
    # Fail on the HTTP error itself rather than handing an error page to PdfReader.
    response.raise_for_status()

    pdf = PdfReader(BytesIO(response.content))
    contents = []

    for page in pdf.pages:
        # Guard against a page for which no text is returned.
        text = page.extract_text() or ""
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if line:
                contents.append(line)

    return contents

async def process_news(issuer, csv_writer):
    """Write one CSV row per retrievable news article of an issuer.

    Looks up the issuer's news ids with ``fetch_news`` and fetches each one
    with ``fetch_article``, one after another. Articles for which
    ``fetch_article`` returns a falsy result are skipped.

    Args:
        issuer: Issuer symbol; also written as the first column of each row.
        csv_writer: Object with a ``writerow`` method that receives the row
            ``[issuer, seinet_id, content, date, attachments]``, where the
            attachment lines are joined with single spaces.
    """
    for article_id in await fetch_news(issuer):
        article = await fetch_article(article_id)
        if not article:
            continue

        doc_id, body, published, pdf_lines = article
        row = [issuer, doc_id, body, published, " ".join(pdf_lines)]
        csv_writer.writerow(row)


# Issuer symbols processed when main() is called without an explicit list.
DEFAULT_ISSUERS = (
    "ADIN", "ALK", "ALKB", "AMEH", "APTK", "ATPP", "AUMK", "BANA",
    "BGOR", "BIKF", "BIM", "CDHV", "CEVI", "CKB", "CKBKO", "DEBA",
    "DIMI", "EDST", "ENER", "EUHA", "EVRO", "FAKM", "FERS", "FROT",
    "FUBT", "GALE", "GDKM", "GECK", "GECT", "GIMS", "GRDN", "GRNT",
    "GRZD", "GTC", "GTRG", "IJUG", "INB", "INPR", "JAKO", "JUSK",
    "KARO", "KJUBI", "KLST", "KMB", "KMPR", "KOMU", "KONF", "KONZ",
    "KPSS", "KVAS", "LHND", "LOTO", "LOZP", "MAGP", "MAKP", "MAKS",
    "MB", "MERM", "MKSD", "MLKR", "MODA", "MPOL", "MPT", "MTUR",
    "MZPU", "NEME", "NOSK", "OILK", "OKTA", "OMOS", "OPFO", "OPTK",
    "ORAN", "OSPO", "PELK", "PKB", "POPK", "PPIV", "PROD", "PTRS",
    "RADE", "REPL", "RIMI", "RINS", "RZEK", "RZIT", "RZIZ", "RZLE",
    "RZLV", "RZTK", "RZUG", "RZUS", "SBT", "SDOM", "SIL", "SKP",
    "SLAV", "SNBTO", "SOLN", "SPAZ", "SPAZP", "SSPR", "STB", "STBP",
    "STIL", "STOK", "TAJM", "TBKO", "TEAL", "TEHN", "TEL", "TETE",
    "TIKV", "TKPR", "TKVS", "TNB", "TRPS", "TRUB", "TSMP", "TSZS",
    "TTK", "TTKO", "UNI", "USJE", "VARG", "VITA", "VROS", "VSC",
    "VTKS", "ZAS", "ZILU", "ZIMS", "ZKAR", "ZPKO", "ZPOG",
)


async def main(issuers=None, output_path="news_data.csv"):
    """Collect news for a list of issuers into a CSV file.

    Opens ``output_path`` for writing (replacing any existing file), writes
    the header row ``Issuer, Seinet ID, Content, Date, Attachments`` and then
    calls ``process_news`` for each issuer in turn, printing a progress line
    before each one.

    Args:
        issuers: Iterable of issuer symbols to process. ``None`` (the
            default) selects ``DEFAULT_ISSUERS``.
        output_path: Path of the CSV file to write; defaults to
            ``"news_data.csv"``.
    """
    if issuers is None:
        issuers = DEFAULT_ISSUERS

    # newline="" lets the csv module control line endings itself.
    with open(output_path, mode="w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Issuer", "Seinet ID", "Content", "Date", "Attachments"])

        # Issuers are processed sequentially, in the given order.
        for issuer in issuers:
            print(f"Processing news for issuer: {issuer}")
            await process_news(issuer, csv_writer)

# Run the scraper. A bare top-level `await` is only accepted where an event
# loop is already running (e.g. an IPython/Jupyter cell, which this appears to
# be); a plain script would need asyncio.run(main()) instead.
await main()


Processing news for issuer: ADIN
Processing news for issuer: ALK
Processing news for issuer: ALKB
Processing news for issuer: AMEH
Processing news for issuer: APTK
Processing news for issuer: ATPP
Processing news for issuer: AUMK
Processing news for issuer: BANA
Processing news for issuer: BGOR
Processing news for issuer: BIKF
Processing news for issuer: BIM
Processing news for issuer: CDHV
Processing news for issuer: CEVI
Processing news for issuer: CKB
Processing news for issuer: CKBKO
Processing news for issuer: DEBA
Processing news for issuer: DIMI
Processing news for issuer: EDST
Processing news for issuer: ENER
Processing news for issuer: EUHA
Processing news for issuer: EVRO
Processing news for issuer: FAKM
Processing news for issuer: FERS
Processing news for issuer: FROT
Processing news for issuer: FUBT
Processing news for issuer: GALE
Processing news for issuer: GDKM
Processing news for issuer: GECK
Processing news for issuer: GECT
Processing news for issuer: GIMS
Processing n