# In [None]:
import csv
import io
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

def extract_pdf_text(pdf_url, timeout=30):
    """Download a PDF and return the text of all its pages.

    Args:
        pdf_url: URL of the PDF document to download.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server would block the crawl forever.

    Returns:
        The text of every page that yielded any text, each followed by a
        newline, concatenated in page order. An empty string if no page
        produced text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Downloading PDF
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # pdfplumber needs a file-like object, so wrap the downloaded bytes.
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # extract_text() may return None/"" for pages without a text layer;
        # those pages are skipped.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

def extract_summary(tekst):
    """Return everything after the first "Sažetak" heading in *tekst*.

    The returned text has leading and trailing whitespace stripped.
    ``None`` is returned when the marker "Sažetak" does not occur at all.
    """
    _, marker, remainder = tekst.partition("Sažetak")
    # partition() yields an empty separator when the marker is absent.
    if not marker:
        return None
    return remainder.strip()

def clean_article_links(article_links):
    """Trim article URLs down to ``.../view/<numeric id>``.

    For every link containing ``/view/`` only the run of digits that
    immediately follows the first ``/view/`` is kept; anything after it
    (a further path segment, a query string, a fragment) is dropped.

    Links without ``/view/``, and links where ``/view/`` is not directly
    followed by a digit, are returned unchanged because there is no numeric
    ID to trim to.

    Args:
        article_links: Iterable of URL strings.

    Returns:
        A new list with one entry per input link, in the same order.
    """
    marker = '/view/'
    cleaned_links = []
    for link in article_links:
        base, found, tail = link.partition(marker)
        if not found:
            cleaned_links.append(link)  # Keep it the same if it doesn't have /view/
            continue

        # Length of the leading digit run. Stopping at the first non-digit
        # keeps digits from a later segment or query string out of the ID.
        id_length = 0
        while id_length < len(tail) and tail[id_length].isdigit():
            id_length += 1

        if id_length == 0:
            # No numeric ID: cutting at '/view/' would leave an unusable URL.
            cleaned_links.append(link)
        else:
            cleaned_links.append(base + found + tail[:id_length])
    return cleaned_links

def _matching_links(page_url, marker, timeout):
    """Fetch *page_url* and return its unique absolute links containing *marker*."""
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Resolve relative hrefs against the final (post-redirect) URL; dict keys
    # drop duplicates while keeping the order in which links appear.
    return list(dict.fromkeys(
        urljoin(response.url, a['href'])
        for a in soup.find_all('a', href=True)
        if marker in a['href']
    ))


def get_article_links(archive_url, timeout=30):
    """Collect the article links reachable from a journal archive page.

    The archive page is scanned for links containing ``issue/view``; each of
    those issue pages is then scanned for links containing ``article/view``.

    Args:
        archive_url: URL of the archive page listing the issues.
        timeout: Seconds ``requests`` waits for each page before giving up.

    Returns:
        A list of absolute article URLs without duplicates, in the order
        they are first encountered.

    Raises:
        requests.HTTPError: If any page answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    links = {}
    # Iterate over each issue to get article links; every issue is fetched
    # once even if the archive page links to it several times.
    for issue_url in _matching_links(archive_url, 'issue/view', timeout):
        for article_url in _matching_links(issue_url, 'article/view', timeout):
            links.setdefault(article_url, None)
    return list(links)

archive_url = "https://www.pregled.unsa.ba/index.php/pregled/issue/archive"
article_links = get_article_links(archive_url)
print("Original Links:", article_links)

# Clean the links. Different raw links can reduce to the same cleaned URL,
# so drop duplicates (keeping first-seen order) to avoid scraping an article
# more than once.
article_links = list(dict.fromkeys(clean_article_links(article_links)))
print("Cleaned Links:", article_links)

# Data collection list
all_data = []

def _meta_content(soup, name, default=''):
    """Return the ``content`` of the first ``<meta name=...>`` tag, or *default*."""
    tag = soup.find('meta', attrs={'name': name})
    # find() returns None when the tag is absent; subscripting it would raise.
    if tag is None:
        return default
    return tag.get('content', default)


for url in article_links:
    # Fetching page content; one unreachable article must not abort the run
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("Skipping article:", url, exc)
        continue

    # Parsing HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Data extraction (missing meta tags yield empty values)
    autori = [
        meta['content']
        for meta in soup.find_all('meta', attrs={'name': 'DC.Creator.PersonalName'})
        if meta.has_attr('content')
    ]

    datum_izdanja = _meta_content(soup, 'citation_date')
    # The year is the part of the date before the first '/'
    godina = datum_izdanja.split('/')[0]

    naslov = _meta_content(soup, 'DC.Title')

    stranice = _meta_content(soup, 'DC.Identifier.pageNumber')

    publikacija = _meta_content(soup, 'DC.Source')

    # URL extraction for PDF
    pdf_url = _meta_content(soup, 'citation_pdf_url', default=None)

    # Text extraction for PDF; left as None when there is no PDF link or the
    # download fails, so the metadata row is still recorded
    procisceni = None
    if pdf_url:
        try:
            tekst = extract_pdf_text(pdf_url)
            procisceni = extract_summary(tekst)
        except requests.RequestException as exc:
            print("Could not download PDF:", pdf_url, exc)

    data = {
        'Naslov teksta': naslov,
        'Naslov publikacije': publikacija,
        'Autori': '; '.join(autori),
        'Godina': godina,
        'Stranice': stranice,
        'Tekst': procisceni
    }

    all_data.append(data)
   
df = pd.DataFrame(all_data)
# Preview of the first rows. As a bare expression this has no visible effect
# when run as a script; presumably it was the last line of a notebook cell.
df.head(55)

if all_data:
    # Writing all data to a CSV file
    with open('Metadata.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(all_data[0]))
        writer.writeheader()
        writer.writerows(all_data)

    # Writing all data to a JSON file (one JSON object per line)
    df.to_json('PDFScrap.json', orient='records', lines=True, force_ascii=False)

    # Writing all data into an Excel file
    df.to_excel('ExcelScrap.xlsx', index=False)
else:
    # all_data[0] would raise IndexError, and only after Metadata.csv had
    # already been truncated, so skip the export entirely.
    print("No article data collected; skipping export.")
