In [None]:
!pip install pdfplumber -q
import csv
import io
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pdfplumber
from requests.exceptions import ChunkedEncodingError, ConnectionError, Timeout

def extract_pdf_text(pdf_url, timeout=30):
    """Download a PDF and return the text extracted from its pages.

    Args:
        pdf_url: URL of the PDF document to download.
        timeout: Seconds to wait for the server before giving up. Passed to
            ``requests.get`` so that an unresponsive server cannot block
            the caller indefinitely.

    Returns:
        The text of every page that produced any, each followed by a
        newline, concatenated in page order. An empty string when no page
        produced text.

    Raises:
        requests.exceptions.RequestException: If the download fails, times
            out, or the server answers with an HTTP error status.
    """
    # Downloading PDF
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    page_texts = []
    # Parse from memory; the PDF is never written to disk.
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Skip pages for which no text was extracted.
            if page_text:
                page_texts.append(page_text + "\n")
    return "".join(page_texts)

def extract_summary(tekst):
    """Return the part of *tekst* that follows the first "Sažetak" marker.

    The text after the marker is stripped of leading and trailing
    whitespace. When the marker does not occur, *tekst* is returned
    unchanged.
    """
    _, marker, remainder = tekst.partition("Sažetak")
    if not marker:
        # Marker absent: hand the input back untouched.
        return tekst
    return remainder.strip()

def clean_article_links(article_links):
    """Trim each link to its '/view/' prefix plus the digits that follow.

    For a link containing '/view/', everything up to and including the
    first '/view/' is kept, followed by the digit characters found between
    that marker and the next '/' (or the end of the link). Non-digit
    characters in that segment are dropped. Links without '/view/' are
    passed through unchanged.

    Args:
        article_links: Iterable of link strings.

    Returns:
        A new list with one cleaned link per input link, in input order.
    """
    marker = '/view/'
    result = []
    for href in article_links:
        prefix, found, remainder = href.partition(marker)
        if not found:
            # No '/view/' in this link: keep it as it is.
            result.append(href)
            continue

        # Only the path segment directly after the marker is considered.
        first_segment = remainder.split('/', 1)[0]
        digits = ''.join(filter(str.isdigit, first_segment))
        result.append(prefix + found + digits)
    return result

def get_article_links(archive_url, timeout=10):
    """Collect article links from every issue linked on an archive page.

    Args:
        archive_url: URL of the page listing the issues.
        timeout: Seconds to wait for each HTTP request before giving up, so
            that an unresponsive server cannot block the caller
            indefinitely.

    Returns:
        A list of the distinct ``href`` values containing 'article/view'
        found on the issue pages. The links are gathered in a set, so the
        order of the list is unspecified.

    Raises:
        requests.exceptions.RequestException: If a request fails or times
            out.
    """
    archive_page = requests.get(archive_url, timeout=timeout)
    soup = BeautifulSoup(archive_page.text, 'html.parser')

    # Extract links for each issue in the archive. The same issue can be
    # linked more than once on the page; dict.fromkeys drops the repeats
    # (keeping first-seen order) so each issue page is downloaded only once.
    issue_links = list(dict.fromkeys(
        a['href'] for a in soup.find_all('a', href=True) if 'issue/view' in a['href']
    ))

    links = set()
    # Iterate over each issue to get article links
    for issue_url in issue_links:
        issue_page = requests.get(issue_url, timeout=timeout)
        issue_soup = BeautifulSoup(issue_page.text, 'html.parser')

        # Find all article links within the issue page
        links.update(
            a['href']
            for a in issue_soup.find_all('a', href=True)
            if 'article/view' in a['href']
        )

    return list(links)

# Function to handle retries
def fetch_with_retries(url, retries=3, delay=5, timeout=10):
    """GET *url* in streaming mode, retrying on request errors.

    Args:
        url: Address to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        The ``requests.Response`` of the first successful attempt. It is
        opened with ``stream=True``, so the body has not been read yet.
        ``None`` when every attempt failed (or when ``retries`` is 0).
    """
    # Local import: no `sleep` name is imported in the visible part of the
    # file, so the wait between attempts would otherwise raise NameError.
    import time

    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout, stream=True)
            response.raise_for_status()
            return response
        # RequestException is the base class of ConnectionError,
        # ChunkedEncodingError and Timeout, so it covers all of them.
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            if attempt < retries - 1:
                print(f"Retrying... Attempt {attempt + 1} of {retries}")
                time.sleep(delay)
            else:
                print("Max retries reached. Skipping.")
                return None
    # Only reached when retries <= 0 and the loop body never ran.
    return None

# Collect the raw article links from every issue in the journal archive.
archive_url = "https://www.pregled.unsa.ba/index.php/pregled/issue/archive"
article_links = get_article_links(archive_url)
print("Original Links:", article_links)

# Clean the links. Several raw links can reduce to the same cleaned URL
# (for example one with and one without a trailing path segment), so drop
# the repeats while keeping order to avoid fetching the same page twice.
article_links = list(dict.fromkeys(clean_article_links(article_links)))
print("Cleaned Links:", article_links)

# Scrape every article page: read its metadata from <meta> tags, download
# the linked PDF, and collect one record per article in all_data.
all_data = []
processed_titles = set()

for url in article_links:
    # Fetching page content with retries
    response = fetch_with_retries(url)
    if response is None:
        continue

    # fetch_with_retries opens the response in streaming mode, so the body
    # is only downloaded here. A connection problem at this point must skip
    # the article instead of aborting the whole run.
    try:
        html_content = response.content
    except requests.exceptions.RequestException as e:
        print(f"Error reading {url}: {e}")
        continue

    # Parsing HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Data extraction with error handling
    autori = [meta['content'] for meta in soup.find_all('meta', attrs={'name': 'DC.Creator.PersonalName'})]

    # Get the publication date if available
    datum_izdanja_meta = soup.find('meta', attrs={'name': 'citation_date'})
    datum_izdanja = datum_izdanja_meta['content'] if datum_izdanja_meta else ''
    # The year is taken as the part before the first '/' of the date.
    godina = datum_izdanja.split('/')[0] if datum_izdanja else ''

    # Get the title if available
    naslov_meta = soup.find('meta', attrs={'name': 'DC.Title'})
    naslov = naslov_meta['content'] if naslov_meta else None
    print(naslov)

    # Check for duplicates. Pages without a title are not compared, so that
    # distinct untitled pages are not mistaken for copies of one another.
    if naslov is not None:
        if naslov in processed_titles:
            print(f"Skipping duplicate: {naslov}")
            continue

        # Mark this title as processed
        processed_titles.add(naslov)

    # Get the page number if available
    stranice_meta = soup.find('meta', attrs={'name': 'DC.Identifier.pageNumber'})
    stranice = stranice_meta['content'] if stranice_meta else None

    # Get the publication name if available
    publikacija_meta = soup.find('meta', attrs={'name': 'DC.Source'})
    publikacija = publikacija_meta['content'] if publikacija_meta else None

    # URL extraction for PDF
    pdf_url_meta = soup.find('meta', attrs={'name': 'citation_pdf_url'})
    pdf_url = pdf_url_meta['content'] if pdf_url_meta else None

    # Articles without a PDF link are skipped entirely.
    if pdf_url is None:
        continue

    # Text extraction for PDF if URL exists (an empty URL yields empty text).
    # A failed download skips this article rather than losing every record
    # collected so far.
    try:
        tekst = extract_pdf_text(pdf_url) if pdf_url else ''
    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF {pdf_url}: {e}")
        continue
    procisceni = extract_summary(tekst)

    # Append the data to the list
    data = {
        'Naslov teksta': naslov,
        'Naslov publikacije': publikacija,
        'Autori': '; '.join(autori),
        'Godina': godina,
        'Stranice': stranice,
        'Sadržaj': procisceni
    }

    all_data.append(data)
# Export the collected records as CSV, JSON lines and Excel.
df = pd.DataFrame(all_data)
# The result of head() is discarded here; presumably left over from a
# notebook preview.
df.head(251)

# Writing all data to a CSV file
# Field names come from the DataFrame columns rather than all_data[0], so a
# run that collected nothing does not fail with IndexError.
fieldnames = list(df.columns)
with open('Metadata.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_data)
# Writing all data to a JSON file
df.to_json('PDFScrap.json', orient='records', lines=True, force_ascii=False)
# Writing all data into an EXCEL file
df.to_excel('ExcelScrap.xlsx', index=False)