In [1]:
!pip install pdfminer.six



In [73]:
import io, os, csv
import re
from pdfminer.high_level import extract_text_to_fp, extract_pages
from pdfminer.layout import LAParams

In [71]:
# Pages containing any of these markers are dropped entirely before the text is
# split into paragraphs (plain substring match, case-sensitive).
_EXCLUDED_PAGE_MARKERS = (
    "DAFTAR ISI", "DAFTAR TABEL", "PRAKATA", "PENDAHULUAN", "LAMPIRAN I",
    "Format 1", "Format 4", "Format 5", "Format 6", "PROGRAM PEMAGANGAN",
    "KURIKULUM PROGRAM PEMAGANGAN", "SILABUS PROGRAM PEMAGANGAN", "LAMPIRAN II",
    "LAMPIRAN III", "   1. JADWAL PEMAGANGAN", "2. KEGIATAN HARIAN PESERTA YANG DIKETAHUI PEMBIMBING",
    "TABEL", "LAMPIRAN",
)

# Boilerplate phrases deleted verbatim from every paragraph. They are matched
# after whitespace has been collapsed, so each phrase is written on one line.
_BOILERPLATE_PHRASES = (
    "Dokumen ini tidak dikendalikan jika di unduh/Uncontrolled when downloaded",
    "Ditetapkan di Jakarta pada tanggal 6 Desember 2019 MENTERI PEKERJAAN UMUM DAN PERUMAHAN RAKYAT, ttd",
    "M. BASUKI HADIMULJONO",
    "www.djpp.depkumham.go.id",
    "Format 2:",
    "2. Tahap Pemagangan dan Evaluasi Pelaksanaan Pemagangan 8 9 10 11 SELESAI Persiapan Pemagangan Pelaksanaan Pemagangan Pengawasan Pelaksanaan Pemagangan Evaluasi Pelaksanaan Pemagangan",
    "SALINAN",
)

# A new paragraph starts at a line beginning with a lettered heading ("A. ")
# or at the word "BAB" followed by whitespace.
_SUBHEADING_RE = re.compile(r'^[A-Z]\.\s|\bBAB\s+(?![ivx]+\b)|\bBAB\s+[ivx]+\s+', re.MULTILINE)
_WHITESPACE_RE = re.compile(r'\s+')
# Numbers between dashes, e.g. "- 12 -".
_DASHED_NUMBER_RE = re.compile(r'-\s*\d+\s*-')
# A number immediately followed by the jdih.pu.go.id URL.
_JDIH_FOOTER_RE = re.compile(r'\d+\s*http://jdih\.pu\.go\.id')
# Stand-alone lowercase Roman numerals.
# NOTE(review): this also deletes any stand-alone lowercase token made only of
# the letters i/v/x (for example a list label "i") - confirm that is acceptable.
_ROMAN_NUMERAL_RE = re.compile(r'\b[ivx]+\b')


def _clean_paragraph(raw):
    """Return `raw` with page furniture and boilerplate removed, on one line."""
    cleaned = _WHITESPACE_RE.sub(' ', raw.strip())
    cleaned = _DASHED_NUMBER_RE.sub(' ', cleaned)
    cleaned = _JDIH_FOOTER_RE.sub('', cleaned)
    cleaned = _ROMAN_NUMERAL_RE.sub('', cleaned)
    for phrase in _BOILERPLATE_PHRASES:
        cleaned = cleaned.replace(phrase, "")
    # The removals above leave runs of spaces and dangling blanks behind, so
    # whitespace is normalised once more at the very end.
    return _WHITESPACE_RE.sub(' ', cleaned).strip()


def extract_info(file_path):
    """Extract cleaned, heading-delimited paragraphs from a PDF file.

    The PDF text is extracted with pdfminer, pages containing any entry of
    ``_EXCLUDED_PAGE_MARKERS`` are discarded, and the remaining text is cut at
    every match of ``_SUBHEADING_RE``. Each piece is cleaned by
    ``_clean_paragraph``; pieces that end up empty are omitted.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of non-empty paragraph strings in document order. Every
        paragraph after the first begins at a detected subheading.

    Raises:
        OSError: If the file cannot be opened. Errors raised by pdfminer while
            parsing are propagated unchanged.
    """
    # Collect pdfminer's text output in memory.
    buf = io.StringIO()
    with open(file_path, 'rb') as fp:
        extract_text_to_fp(fp, buf, laparams=LAParams())
    text = buf.getvalue().strip()

    # The extracted text is split into pages on the form feed character.
    kept_pages = [
        page for page in text.split('\x0c')
        if not any(marker in page for marker in _EXCLUDED_PAGE_MARKERS)
    ]
    # Re-join with a newline rather than a form feed: under re.MULTILINE '^'
    # only matches after '\n', so a heading on the first line of a page would
    # otherwise never be recognised as a paragraph boundary.
    text = '\n'.join(kept_pages)

    # Paragraph boundaries: start of text, every subheading start, end of text.
    boundaries = [0]
    boundaries.extend(match.start() for match in _SUBHEADING_RE.finditer(text))
    boundaries.append(len(text))

    paragraphs = []
    for begin, end in zip(boundaries, boundaries[1:]):
        cleaned = _clean_paragraph(text[begin:end])
        # Skip segments that are empty (text starting with a heading) or that
        # consisted only of boilerplate.
        if cleaned:
            paragraphs.append(cleaned)

    return paragraphs

In [74]:
# Directory containing the PDF files. The PDF_DIR environment variable, when
# set, overrides the default local folder so the notebook can run on another
# machine without editing this cell.
pdf_dir_path = os.environ.get(
    "PDF_DIR",
    r"C:\Users\Annisa Rizki\Desktop\Annisa Lianda\Job Freelance\chatbot_using_openai\pdf",
)

# Output CSV file path (relative to the current working directory)
output_csv = "output.csv"

# Rows for the CSV: one [paragraph, filename] pair per extracted paragraph
paragraphs_list = []

# os.listdir returns entries in arbitrary order, so sort for reproducible row
# order; compare the extension case-insensitively so "*.PDF" is not skipped.
pdf_filenames = sorted(
    name for name in os.listdir(pdf_dir_path) if name.lower().endswith(".pdf")
)

for filename in pdf_filenames:
    # Extract paragraphs from the PDF file
    paragraphs = extract_info(os.path.join(pdf_dir_path, filename))

    # Record each paragraph together with the file it came from
    paragraphs_list.extend([paragraph, filename] for paragraph in paragraphs)

# Write the paragraphs to a CSV file. newline='' lets the csv module control
# line endings itself, as the csv documentation recommends.
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Paragraph", "Filename"])
    writer.writerows(paragraphs_list)
    