# **Library**

In [14]:
# Import Library
import PyPDF2
from tqdm import tqdm
from docx import Document
from transformers import pipeline
import time
#from google.colab import files
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# **Input**

PDF File

In [15]:
def open_PDF (path) : # Open the PDF file
  # Menginisialisasi string kosong
  concatenated_text = ""

  with open(path, "rb") as file:
      reader = PyPDF2.PdfReader(file)
      pages = reader.pages
      # Extract text from each page
      for i in tqdm(range(len(reader.pages))):
          text = pages[i].extract_text().strip()

          # Remove the references section (assuming it starts with 'References' and ends at the end of the document)
          if "References" in text:
              text = text.split("References")[0]
  # Loop through the pages
  for page in tqdm(pages):
      # Extract the text from the page and remove any leading or trailing whitespace
      text = page.extract_text().strip()

      # Concat the extracted text to the concatenated text
      concatenated_text += text
      # Check token length
  return concatenated_text

Text File

In [16]:
def open_text (path) : # Mebaca file txt dan mengisi variable text
  with open(path, 'r') as file:
      text = file.read()
  return text

Docx File

In [5]:
def open_docx (path) : # Membaca file docx
  doc = Document(path)

  # Extract text
  text = []
  for paragraph in doc.paragraphs:
      text.append(paragraph.text)

  # Join paragraphs into a single string
  document_text = '/n'.join(text)
  return document_text

**Input Classifier**

In [6]:
def input_classifier (Path) : # Menentukan jenis file berdasarkan path yang diberikan
  Jenis_File = Path[-3:]
  # Memanggil function sesuai dengan path file
  if Jenis_File == "pdf" :
    text = open_PDF (Path)
  elif Jenis_File == "txt" :
    text = open_text (Path)
  else :
    text = open_docx (Path)
  return text

# Bart Model (Extractive)

In [7]:
def bart_summarizer (text) :
  pipe = pipeline('summarization', model='facebook/bart-large-cnn')
  inputs = pipe.tokenizer(text, return_tensors="pt", truncation=False)
  summaries = []

  if len(inputs['input_ids'][0]) > 1024:  # Adjust the limit based on model's max tokens
      # Split the text into smaller chunks
      chunk_size = 1024  # Define your chunk size
      chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

      for chunk in chunks:
          pipe_out = pipe(chunk,max_length=130, min_length=30, do_sample=False)
          print (pipe_out[0]['summary_text'])
          summaries.append(pipe_out[0]['summary_text'])
  else:
      # If length is within limit, proceed with summarization
      pipe_out = pipe(text)
      summaries.append(pipe_out[0]['summary_text'])
  paragraph = "/n".join(summaries)

  return paragraph

# Pegasus Model (Abstractive)

In [8]:
def pegasus_summarizer (text) :
  pipe = pipeline('summarization', model ='google/pegasus-cnn_dailymail')
  inputs = pipe.tokenizer(text, return_tensors="pt", truncation=False)
  summaries = []
  if len(inputs['input_ids'][0]) > 1024:  # Adjust the limit based on model's max tokens
      # Split the text into smaller chunks
      chunk_size = 1024  # Define your chunk size
      chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

      for chunk in chunks:
          pipe_out = pipe(chunk,max_length=130, min_length=30, do_sample=False)
          summaries.append(pipe_out[0]['summary_text'])
  else:
      # If length is within limit, proceed with summarization
      pipe_out = pipe(text)
      summaries.append(pipe_out[0]['summary_text'])
  paragraph = "/n".join(summaries)

  return paragraph

# **Output**

Output Txt

In [9]:
def Output_txt(path,Summary_Bart,Summary_Pegasus) :  # Fungsi untuk mengexport summary ke file txt
  # Open the file in write mode (creates the file if it doesn't exist)
  with open(path, 'a') as file:
      # Write the string to the file
      file.write("Summary menggunakan model bart : ")
      file.write(Summary_Bart +"/n")
      file.write("Summary menggunakan model pegasus : ")
      file.write(Summary_Pegasus)
  #files.download(path)

Output PDF

In [10]:
def Output_pdf(path,Summary_Bart,Summary_Pegasus) :  # Fungsi untuk mengexport summary ke file pdf

  def wrap_text(text, width, font, size): # menyesuaikan output pdf agar tidak terpotong
    # Create a list of lines
    lines = []
    words = text.split(' ')
    current_line = ''

    for word in words:
        # Try adding the word to the current line and check its length
        test_line = current_line + ' ' + word if current_line else word
        test_width = canvas.Canvas('', pagesize=letter).stringWidth(test_line, font, size)

        if test_width <= width:
            # If the line fits, add the word to the current line
            current_line = test_line
        else:
            # Otherwise, start a new line
            lines.append(current_line)
            current_line = word

    # Add the last line
    if current_line:
        lines.append(current_line)

    return lines


  pdf_filename = path
  # Mendefinisikan summary bart dan summary pegasus
  my_string1 = "Summary menggunakan model bart : " + Summary_Bart
  my_string2 = "Summary menggunakan model pegasus : " + Summary_Pegasus
  c = canvas.Canvas(pdf_filename, pagesize=letter)

  # Set font and size
  c.setFont("Helvetica", 12)
  # Membuat margin untuk panjang dan lebar
  margin = 40
  width = letter[0] - 2 * margin
  height = letter[1] - 2 * margin

  # Wrap the text
  lines1 = wrap_text(my_string1, width, "Helvetica", 12)
  lines2 = wrap_text(my_string2, width, "Helvetica", 12)

  y_position = height - margin  # Start at the top of the page
  for line in lines1:
      c.drawString(margin, y_position, line)
      y_position -= 14  # Move down for the next line
  for line in lines2:
      c.drawString(margin, y_position, line)
      y_position -= 14  # Move down for the next line
  c.save()


Output Docx

In [11]:
def Output_docx(path,Summary_Bart,Summary_Pegasus) :  # Fungsi untuk mengexport summary ke file docx
  doc = Document()
  # Membuat Heading
  doc.add_heading('Summaries', level=1)
  doc.add_heading('Summary Bart', level=2)
  # Menuliskan summary
  doc.add_paragraph(Summary_Bart)
  # Membuat Heading
  doc.add_heading('Summary Pegasus', level=2)
  # Menuliskan summary
  doc.add_paragraph(Summary_Pegasus)

  # Save the document
  doc.save(path)


Output Docx

In [12]:
#
def output_classifier (Path,Summary_Bart,Summary_Pegasus) :
  Jenis_File = Path[-3:]
  if Jenis_File == "pdf" :
    Output_pdf(Path,Summary_Bart,Summary_Pegasus)
  elif Jenis_File == "txt" :
    Output_txt(Path,Summary_Bart,Summary_Pegasus)
  else :
    Output_docx(Path,Summary_Bart,Summary_Pegasus)

# **Main**

In [None]:
start_time = time.time()  # Record the start time
path = "C:/Users/firmansyah.atmojo/OneDrive - PT. Bumi Serpong Damai Tbk/5 Syllabus/Property Report[1].pdf"
text = input_classifier (path)

word_count = len(text.split())
print (word_count)

# Bart Model
Summary_Bart = bart_summarizer (text)
print(Summary_Bart)
print (len (Summary_Bart))

word_count_bart = len(Summary_Bart.split())
print (word_count_bart)

# Pegasus Model
Summary_Pegasus = pegasus_summarizer (text)

word_count_pegasus = len(Summary_Pegasus.split())
print (word_count_pegasus)

print(Summary_Pegasus)
print (len (Summary_Pegasus))

Output_path = str(input(" Output Path File : "))
output_classifier (Output_path,Summary_Bart,Summary_Pegasus)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Runtime: {elapsed_time} seconds")

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:01<00:00, 12.37it/s]
100%|██████████| 15/15 [00:00<00:00, 16.68it/s]


2727


Device set to use cpu


Malaysia Property Market Report Q1 2023. Malaysia's economy expanded by 8.7% YoY in 2022. Service sector was the main contributor to Malaysia's economic growth.
Malaysia is unlikely to experience a recession. Growing domestic demand and an increase in electrical and electronics, and the tourism sector job openings will help offset the slower external demand.
Demand for rentals decreased by 4.0% QoQ in Q4 2022. The market may see greater optimism following the formation of the new government. The relaxation of the COVID-19 measures in China would boost the economy.
PropertyGuru Malaysia Property Market Report Q1 2023. The asking prices of properties in Malaysia went up by 1.5% QoQ in Q4. The Sale Demand Index declined by 14.8%. The rental market fell by 4.0%.
For the whole of 2022, the asking prices listed on PropertyGuru.com.my went up by 5.0%. The stock market and ringgit rose after the announcement of the Prime Minister. Existing homeowners are likely to increase housing prices.
In 2

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]