In [None]:
import fitz  # PyMuPDF
import re
from collections import defaultdict
import json
import operators

In [None]:
def styles_match(style1, style2, size_tolerance=1):
    """Decide whether two span styles are close enough to be treated as one.

    Args:
        style1: Style dict providing a "font" name and a numeric "size".
        style2: Style dict with the same keys as ``style1``.
        size_tolerance: Exclusive upper bound on the absolute font-size
            difference that still counts as a match. Defaults to 1, the
            threshold that used to be hard-coded.

    Returns:
        bool: True when the fonts are equal and the sizes differ by less
        than ``size_tolerance``; False otherwise.
    """
    # Check the font first so sizes are only compared for same-font styles.
    if style1["font"] != style2["font"]:
        return False
    return abs(style1["size"] - style2["size"]) < size_tolerance

# Extract the PDF text as a flat list of segments, where each segment is a run
# of consecutive non-empty spans whose styles match according to styles_match.
segments = []
# Placeholder used until the first real span arrives; its empty text means it
# is never appended to `segments`.
current_segment = {"text": "", "style": None, "bbox": None}

# Open the PDF as a context manager so the document is closed even if
# extraction fails part-way through.
with fitz.open("The Digital Code of DNA.pdf") as doc:  # Replace with the actual PDF file path
    # Process each page in the document
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            # Blocks without a "lines" entry contribute nothing (empty default).
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    text = span["text"].strip()
                    style = {"font": span["font"], "size": span["size"]}
                    if not text:
                        continue  # whitespace-only span: nothing to add
                    if current_segment["style"] and styles_match(current_segment["style"], style):
                        # Same style as previous, merge text
                        current_segment["text"] += " " + text
                    else:
                        # Different style, start a new segment
                        if current_segment["text"]:
                            segments.append(current_segment)
                        current_segment = {"text": text, "style": style}

# Don't forget to add the last segment
if current_segment["text"]:
    segments.append(current_segment)

# Example output: for each sampled segment, print its text, the text split on
# '.', and its style.
for segment in segments[10:13]:  # Display a three-segment sample (indices 10-12), not the first ones
    print(f"Text: {segment['text']}\nsegments: {segment['text'].split('.')}\nStyle: {segment['style']}\n---\n")


### Flag segments by font size

In [None]:
def flag_text(text, style):
    """Classify a segment by its font size.

    Args:
        text: The segment text (not consulted; only the size decides).
        style: Style dict; only ``style['size']`` is read.

    Returns:
        str: 'large' for size >= 20, 'heading' for 10 <= size < 20,
        'small' for 9 <= size < 10, 'text' for 7 < size < 9, and "span"
        for everything else.
    """
    size = style['size']
    # Thresholds are tested from largest to smallest, so each check only
    # needs its lower bound.
    if size >= 20:
        return 'large'
    if size >= 10:
        return 'heading'
    if size >= 9:
        return 'small'
    if size > 7:
        return 'text'
    return "span"

# Spot-check flag_text on three sample sizes; only the "size" key is read.
# Size 9.5 lies in [9, 10), so this prints 'small'.
print(flag_text("Tools to modify DNA", {"size": 9.5}))
# Size ~8.835 lies strictly between 7 and 9, so this prints 'text'.
print(flag_text("The enzymes that function in cells to copy, cut and join DNA molecules were also exploited as key tools for revolutionary new tech- niques in molecular biology, including the cloning of genes and expression of their proteins, and mapping the location of genes on chromosomes. The ability to recreate the process of DNA replication artificially in the laboratory led to the development of two techniques", {
  "size": 8.835000038146973}))
# Size ~5.3 is not above 7, so this prints "span".
print(flag_text("2", {"size": 5.301000118255615}))


In [None]:
# Attach the size-based category returned by flag_text to every segment.
for seg in segments:
    seg['flag'] = flag_text(seg['text'], seg['style'])

In [None]:
# Display the current number of segments.
len(segments)

### Remove the small spans inside text
These spans are actually markers or flags, but we can't handle them right now.

In [None]:
# Keep every segment except a "span" that directly follows a 'text' segment.
# The predecessor is looked up in the list as it was before filtering.
new_segments = [
    seg
    for pos, seg in enumerate(segments)
    if seg['flag'] != "span" or pos == 0 or segments[pos - 1]['flag'] != 'text'
]

segments = new_segments


### Merge adjacent segments with the same font size

In [None]:
# Collapse every run of consecutive segments that share the exact same font
# size into a single segment. The first segment of a run is kept and extended
# in place, so the merged segment retains that segment's style and flag.
new_segments = []
for segment in segments:
    if new_segments and new_segments[-1]['style']['size'] == segment['style']['size']:
        # Span texts were stripped during extraction, so a separator must be
        # re-inserted; plain concatenation would glue two words together.
        new_segments[-1]['text'] += " " + segment['text']
    else:
        new_segments.append(segment)

segments = new_segments
# Report how many segments remain after merging.
print(len(segments))



In [None]:
# Display the current number of segments.
len(segments)

## Add IDs

In [None]:
# Give each segment a sequential integer id equal to its position in the list.
for position, seg in enumerate(segments):
    seg['id'] = position
# Leave the counter at the next free id, as a manual increment loop would.
id_counter = len(segments)

### Generate summaries

In [None]:
# Summarize every segment that is not flagged 'span' and whose text is longer
# than 20 characters; the result is stored under the 'summery' key.
# NOTE(review): operators.summerize is defined outside this notebook --
# presumably it returns the summary as a string; confirm.
for seg in segments:
    if seg['flag'] == 'span' or len(seg['text']) <= 20:
        continue
    print(seg['id'])  # progress output: id of the segment being summarized
    seg['summery'] = operators.summerize(seg['text'])

### Split into smaller chunks

In [None]:
def chop_down(text):
    """Split text into sentence-like fragments at full stops.

    The abbreviations "(ref. " and "(Fig. " are first rewritten without their
    dot so they do not trigger a split. Fragments that are empty or contain
    only whitespace (for example the piece after a trailing full stop) are
    dropped, so every returned fragment carries actual content.

    Args:
        text: The string to split.

    Returns:
        list[str]: The remaining fragments in their original order, without
        the separating dots; surrounding whitespace is left untouched.
    """
    # remove some dots that are not meant as end of text
    for abbreviation, replacement in (("(ref. ", "(ref "), ("(Fig. ", "(Fig ")):
        text = text.replace(abbreviation, replacement)

    # Filter on the stripped form only; the fragment itself is returned as-is.
    return [fragment for fragment in text.split('.') if fragment.strip()]

In [None]:
# Attach sentence-level fragments to every segment that is not flagged 'span'.
for seg in segments:
    if seg['flag'] == 'span':
        continue
    seg['parts'] = chop_down(seg['text'])
    # Segments that received a summary get that summary split as well.
    if 'summery' in seg:
        seg['summery_parts'] = chop_down(seg['summery'])

## Generate voices

In [None]:
# Generate audio for the summary texts.
# NOTE(review): operators.readoutload is defined outside this notebook; it is
# called with (text, output path, False). Every fragment of one summary is
# sent to the same output path -- confirm whether the helper appends to or
# overwrites an existing file.
for segment in segments:
    if 'summery_parts' not in segment:
        continue

    # Whole summary: split at sentence punctuation and drop empty pieces.
    summary_fragments = list(filter(None, re.split(r"[.!?;:]", segment['summery'])))
    print(summary_fragments)
    for fragment in summary_fragments:
        # Single-character fragments are skipped.
        if len(fragment) > 1:
            operators.readoutload(fragment, f"./voices/{segment['id']}.summary.wav", False)

    # Individual summary parts: one numbered output path per part.
    for index, summery_part in enumerate(segment['summery_parts']):
        print(segment['id'], index)
        part_fragments = list(filter(None, re.split(r"[!?;:]", summery_part)))
        print(part_fragments)
        for fragment in part_fragments:
            if len(fragment) > 1:
                operators.readoutload(fragment, f"./voices/{segment['id']}.summary_part.{index}.wav", False)

In [None]:
# Generate audio for the original texts.
# NOTE(review): operators.readoutload is defined outside this notebook; it is
# called with (text, output path, False). Every fragment of one text is sent
# to the same output path -- confirm whether the helper appends to or
# overwrites an existing file.
for segment in segments:
    if 'parts' not in segment:
        continue

    # Whole text: split at sentence punctuation and drop empty pieces.
    text_fragments = list(filter(None, re.split(r"[.!?;:]", segment['text'])))
    print(text_fragments)
    for fragment in text_fragments:
        # Single-character fragments are skipped.
        if len(fragment) > 1:
            operators.readoutload(fragment, f"./voices/{segment['id']}.original.wav", False)

    # Individual parts of the original text: one numbered output path per part.
    for index, original_part in enumerate(segment['parts']):
        print(segment['id'], index)
        part_fragments = list(filter(None, re.split(r"[!?;:]", original_part)))
        print(part_fragments)
        for fragment in part_fragments:
            if len(fragment) > 1:
                operators.readoutload(fragment, f"./voices/{segment['id']}.part.{index}.wav", False)

# Store

In [None]:
# Persist the segment list as JSON indented by four spaces.
with open('./output.json', 'w') as json_file:
    json_file.write(json.dumps(segments, indent=4))