In [26]:
!pip install PyMuPDF fitz openai tiktoken numpy pandas umap-learn 

Collecting nbformat
  Using cached nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat)
  Using cached fastjsonschema-2.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting jsonschema>=2.6 (from nbformat)
  Using cached jsonschema-4.22.0-py3-none-any.whl.metadata (8.2 kB)
Collecting attrs>=22.2.0 (from jsonschema>=2.6->nbformat)
  Using cached attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting jsonschema-specifications>=2023.03.6 (from jsonschema>=2.6->nbformat)
  Using cached jsonschema_specifications-2023.12.1-py3-none-any.whl.metadata (3.0 kB)
Collecting referencing>=0.28.4 (from jsonschema>=2.6->nbformat)
  Using cached referencing-0.35.1-py3-none-any.whl.metadata (2.8 kB)
Collecting rpds-py>=0.7.1 (from jsonschema>=2.6->nbformat)
  Using cached rpds_py-0.18.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.1 kB)
Using cached nbformat-5.10.4-py3-none-any.whl (78 kB)
Using cached fastjsonschema-2.19.1-py3-none-any.whl (23 kB)
Using c

In [49]:
from pdftools import *
from llm import clear_history, complete, print_history, client, num_tokens
import os
import shutil
import numpy as np
import pandas as pd
import umap

# Directory that receives every generated file: extracted text, summaries,
# vectors.tsv / vectors.meta.tsv and the 2-D projection.
OUTPUT_DIR = "../html/data/"
# When True, an existing OUTPUT_DIR is deleted before anything else runs.
CLEAN = False
# Per party: [first, last] page number (both inclusive) to summarize.
# summarize_file skips every page outside this range.
START_STOP = {
    "fpö": [4, 15],
    "grüne": [4, 106],
    "kpö": [4, 27],
    "neos": [4, 39],
    "övp": [3, 56],
    "spö": [4, 26]
}

def summarize_file(party, filename, outputfile):
    """Summarize the in-range pages of a page-delimited text file.

    The text read from ``filename`` is split on the ``'========== PAGE '``
    marker. Each chunk is expected to start with its page number, followed by
    a newline and the page text. Pages whose number lies outside
    ``START_STOP[party]`` (inclusive bounds), or whose text is blank, get an
    empty summary; every other page is sent to ``complete`` after the chat
    history has been cleared. Every page, skipped or not, is written to
    ``outputfile`` under the same page marker.

    Args:
        party: Key into ``START_STOP``.
        filename: Path of the page-delimited text file to read.
        outputfile: Path the concatenated summaries are written to.

    Raises:
        KeyError: If ``party`` has no entry in ``START_STOP``.
        ValueError: If a page marker is not followed by an integer.
    """
    file_content = read_file(filename)
    pages = file_content.split('========== PAGE ')
    summaries = []
    start_stop = START_STOP[party]
    first_page, last_page = start_stop[0], start_stop[1]
    total_pages = len(pages) - 1

    # pages[0] is whatever precedes the first marker, so it is not a page.
    for page in pages[1:]:
        # partition() instead of split('\n', 1): a marker with nothing after it
        # (e.g. the last page of a file without a trailing newline) yields an
        # empty page text instead of failing to unpack.
        page_number, _, page_text = page.partition('\n')
        print(f"Page {page_number}/{total_pages}")
        page_index = int(page_number)
        if page_index < first_page or page_index > last_page or not page_text.strip():
            print("Skipping")
            summary = ''
        else:
            # Each page is summarized independently of the previous ones.
            clear_history()
            # The prompt stays flush-left because everything between the triple
            # quotes, indentation included, is part of the string that is sent.
            # NOTE(review): 4096 is complete()'s second argument, presumably a
            # token limit; confirm against llm.complete.
            summary = complete(f"""
Dieser Text ist Teil eines Wahlprogrammers. Extrahiere die Schlüsselpunkte des Textes, der durch 3 Backticks delimitiert ist, mit einem speziellen Fokus auf Forderungen bzw. Plännen der Partei. Inkludiere Zahlen zu den Forderungen und Plännen, so vorhanden:
            
```
{page_text}
```
        """, 4096)
        summaries.append(f'========== PAGE {page_number}\n{summary}\n')

    summary = ''.join(summaries)
    write_file(outputfile, summary)

def vectorize_file(filename, label, max_batch_tokens=8000, model="text-embedding-3-small"):
    """Embed every non-empty line of a file and append the results via save_vectors.

    Lines are stripped. A line starting with ``"======"`` is treated as a page
    marker: it increments the running page counter and is not embedded. Blank
    lines are dropped. The remaining lines are grouped into batches whose
    summed ``num_tokens`` stays within ``max_batch_tokens``; each batch is sent
    to the embeddings endpoint in one request and passed to ``save_vectors``
    together with the page counter value recorded for each line.

    Args:
        filename: Path of the text file to embed.
        label: Label forwarded to ``save_vectors`` for every line.
        max_batch_tokens: Token budget per embeddings request. A single line
            that exceeds the budget is still sent, alone in its own batch.
        model: Embedding model name passed to ``client.embeddings.create``.
    """
    def flush(lines, pages):
        """Embed one batch and persist vectors plus metadata."""
        response = client.embeddings.create(input=lines, model=model)
        save_vectors(response, label, lines, pages)

    batch_lines = []
    batch_pages = []
    batch_tokens = 0
    page = 0

    for raw_line in read_file(filename).splitlines():
        line = raw_line.strip()
        if line.startswith("======"):
            page += 1
            continue
        if not line:
            continue

        # Tokens are only counted for lines that are actually embedded.
        line_tokens = num_tokens(line)
        # Only flush a non-empty batch: when the very first kept line is
        # already over budget there is nothing to send yet, and an empty
        # `input` list must not reach the embeddings endpoint.
        if batch_lines and batch_tokens + line_tokens > max_batch_tokens:
            flush(batch_lines, batch_pages)
            batch_lines = []
            batch_pages = []
            batch_tokens = 0

        batch_lines.append(line)
        batch_pages.append(page)
        batch_tokens += line_tokens

    # Send whatever is left after the last full batch.
    if batch_lines:
        flush(batch_lines, batch_pages)

def save_vectors(response, label, lines, pages, output_dir=None):
    """Append one batch of embeddings and its metadata to the TSV files.

    ``vectors.tsv`` receives one row per item in ``response.data``, holding the
    tab-separated components of ``item.embedding``. ``vectors.meta.tsv``
    receives one row per entry of ``lines`` with three tab-separated columns:
    ``label``, the matching entry of ``pages``, and ``label`` immediately
    followed by the line text.

    Args:
        response: Object whose ``data`` items each expose an ``embedding``
            sequence.
        label: Label written into the first and third metadata column.
        lines: Texts of this batch, in the same order as ``response.data``.
        pages: Page value for each entry of ``lines``.
        output_dir: Directory to write to; defaults to ``OUTPUT_DIR``.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    # Explicit UTF-8: labels and statements contain umlauts, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(os.path.join(output_dir, "vectors.tsv"), 'a', encoding="utf-8") as f:
        for item in response.data:
            f.write('\t'.join(map(str, item.embedding)) + '\n')

    with open(os.path.join(output_dir, "vectors.meta.tsv"), 'a', encoding="utf-8") as f:
        for i, line in enumerate(lines):
            # A tab inside the text would add a column and shift the row.
            text = line.replace('\t', ' ')
            # NOTE(review): the third column is label + text with no separator
            # in between; kept as-is because the consumer of this file is not
            # visible here. Confirm this is intended.
            f.write(label + "\t" + str(pages[i]) + "\t" + label + text + "\n")

def convert():
    """Run extraction, summarization and vectorization for every PDF in ``files``.

    For each PDF the extracted text is written to ``<OUTPUT_DIR><party>.txt``,
    where ``party`` is the PDF's file name without extension. A summary is
    generated only when ``<OUTPUT_DIR><party>-summary.txt`` does not exist yet;
    the summary file is then vectorized with ``party`` as its label.
    """
    for pdf_path in files:
        extracted_text = convert_pdf(pdf_path)
        party, _extension = os.path.splitext(os.path.basename(pdf_path))
        text_path = f"{OUTPUT_DIR}{party}.txt"
        summary_path = f"{OUTPUT_DIR}{party}-summary.txt"

        write_file(text_path, extracted_text)

        # An existing summary is reused rather than regenerated.
        summary_missing = not os.path.exists(summary_path)
        if summary_missing:
            summarize_file(party, text_path, summary_path)

        vectorize_file(summary_path, party)

def project():
    """Reduce the stored vectors to two dimensions and save the coordinates.

    Reads ``vectors.tsv`` from ``OUTPUT_DIR`` (tab-separated, no header row),
    fits a 2-component UMAP with cosine metric and a fixed random seed, and
    writes the result tab-separated to ``projection-2d.tsv`` in ``OUTPUT_DIR``.
    """
    vectors = pd.read_csv(f"{OUTPUT_DIR}vectors.tsv", sep='\t', header=None)

    reducer = umap.UMAP(
        n_components=2,
        n_neighbors=10,
        n_epochs=500,
        random_state=42,
        metric="cosine",
    )
    coordinates = reducer.fit_transform(vectors)

    np.savetxt(f"{OUTPUT_DIR}projection-2d.tsv", coordinates, delimiter='\t')

# PDFs to process. The file name without extension is used as the party key
# (see START_STOP) and as the prefix of the generated files.
files = ["data/fpö.pdf", "data/grüne.pdf", "data/kpö.pdf", "data/neos.pdf", "data/övp.pdf", "data/spö.pdf"]
# Set to True to rebuild vectors.tsv / vectors.meta.tsv from the PDFs before
# projecting. With False, project() runs on the existing vectors.tsv.
REBUILD_VECTORS = False

if CLEAN and os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
# makedirs rather than mkdir: OUTPUT_DIR is a nested path, and mkdir fails
# when its parent directory does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

if REBUILD_VECTORS:
    # save_vectors only appends, so output from an earlier run must be removed
    # first or every row would be duplicated.
    for stale_name in ("vectors.tsv", "vectors.meta.tsv"):
        stale_path = OUTPUT_DIR + stale_name
        if os.path.exists(stale_path):
            os.remove(stale_path)
    # Only the metadata file gets a header row; vectors.tsv is read back with
    # header=None in project().
    with open(OUTPUT_DIR + "vectors.meta.tsv", 'a', encoding="utf-8") as f:
        f.write("party\tpage\tstatement\n")
    convert()

project()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

