In [1]:
import asyncio
import os
import re
import json
from copy import copy, deepcopy
from pathlib import Path
from pprint import pprint, PrettyPrinter
from time import time, sleep
from typing import List, Dict
from uuid import uuid4
from collections import defaultdict

import evaluate
import openai
import requests
import tiktoken
from bs4 import BeautifulSoup, Comment
from doctran import Doctran, ExtractProperty
from dotenv import load_dotenv, find_dotenv
from evaluate import load
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)
from langchain.llms import OpenAI
from langchain.text_splitter import (
    MarkdownTextSplitter,
    MarkdownHeaderTextSplitter,
    LineType,
    RecursiveCharacterTextSplitter,
)
# Load model directly
from transformers import AutoProcessor, AutoModelForTokenClassification
from loguru import logger
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from pdfminer.high_level import extract_text

from main import (
    divide_sections_if_too_large,
    extract_plan_and_content_wikipedia,
    compare_documents_content,
    compare_documents_sections,
    extract_plan_and_content_patent,
    extract_plan_and_content_arxiv,
    load_arxiv_paper,
)

import pytesseract

# Load variables from the nearest .env file first, so that both TESSERACT_CMD
# and OPENAI_API_KEY below may be supplied through it.
_ = load_dotenv(find_dotenv())

# Path of the tesseract executable used by pytesseract. Set the TESSERACT_CMD
# environment variable to override it; the fallback is the Windows install
# path that used to be hard-coded here.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract"
)

# os.getenv returns None when the key is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Two streaming chat-model handles, differing only in the model name.
llm_default = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True)
llm_16k = ChatOpenAI(model_name="gpt-3.5-turbo-16k", streaming=True)

def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Count the tokens that `string` encodes to.

    `encoding_name` is first passed to `tiktoken.get_encoding`; if that raises
    ValueError, the same value is retried as a model name through
    `tiktoken.encoding_for_model`.

    Returns:
        The length of the encoded token sequence.
    """
    try:
        tokenizer = tiktoken.get_encoding(encoding_name)
    except ValueError:
        # Not accepted as an encoding name: treat it as a model name instead.
        tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))

def convert_to_markdown(article_dict):
    """Render a heading -> content mapping as a single markdown string.

    Each key is expected to look like 'h3 Example': the character at index 1 is
    read as the heading level and everything from index 3 onward is the title.
    Sections are emitted in the mapping's iteration order, each as the heading
    line, a blank line, the content, and another blank line.

    Returns:
        The concatenated markdown text ('' for an empty mapping).
    """
    sections = []
    for raw_heading, body in article_dict.items():
        hashes = "#" * int(raw_heading[1])  # 'h3 ...' -> '###'
        heading_text = raw_heading[3:]      # drop the 'hN ' prefix
        sections.append(f"{hashes} {heading_text}\n\n{body}\n\n")
    return "".join(sections)


def truncated_pprint(obj, N=5, max_str_len=125):
    """Pretty print an object with long lists and strings shortened, for easier
    viewing of plan_json objects.

    Lists and dicts are walked recursively (dict insertion order is kept).

    Args:
        obj: The object to print.
        N: Maximum number of items kept from each list; a '...' marker item is
            appended when a list was cut. Pass None to disable all truncation
            (lists and strings).
        max_str_len: Maximum number of characters kept from each string; '...'
            is appended when a string was cut. Defaults to 125, the limit that
            was previously hard-coded. Pass None to leave strings untouched.

    Returns:
        None. The truncated copy is written with `pprint`.
    """
    def truncate(item):
        if N is None:
            return item
        if isinstance(item, list):
            return item[:N] + (['...'] if len(item) > N else [])
        if isinstance(item, str) and max_str_len is not None:
            return item[:max_str_len] + ('...' if len(item) > max_str_len else '')
        return item

    def trunc_recursive(item):
        if isinstance(item, list):
            # Cut the list first, then recurse into the surviving items.
            return [trunc_recursive(i) for i in truncate(item)]
        if isinstance(item, dict):
            return {k: trunc_recursive(v) for k, v in item.items()}
        return truncate(item)

    pprint(trunc_recursive(obj), sort_dicts=False)

# Sample payload for trying out truncated_pprint by hand: one long list, one
# long string, and a nested dict holding one of each.
data = dict(
    long_list=list(range(100)),
    long_string='a' * 100,
    nested=dict(
        nested_list=list(range(50)),
        nested_string='b' * 50,
    ),
)

# truncated_pprint(data, 5)


# Load the processor and the token-classification model from the same
# checkpoint. The captured output below shows this downloads the checkpoint
# files (including a ~501M pytorch_model.bin).
# NOTE(review): neither `processor` nor `model` is referenced again in the
# visible cells — confirm they are still needed before paying for the download.
processor = AutoProcessor.from_pretrained("nielsr/layoutlmv3-finetuned-funsd")
model = AutoModelForTokenClassification.from_pretrained("nielsr/layoutlmv3-finetuned-funsd")

Downloading (…)rocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'LayoutLMv3TokenizerFast'.


Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [None]:
# Run the project's test entry point from the local `tests` module.
# The captured output below shows it logging a self-comparison of
# "Dual-phase evolution" and downloading tokenizer/model files.
from tests import run_tests

run_tests()

[32m2023-08-16 14:28:12.872[0m | [1mINFO    [0m | [36mmain[0m:[36m_compare_documents[0m:[36m629[0m - [1m
	Starting to compare two documents on section:
		ID: ffb4e5bd-d27b-44e2-a56f-da9090853629 Title: Dual-phase evolution
		ID: ffb4e5bd-d27b-44e2-a56f-da9090853629 Title: Dual-phase evolution[0m


Downloading builder script:   0%|          | 0.00/6.63k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

[32m2023-08-16 14:28:15.831[0m | [1mINFO    [0m | [36mmain[0m:[36m_compare_documents[0m:[36m642[0m - [1m
	ID: ffb4e5bd-d27b-44e2-a56f-da9090853629 Title: Dual-phase evolution has 14 sections.
	ID: ffb4e5bd-d27b-44e2-a56f-da9090853629 Title: Dual-phase evolution has 14 sections.[0m


Loading tokenizer


Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing text...
Loading tokenizer
Loading model


Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

In [None]:
# Compare a saved document against itself (both paths are identical).
# The following cell asserts that every 'plan_total_similarity' value of this
# result is approximately 1.0.
result = compare_documents_sections(
    'output/wikipedia/Dual-phase evolution.json',
    'output/wikipedia/Dual-phase evolution.json',
)
result

In [None]:
def is_approximately_equal(x, y, epsilon=1e-10):
    """Return True when `x` and `y` differ by strictly less than `epsilon`."""
    gap = abs(x - y)
    return gap < epsilon

# Every value under result['plan_total_similarity'] must be within the default
# tolerance (1e-10) of 1.0. This is meant for the self-comparison `result`
# (same document on both sides); it depends on which cell last assigned `result`.
for value in result['plan_total_similarity'].values():
    assert is_approximately_equal(value, 1.0)

In [None]:
# Compare two different saved documents and display the result.
# This rebinds `result`, replacing the self-comparison value.
result = compare_documents_sections(
    "output/wikipedia/Dual-phase evolution.json",
    "output/wikipedia/Climate Change.json",
)
result

In [None]:
# NOTE(review): same call and arguments as the preceding cell, only without
# displaying the result — likely a leftover duplicate.
result = compare_documents_sections(
    "output/wikipedia/Dual-phase evolution.json",
    "output/wikipedia/Climate Change.json",
)

In [None]:
# Display whatever comparison result was assigned most recently.
result

In [None]:
# List every entry directly inside data/patents (non-recursive glob).
list(Path('data/patents').glob('*'))

In [None]:
# Extract the plan/content structure for one arXiv PDF.
# The call is awaited at top level, so this cell needs a notebook/IPython
# kernel that supports top-level `await`.
path = 'data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf'
plan_json = await extract_plan_and_content_arxiv(path)

In [None]:
# Print plan_json with the default truncation settings.
truncated_pprint(plan_json)

In [None]:
# Load the same paper as a dict and pass it through
# divide_sections_if_too_large (awaited at top level).
# Note that `article_dict` is rebound to the divided result, so the loader's
# original output is no longer available afterwards.
path = 'data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf'
article_dict = load_arxiv_paper(path)
article_dict = await divide_sections_if_too_large(article_dict, doc_type="arxiv")

In [None]:
# Inspect which keys the current article_dict contains.
article_dict.keys()

In [None]:
# Direct PDF URLs of the arXiv papers to fetch; a later cell passes each one
# to download_arxiv_pdf.
arxiv_papers = [
    "https://arxiv.org/pdf/2307.04438.pdf",
    "https://arxiv.org/pdf/2306.14697.pdf",
    "https://arxiv.org/pdf/2302.09051.pdf",
    "https://arxiv.org/pdf/2305.10091.pdf",
    "https://arxiv.org/pdf/2305.17474.pdf",
    "https://arxiv.org/pdf/2306.16960.pdf",
    "https://arxiv.org/pdf/2305.20069.pdf",
    "https://arxiv.org/pdf/2306.08451.pdf",
    "https://arxiv.org/pdf/2306.17003.pdf",
    "https://arxiv.org/pdf/2307.07573.pdf",
]

In [None]:
import requests
import re
import os

def sanitize_filename(filename):
    """Turn an arbitrary value into a string usable as a file name.

    The value is stringified, stripped of surrounding whitespace, and its
    spaces become underscores. After that, every character that is not a word
    character (alphanumeric or underscore), a hyphen, or a period is removed.
    """
    underscored = str(filename).strip().replace(' ', '_')
    disallowed = re.compile(r'(?u)[^-\w.]')
    return disallowed.sub('', underscored)

def get_arxiv_metadata(arxiv_id, timeout=30):
    """Fetch metadata for the given arXiv ID.

    Args:
        arxiv_id: Identifier placed in the `id_list` query parameter of the
            arXiv export API.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout `requests.get` can block indefinitely.

    Returns:
        A dict with a single key 'title': the text of the first
        `<title>...</title>` element found in the response, or None when no
        such element is present. The text is returned exactly as it appears in
        the response (no whitespace or entity normalisation).

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within `timeout`.
    """
    url = f'https://export.arxiv.org/api/query?id_list={arxiv_id}'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.text

    # Use regex to extract the title. There are better ways (like parsing XML),
    # but this is simple and should work for our purpose. Only a bare
    # `<title>` tag (no attributes) can match this pattern.
    match = re.search(r'<title>([^<]+)</title>', data)
    title = match.group(1) if match else None
    return {'title': title}

def download_arxiv_pdf(arxiv_url, timeout=60):
    """Given an arxiv_url, download the PDF to the data/arxiv directory.

    The file is named after the paper title (sanitized) when the metadata
    lookup yields one, otherwise after the arXiv ID taken from the URL.

    Args:
        arxiv_url: Direct URL of the PDF; its last path segment, minus
            '.pdf', is used as the arXiv ID.
        timeout: Seconds to wait for the PDF download before giving up.

    Raises:
        requests.HTTPError: if the download answers with an error status.
            Nothing is written to disk in that case, so an error page is never
            saved under a '.pdf' name.
        requests.Timeout: if the server does not answer within `timeout`.
    """
    arxiv_id = arxiv_url.split('/')[-1].replace('.pdf', '')
    metadata = get_arxiv_metadata(arxiv_id)

    if metadata.get('title'):
        filename = sanitize_filename(metadata['title']) + '.pdf'
    else:
        filename = f"{arxiv_id}.pdf"

    # Fetch and validate the response before touching the file system.
    response = requests.get(arxiv_url, timeout=timeout)
    response.raise_for_status()

    arxiv_dir = Path('data/arxiv')
    arxiv_dir.mkdir(parents=True, exist_ok=True)

    output_file = arxiv_dir / filename
    with open(output_file, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded to {output_file}")

# Example usage: only the URL is assigned here; the download call itself is
# left commented out so running this cell does not hit the network.
arxiv_url = 'https://arxiv.org/pdf/2306.14697.pdf'
# download_arxiv_pdf(arxiv_url)


In [None]:
# Download every listed paper into data/arxiv. Each call performs two HTTP
# requests (metadata lookup, then the PDF itself), so this cell needs network
# access and re-downloads everything each time it is run.
for paper in arxiv_papers:
    download_arxiv_pdf(paper)

In [None]:
# Four PDFs under data/arxiv used as working examples in the cells below.
doc = Path('data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf')
doc_2 = Path('data/arxiv/A_Survey_on_Blood_Pressure_Measurement_Technologies_Addressing__Potential_Sources_of_Bias.pdf')
doc_3 = Path('data/arxiv/A_survey_on_algebraic_dilatations.pdf')
doc_4 = Path('data/arxiv/A_survey_on_the_complexity_of_learning_quantum_states.pdf')

# Plain-text extraction of each PDF, in the same order as the paths above.
text = extract_text(doc)
text_2 = extract_text(doc_2)
text_3 = extract_text(doc_3)
text_4 = extract_text(doc_4)

In [None]:
# Derive a readable title from the PDF file name: take the name without its
# extension and turn underscores into spaces.
title = doc.stem.replace('_', ' ')
title

In [None]:
# The pattern searches for "abstract" followed by any content.
# Then, it looks for one of the potential following sections:
# "I. Introduction", "1. Introduction", or "Contents".
# We use a positive lookahead (?=...) to assert that the introduction or contents
# pattern exists, but we don't include it in the main match.
pattern = r'abstract(.*?)(?=(i\. introduction|1\. introduction|contents))'

# Defaults used when no abstract is found, so the cells below that slice
# `text` with abstract_end still run on a fresh kernel.
abstract = ''
abstract_start = abstract_end = 0

# Search the original text case-insensitively instead of searching
# text.lower(): the offsets are used to slice `text`, and lower() is not
# guaranteed to preserve string length for every Unicode character. This also
# keeps the extracted abstract in its original casing.
# The re.DOTALL flag allows the . in the pattern to match newline characters.
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)

if match:
    abstract_start = match.start()
    abstract_end = match.end()
    abstract = match.group(1).strip()  # Extracted abstract content
    print(abstract)
else:
    print("No abstract section found; abstract is empty and abstract_end is 0.")

In [None]:
# Locate the references section: the LAST line ending in "references"
# (any casing) marks its start, and everything from there to the end of the
# document is kept.
pattern = r'references\n'

# Match case-insensitively on the original text rather than on text.lower(),
# because the offsets are used to slice `text` itself.
matches = list(re.finditer(pattern, text, re.IGNORECASE))

# Defaults for a document without a references heading: an empty references
# string and offsets at the end of the text, so text[abstract_end:reference_start]
# in the following cells still works and simply runs to the end.
references = ''
reference_start = reference_end = len(text)
if matches:
    final_match = matches[-1]
    reference_start = final_match.start()
    reference_end = final_match.end()
    references = text[reference_start:]
print(references[:250])

In [None]:
# Body of the paper: everything between the end of the abstract match and the
# start of the references section. Relies on `abstract_end` and
# `reference_start` set by the two preceding cells.
content = text[abstract_end:reference_start]
content

In [None]:
# Assemble the hand-extracted pieces into a section-name -> text mapping and
# preview it. This rebinds `article_dict`, replacing any earlier value.
article_dict = {
    'Title': title,
    'Abstract': abstract,
    'Content': content,
    'References': references,
}
truncated_pprint(article_dict)

In [None]:
# Pass the hand-built dict through divide_sections_if_too_large
# (top-level await: needs an async-capable kernel).
split_dict = await divide_sections_if_too_large(article_dict, doc_type='arxiv')

In [None]:
# Show the section names present after dividing.
pprint(list(split_dict.keys()))

In [None]:
# Peek at the first 250 characters from the detected references offset.
text[reference_start: reference_start+250]

In [None]:
# Recompute the body slice (same expression as in the earlier `content` cell)
# and preview only its first 250 characters.
content = text[abstract_end:reference_start]
content[:250]

In [None]:
# Collect the four extracted texts and show the first 500 characters of each.
# repr() is used so that newlines and other whitespace are visible as escapes.
texts = [text, text_2, text_3, text_4]
for t in texts:
    print(repr(t[:500]))
    print()

In [None]:
# Pattern details:
# 1. `^` asserts start of a line (re.MULTILINE).
# 2. `.*?` lazily consumes as little as possible, across newlines (re.DOTALL).
#    There is no capture group; the whole match (group 0) is the title.
# 3. The lookahead `(?=...)` asserts that what directly follows is:
#   a. an email-like pattern, OR
#   b. words like "University", "Department", "Research Center", OR
#   c. a date pattern (e.g., "July 12, 2023").
pattern = r'^.*?(?=\S+@\S+|\bUniversity\b|\bDepartment\b|\bResearch Center\b|\b[\w\s]{3,20}\b \d{1,2}, \d{4})'

# Dedicated loop/result names are used on purpose: iterating with `text` and
# assigning to `title` would rebind the notebook-level `text` to the last
# paper and overwrite the `title` of the paper under study, silently changing
# what every later cell operates on.
for paper_text in texts:
    title_match = re.search(pattern, paper_text, re.DOTALL | re.MULTILINE)
    if title_match:
        candidate_title = title_match.group(0).strip()
        print(candidate_title)
        print('-' * 50)

In [None]:
# Dump the full text currently bound to `text`.
# NOTE(review): the title-detection cell above iterates with `for text in texts`,
# which rebinds `text` to the last element of `texts` once it has run — confirm
# which paper is intended here. This also prints the entire document.
print(text)

In [None]:
# Does the text contain a line ending in "references" (case-insensitive)?
'references\n' in text.lower()

In [None]:
# List every such occurrence; the length of the list is the number of matches.
re.findall(r'references\n', text.lower())

In [None]:
# Show each occurrence of "abstract" with up to 250 characters of context on
# either side. Matching is done on text.lower(), but the context is sliced
# from the original `text` using the same offsets.
pattern = r'abstract'
regions = []
for match in re.finditer(pattern, text.lower()):
    # regions.append(text[match.start():])
    # Clamp the window to the bounds of the text.
    start_index = max(0, match.start() - 250)
    end_index = min(len(text), match.end() + 250)
    regions.append(text[start_index:end_index])

# Print the regions
for i, region in enumerate(regions, 1):
    print(f"Match {i}:\n{region}\n{'-'*50}")

In [None]:
# List every case-insensitive occurrence of "abstract" in the text.
re.findall(r'abstract', text.lower())

In [None]:
# Preview the first 1000 characters of the text.
text[:1000]

In [None]:
# Alternative to the hand-built sections: pass the whole text as a single
# 'article' entry and let divide_sections_if_too_large divide it.
# Rebinds both `article_dict` and `split_dict`.
article_dict = {'article': text}
split_dict = await divide_sections_if_too_large(article_dict, doc_type='arxiv')

In [None]:
# Show the resulting section names; the wider line width reduces wrapping.
pprint(list(split_dict.keys()), width=125)

In [None]:
# Token count of the lowercased text after the last "references\n".
# If that marker never occurs, split() yields one element and this counts the
# whole lowercased text instead.
num_tokens_from_string(text.lower().split('references\n')[-1])

In [None]:
# Token count of each section's value, in split_dict order.
[num_tokens_from_string(x) for x in split_dict.values()]

In [None]:
# Print the section stored under this exact key.
# NOTE(review): the key is hard-coded to one paper's title; presumably the
# lookup fails with KeyError if `split_dict` was built from a different
# paper or the divider names the section differently — verify.
print(split_dict['A Survey of Software-Defined Smart Grid Networks: Security Threats and Defense Techniques'])

In [None]:
# Inspect the section names themselves, longest first. Both the character
# length and the token count are measured on the key, not on its value.
for x in sorted(split_dict.keys(), key=len, reverse=True):
    print(f"Length: {len(x)}, num tokens: {num_tokens_from_string(x)}")
    print(x)
    print()

In [None]:
# Token count of the whole text.
num_tokens_from_string(text)

In [None]:
# NOTE(review): second full dump of `text`; the same statement appears in an
# earlier cell. Consider printing a slice instead.
print(text)

In [None]:
# Render the PDF at `doc` as HTML into an in-memory text buffer.
# NOTE(review): these imports sit mid-notebook; the rest of the imports are in
# the first cell.
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
output_string = StringIO()
# The PDF is opened in binary mode and closed automatically by the `with` block.
with open(doc, 'rb') as fin:
    extract_text_to_fp(fin, output_string, laparams=LAParams(),
                       output_type='html', codec=None)

In [None]:
# Parse the generated HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(output_string.getvalue(), "html.parser")

In [None]:
# Display the parsed document. This renders the entire HTML of the paper.
soup