In [26]:
import time,re
from concurrent.futures import ThreadPoolExecutor, as_completed
import pdfplumber, random, string
from openai.error import RateLimitError
import os
import openai
from tqdm import tqdm

openai.api_key = os.getenv('OPENAI_API_KEY')
MAX_TOKENS = 4097
OVERLAP = 200  # Number of tokens to overlap between chunks

In [27]:
def generate_random_string(length=10):
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for _ in range(length))

In [28]:
def extract_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        text_list = []
        for page in pdf.pages:
            text_list.append(page.extract_text().lower())
    return ' '.join(text_list)

In [29]:
def summarize_text(text):
    summary_chunks=[]
    return ' '.join(summary_chunks)

In [30]:
import tinycss2.tokenizer


class Session:
    def __init__(self):
        self.conversation_history = [
            {"role": "system", "content": "You are a helpful assistant."},
        ]

    def add_message(self, role, content):
        self.conversation_history.append({"role": role, "content": content})

    def create_responses(self, chunk, prompt):
        self.add_message("user", chunk)
        self.add_message("user", prompt)

        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=self.conversation_history[-(MAX_TOKENS//2):]  # Limit to the last part of conversation that fits in MAX_TOKENS
            )
            response_content = response['choices'][0]['message']['content'].strip()
            self.add_message("assistant", response_content)
            return response_content
        except RateLimitError:
            print("Rate limit exceeded. Retrying after delay...")
            time.sleep(5)  # Delay for 5 seconds
            return self.create_responses(chunk, prompt)  # Retry

    def answer_prompt(self, text, prompt):
        chunks = []
        current_chunk = ''

        # Split the text into words
        words = text.split(' ')

        for word in words:
            # If adding the next word doesn't exceed the maximum length, add the word to the chunk
            if len(tinycss2.tokenizer.encode(current_chunk + ' ' + word)) < MAX_TOKENS:
                current_chunk += ' ' + word
            else:
                # If adding the next word would exceed the maximum length, save the current chunk and start a new one
                chunks.append(current_chunk)
                current_chunk = word

        # Make sure to add the last chunk if it's non-empty
        if current_chunk:
            chunks.append(current_chunk)

        responses = []

        # use ThreadPoolExecutor to process chunks in parallel
        with ThreadPoolExecutor() as executor:
            # create a tqdm progress bar
            with tqdm(total=len(chunks), desc="Answering", unit="chunk") as pbar:
                futures = [executor.submit(self.create_responses, chunk, prompt) for chunk in chunks]
                for future in as_completed(futures):
                    try:
                        response = future.result()
                        responses.append(response)
                    except Exception as exc:
                        print('A chunk generated an exception: %s' % exc)
                    # increment the progress bar
                    pbar.update()

        # return the first response
        return responses[0]

    def interactive_session(self, file_path):
        text = extract_text(file_path)
        last_action = None  # variable to remember the last action

        while True:
            action = last_action if last_action else input(
                "Type 'ask' to ask a question, 'summarize' to summarize the PDF, or 'exit' to quit: ")

            if action.lower() == 'exit':
                break
            elif action.lower() == 'ask':
                prompt = input("Ask your question: ")
                response = self.answer_prompt(text, prompt)
                print(f"AI's response: {response}")

                # Ask user for desired file type
                file_type = input("Please enter the desired file type (txt, doc, csv, etc.): ")
                filename = sanitize_filename(prompt) + '.' + file_type  # generate filename

                # Save response to file
                with open(f'output/{filename}', 'w') as f:
                    f.write(f'Question: {prompt}\nAI Response: {response}')

                last_action = None  # Reset last_action
            elif action.lower() == 'summarize':
                start_time = time.time()  # Start timer
                summary, time_taken = summarize_pdf(file_path)

                # Save summary to file
                with open('output_summary.txt', 'w',encoding='utf-8') as f:
                    f.write(summary)

                print(f"Summarized text: {summary}")
                print(f"Time taken: {time_taken:.2f} seconds or {time_taken / 60:.2f} minutes")
                last_action = None  # Reset last_action
            else:
                print("Invalid option. Please try again.")
                last_action = None  # Reset last_action

In [31]:
def sanitize_filename(prompt):
            # Remove any characters that aren't alphanumeric, underscores, or hyphens
    filename = re.sub(r'[^a-zA-Z0-9_-]', '', prompt.replace(' ', '_'))  # replace spaces with underscores
    return filename


In [32]:
def summarize_pdf(file_path):
    text = extract_text(file_path)
    start_time = time.time()  # Start timer
    summary = summarize_text(text)
    elapsed_time = time.time() - start_time  # Calculate elapsed time
    return summary, elapsed_time

In [33]:
session = Session()
session.interactive_session(r"C:\Users\GF 63\PycharmProjects\pythonProject1\lemh207.pdf")

Summarized text: 
Time taken: 0.00 seconds or 0.00 minutes


AttributeError: module 'tinycss2.tokenizer' has no attribute 'encode'