In [1]:
import os
import tempfile
from docx import Document
import PyPDF2
import textract

In [2]:
def process_input_file(input_file_path):
    '''
    Clean a txt, docx or pdf file and save the result as a temporary text file.

    The input is read according to its extension, every line is stripped of
    surrounding whitespace, and lines that are empty or consist only of '-'
    and/or '_' characters are dropped. The remaining lines are joined with
    newlines and written (UTF-8) to a new file inside a "temp" directory
    located next to the input file.

    Parameters:
        input_file_path (str): path to the input file (.txt, .docx or .pdf;
            the extension is matched case-insensitively)

    Returns:
        str: path of the processed temporary text file saved in temp/.
            The file is not deleted automatically.

    Raises:
        ValueError: if the file extension is not .txt, .docx or .pdf.
        OSError: if the input file cannot be opened (e.g. FileNotFoundError)
            or the output cannot be written.
    '''
    file_extension = os.path.splitext(input_file_path)[1].lower()

    # Read (and thereby validate) the input BEFORE touching the filesystem, so
    # an unsupported or missing file does not leave an empty temp file behind.
    if file_extension == '.txt':
        with open(input_file_path, 'r', encoding='UTF-8') as input_file:
            lines = input_file.readlines()
    elif file_extension == '.docx':
        doc = Document(input_file_path)
        lines = [p.text for p in doc.paragraphs]
    elif file_extension == '.pdf':
        with open(input_file_path, 'rb') as input_file:
            # PdfReader / .pages is the current PyPDF2 API; the legacy
            # PdfFileReader / numPages / getPage names were removed in 3.0.
            reader = PyPDF2.PdfReader(input_file)
            lines = []
            for page in reader.pages:
                # A page's text spans many lines; split it so blank and
                # separator lines inside a page are filtered too.
                lines.extend((page.extract_text() or '').splitlines())
    else:
        raise ValueError("Unsupported file format")

    # Remove empty lines and lines consisting only of '-' or '_'
    non_empty_lines = []
    for line in lines:
        stripped = line.strip()
        # stripped.strip('-_') is empty exactly when every char is '-' or '_'
        if stripped and stripped.strip('-_'):
            non_empty_lines.append(stripped)

    # Create the temp directory next to the input file only once there is
    # something valid to write.
    temp_dir = os.path.join(os.path.dirname(input_file_path), "temp")
    os.makedirs(temp_dir, exist_ok=True)

    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, dir=temp_dir, encoding='UTF-8')
    try:
        with temp_file:
            temp_file.write('\n'.join(non_empty_lines))
    except Exception:
        # delete=False means nobody else cleans up a half-written file.
        os.remove(temp_file.name)
        raise

    return temp_file.name

In [3]:
# Run the preprocessing helper on the sample text file. The path is relative,
# so it is resolved against the kernel's current working directory.
temp_file = process_input_file("Software_Engineering_Practices.txt")

# Display the path of the generated temporary file.
temp_file

FileNotFoundError: [Errno 2] No such file or directory: 'Software_Engineering_Practices.txt'

In [4]:
# Show the kernel's current working directory (IPython automagic).
pwd

'/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot'

In [5]:
# Switch the kernel's working directory to the v2 project folder (IPython automagic).
# NOTE(review): this is a hardcoded, user-specific absolute path, so the cell will
# fail on any other machine; consider a configurable project/data directory instead.
cd /Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot/knowledge_retrieval_LLM_chatbot_v2

/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot/knowledge_retrieval_LLM_chatbot_v2


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [6]:
# Confirm the working directory after the change (IPython automagic).
pwd

'/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot/knowledge_retrieval_LLM_chatbot_v2'

In [7]:
# Run the preprocessing helper on the sample text file; the relative path is
# resolved against the kernel's current working directory.
temp_file = process_input_file("Software_Engineering_Practices.txt")

# Display the path of the generated temporary file.
temp_file

'/Users/alexanderarefolov/Dropbox/Coding_Projects/knowledge_retrieval_LLM_chatbot/knowledge_retrieval_LLM_chatbot_v2/temp/tmpclc113lv'

In [8]:
# Scratch check: extract the lower-cased extension (including the leading dot)
# from the sample file name.
file_extension = os.path.splitext("Software_Engineering_Practices.txt")[1].lower()
file_extension

'.txt'

In [9]:
# Scratch check: length of an arbitrary sample string.
len("asjksjhdf")

9