In [2]:
import os
import re
import json
import glob

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from data_process import data_processor
from chunking import chunking
from prompt import prompt

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
def data_processing(path, raw_data_folder):

    processor = data_processor(path=path)   # claim the processing object

    loader = processor._extract_text_from_pdf       
    papers = processor.load_all_papers(folder=raw_data_folder, loader=loader)        # load papers in from pdf
    sections = processor.extract_section(papers)              # automated cleaning.
    processor.save_to_text(sections, os.path.join(path, 'cleaned_papers'))       # save cleaning results as text file. 
    return sections

In [None]:
path = os.getcwd()
work_dir = os.path.join(path, 'work_dir')
os.makedirs(work_dir, exist_ok=True)


In [5]:
outcome_filename = 'outcome_definition.json'    # definition of the outcome terminologies. 
with open(os.path.join(path, outcome_filename)) as outcome_file:
    outcome_definition = json.load(outcome_file)

In [None]:
# preprocess papers

paper = data_processing(path=work_dir, raw_data_folder='pdfs')

In [18]:
# chunking

# Load data
# Define data directory
cleaned_data_folder = os.path.join(work_dir, 'cleaned_papers/tochunk/')
assert(os.path.exists(cleaned_data_folder))
# Load data
loader = DirectoryLoader(path=cleaned_data_folder, glob="**/*.txt", show_progress=True, loader_cls=TextLoader)
documents = loader.load()

# Define chunking write-to directory
chunk_folder = os.path.join(work_dir, 'chunks')
os.makedirs(chunk_folder, exist_ok=True)
# Chunk
chunk = chunking(chunk_folder)
chunks = chunk.make_chunks(documents)
# chunk.save_chunks_to_text(docs=chunks)
# chunk.save_chunk_for_later_loading(docs=chunks)

100%|██████████| 1/1 [00:00<00:00, 1436.41it/s]


In [19]:
# generate response by promt

response_dir = os.path.join(work_dir, 'responses/response.txt')

generate_prompt = prompt(chunked_docs = chunks, outcome_definition = outcome_definition)
message_func = generate_prompt._message_find_factors_to_result
results = generate_prompt.generate_response(message_func, include_chunk=False)
generate_prompt.write_summaries_to_txt(results, response_dir)

    

processing chunk 0
processing chunk 1
processing chunk 2
processing chunk 3
processing chunk 4
processing chunk 5
processing chunk 6
processing chunk 7
processing chunk 8
processing chunk 9
processing chunk 10
processing chunk 11
processing chunk 12
processing chunk 13
processing chunk 14
processing chunk 15
processing chunk 16
processing chunk 17
processing chunk 18
processing chunk 19
processing chunk 20
processing chunk 21
processing chunk 22
processing chunk 23
processing chunk 24
processing chunk 25
processing chunk 26
processing chunk 27
processing chunk 28
processing chunk 29
processing chunk 30
processing chunk 31
processing chunk 32
processing chunk 33
processing chunk 34
processing chunk 35
processing chunk 36
processing chunk 37
processing chunk 38
processing chunk 39
processing chunk 40
processing chunk 41
processing chunk 42
processing chunk 43
processing chunk 44
processing chunk 45
processing chunk 46
processing chunk 47
processing chunk 48
processing chunk 49
processing

In [20]:
# remove no-related-content responses
def remove_paragraphs_with_string(input_file, output_file, target_string):
    with open(input_file, 'r') as file:
        content = file.read()
    
    # Split the content into paragraphs
    paragraphs = content.split('\n\n')
    
    # Filter out paragraphs containing the target string
    filtered_paragraphs = [
        paragraph for paragraph in paragraphs if target_string not in paragraph
    ]
    
    # Join the remaining paragraphs with double newline characters
    result_content = '\n\n'.join(filtered_paragraphs)
    
    # Write the result to the output file
    with open(output_file, 'w') as file:
        file.write(result_content)

In [21]:
response_folder = 'work_dir/responses/'

# Using glob to find all .txt files in the directory and subdirectories
input_filenames = glob.glob(os.path.join(response_folder, '**', '*.txt'), recursive=True)
output_filenames = [input_filename[:-4]+'_clean.txt' for input_filename in input_filenames]
target_string = 'There are no related factors in this chunk'

for i in range(len(input_filenames)):
    print(f'Start cleaning {input_filenames[i]}')
    remove_paragraphs_with_string(input_file=input_filenames[i], output_file=output_filenames[i], target_string=target_string)
    print(f'Completed cleaning {input_filenames[i]}')

Start cleaning work_dir/responses/response_100.txt
Completed cleaning work_dir/responses/response_100.txt
Start cleaning work_dir/responses/response_616.txt
Completed cleaning work_dir/responses/response_616.txt
Start cleaning work_dir/responses/response_199.txt
Completed cleaning work_dir/responses/response_199.txt
Start cleaning work_dir/responses/response_388.txt
Completed cleaning work_dir/responses/response_388.txt
Start cleaning work_dir/responses/response_191.txt
Completed cleaning work_dir/responses/response_191.txt
Start cleaning work_dir/responses/response_92.txt
Completed cleaning work_dir/responses/response_92.txt
Start cleaning work_dir/responses/response_86.txt
Completed cleaning work_dir/responses/response_86.txt
Start cleaning work_dir/responses/response_168.txt
Completed cleaning work_dir/responses/response_168.txt


In [23]:
input_filenames

['responses/response_100.txt',
 'responses/response_616.txt',
 'responses/response_199.txt',
 'responses/response_75.txt',
 'responses/response_49.txt',
 'responses/response_388.txt',
 'responses/response_151.txt',
 'responses/response_623.txt',
 'responses/response_144.txt',
 'responses/response_191.txt',
 'responses/response_92.txt',
 'responses/response_86.txt',
 'responses/response_168.txt',
 'responses/response_642.txt',
 'responses/response_118.txt']