In [1]:
import os
import re
import json
import glob

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from data_process import data_processor
from chunking import chunking
from prompt import prompt

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [5]:
# Base directory: the current working directory of the notebook kernel.
path = os.getcwd()
# Working directory used by the later cells for inputs and outputs; created if missing.
work_dir = os.path.join(path, 'work_dir')
os.makedirs(work_dir, exist_ok=True)


In [4]:
def data_processing(path, raw_data_folder):
    """Load the raw papers, extract their sections and save them as text.

    Args:
        path: Base directory handed to the project ``data_processor``; the
            extracted sections are saved under ``<path>/cleaned_papers``.
        raw_data_folder: Folder name passed to ``load_all_papers`` as the
            location of the raw papers (presumably PDFs, given the loader
            used - confirm in ``data_process``).

    Returns:
        Whatever ``processor.extract_section`` returns for the loaded papers.
    """
    processor = data_processor(path=path)

    # The processor's own PDF text extractor is used as the per-file loader.
    papers = processor.load_all_papers(
        folder=raw_data_folder,
        loader=processor._extract_text_from_pdf,
    )
    sections = processor.extract_section(papers)

    # Persist the extracted sections as text files before returning them.
    cleaned_dir = os.path.join(path, 'cleaned_papers')
    processor.save_to_text(sections, cleaned_dir)
    return sections

In [6]:
# Definitions of the outcome terminologies, stored as JSON in the base directory `path`.
outcome_filename = 'outcome_definition.json'
outcome_path = os.path.join(path, outcome_filename)
with open(outcome_path) as outcome_file:
    outcome_definition = json.load(outcome_file)

In [None]:
# Preprocess the papers: run data_processing over the 'pdfs' folder, with
# work_dir as the processor's base path. The returned sections are kept in `paper`.

paper = data_processing(path=work_dir, raw_data_folder='pdfs')

In [12]:
''' chunking for papers'''

# Directory holding the cleaned paper text that is ready to be chunked.
# NOTE(review): nothing visible in this notebook creates the 'tochunk' subfolder - confirm how it is populated.
cleaned_data_folder = os.path.join(work_dir, 'cleaned_papers/tochunk/')
# Fail early if that folder does not exist yet.
assert(os.path.exists(cleaned_data_folder))
# Load every .txt file found under the folder (searched recursively).
loader = DirectoryLoader(path=cleaned_data_folder, glob="**/*.txt", show_progress=True, loader_cls=TextLoader)
documents = loader.load()

# Directory handed to the project `chunking` helper; created if missing.
chunk_folder = os.path.join(work_dir, 'chunks')
os.makedirs(chunk_folder, exist_ok=True)
# Split the loaded documents; no chunk size/overlap is passed, so the helper's defaults apply.
chunk = chunking(chunk_folder)
chunks = chunk.make_chunks(documents)
# Optional persistence steps, currently disabled:
# chunk.save_chunks_to_text(docs=chunks)
# chunk.save_chunk_for_later_loading(docs=chunks)

100%|██████████| 1/1 [00:00<00:00, 1296.94it/s]


In [None]:
''' raw factor evaluation
input = a json file of sample paragraphs (contexts).
chunks = each line in the input json file '''
evaluation_file = os.path.join(work_dir, 'evaluation_dataset_2_1205.json')

# Load the evaluation set. On failure, print a readable hint and then re-raise:
# swallowing the error would leave `data` undefined (or stale from an earlier
# run) and the lookup below would fail or silently use the wrong data.
try:
    with open(evaluation_file, "r") as file:
        data = json.load(file)
except FileNotFoundError:
    print(f"Error: {evaluation_file} not found.")
    raise
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {evaluation_file}. Check file format.")
    raise

# NOTE: this rebinds `chunks` (previously the output of make_chunks) to the
# value stored under the 'paragraphs' key of the evaluation file.
chunks = data['paragraphs']

In [None]:
'''generate response by promt'''

# response_dir = os.path.join(work_dir, 'responses/response.txt')
# Despite its name, `response_dir` is the path of the output text file, not a directory.
response_dir = os.path.join(work_dir, 'responses/evaluation_response_Alaina.txt')

# Build the project `prompt` helper from the current `chunks` and the outcome definitions.
generate_prompt = prompt(chunked_docs = chunks, outcome_definition = outcome_definition)
# Message builder handed to generate_response (a private method of the helper).
message_func = generate_prompt._message_find_factors_to_result
# presumably produces one response per chunk - confirm in prompt.generate_response
results = generate_prompt.generate_response(message_func, include_chunk=True)
# Write the results to the text file chosen above.
generate_prompt.write_summaries_to_txt(results, response_dir)

    

processing chunk 0
processing chunk 1
processing chunk 2
processing chunk 3
processing chunk 4
processing chunk 5
processing chunk 6
processing chunk 7
processing chunk 8
processing chunk 9
processing chunk 10
processing chunk 11
processing chunk 12
processing chunk 13
processing chunk 14
processing chunk 15
processing chunk 16
processing chunk 17
processing chunk 18
processing chunk 19
processing chunk 20
processing chunk 21
processing chunk 22
processing chunk 23
processing chunk 24
processing chunk 25
processing chunk 26
processing chunk 27
Analyze results saved to text file. 


### Remove responses that report no related content

In [6]:
# remove no-related-content responses
def remove_paragraphs_with_string(input_file, output_file, target_string):
    """Copy a text file, dropping every paragraph that contains a given string.

    Paragraphs are the pieces obtained by splitting the file content on a
    blank line (two consecutive newline characters).

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write (overwritten if present).
        target_string: Paragraphs containing this substring are discarded.
    """
    with open(input_file, 'r') as source:
        blocks = source.read().split('\n\n')

    # Keep only the paragraphs that do not mention the target string.
    kept = []
    for block in blocks:
        if target_string in block:
            continue
        kept.append(block)

    # Re-assemble with the same blank-line separator and write the result.
    with open(output_file, 'w') as sink:
        sink.write('\n\n'.join(kept))

In [None]:
response_folder = 'work_dir/raw_factors/'
target_string = 'There are no related factors in this chunk'
clean_suffix = '_clean.txt'

# Using glob to find all .txt files in the directory and subdirectories.
# Files that already end in '_clean.txt' are outputs of a previous run of this
# cell; skipping them keeps a re-run from producing '*_clean_clean.txt' files.
input_filenames = [
    filename
    for filename in glob.glob(os.path.join(response_folder, '**', '*.txt'), recursive=True)
    if not filename.endswith(clean_suffix)
]
# Each output sits next to its input, with '.txt' replaced by '_clean.txt'.
output_filenames = [os.path.splitext(input_filename)[0] + clean_suffix for input_filename in input_filenames]

for input_filename, output_filename in zip(input_filenames, output_filenames):
    print(f'Start cleaning {input_filename}')
    remove_paragraphs_with_string(input_file=input_filename, output_file=output_filename, target_string=target_string)
    print(f'Completed cleaning {input_filename}')

Start cleaning work_dir/validation/2nd round_evaluate_response_study 75-151.txt
Completed cleaning work_dir/validation/2nd round_evaluate_response_study 75-151.txt


### Count the factors related to each outcome

In [8]:
''' count number of factors related to each outcome respectively'''
# `re` is already imported in the first cell; repeating the import here is harmless.
import re
# Folder containing the response text files, relative to the current working directory.
response_folder = 'work_dir/raw_factors/'

def count_factors(file_path):
    """Count the factors listed under each outcome in a response text file.

    A block starts at a line beginning with "The factors contributing to" and
    is assigned to the first of "Adherence", "Dropout", "Accessibility" found
    in that line (any other header resets the block). Within a block:

    * a line starting with "-" is one factor, counted exactly once even when
      the bullet text also contains double-quoted strings;
    * otherwise, every double-quoted string on the line is one factor;
    * otherwise, unquoted text following "are:" counts as one factor.

    Args:
        file_path: Path of the response text file to read.

    Returns:
        Tuple ``(adherence_count, dropout_count, accessibility_count)``.
    """
    counts = {'adherence': 0, 'dropout': 0, 'accessibility': 0}
    # Checked in this order, matching the priority of the header keywords.
    header_keywords = (
        ("Adherence", 'adherence'),
        ("Dropout", 'dropout'),
        ("Accessibility", 'accessibility'),
    )

    # Which outcome block we are currently inside (None = outside any block).
    current_block = None

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()

            # A header line switches the block (or resets it if unrecognised).
            if line.startswith("The factors contributing to"):
                current_block = next(
                    (block for keyword, block in header_keywords if keyword in line),
                    None,
                )

            if current_block is None:
                continue

            # Bulleted factor: count it once and move on, so the quoted-string
            # check below cannot count the same bullet a second time.
            if line.startswith("-"):
                counts[current_block] += 1
                continue

            # Inline factors written as quoted strings (e.g. on the header line).
            quoted_factors = re.findall(r'"(.*?)"', line)
            if quoted_factors:
                counts[current_block] += len(quoted_factors)
                continue

            # Inline factor written without quotes right after "are:".
            if re.search(r'are:\s*(\S[^.]+)', line):
                counts[current_block] += 1

    return counts['adherence'], counts['dropout'], counts['accessibility']

# Count the factors in one cleaned response file and print the per-outcome totals.
# NOTE(review): the path is relative to the current working directory; the file
# must already exist or count_factors raises FileNotFoundError when opening it.
file_path = os.path.join(response_folder, '2nd round_evaluate_response_study 75-151_clean.txt')
adherence, dropout, accessibility = count_factors(file_path)
print(f"Adherence factors: {adherence}")
print(f"Dropout factors: {dropout}")
print(f"Accessibility factors: {accessibility}")


FileNotFoundError: [Errno 2] No such file or directory: 'work_dir/raw_factors/2nd round_evaluate_response_study 75-151_clean.txt'

### Load and chunk the validation dataset

In [20]:
''' extract factors '''
''' chunking for papers'''

# Directory of factor text files to be chunked (the variable name is carried
# over from the paper-chunking cell; here it points at 'factors/').
cleaned_data_folder = os.path.join(work_dir, 'factors/')
# Fail early if that folder does not exist yet.
assert(os.path.exists(cleaned_data_folder))
# Load every .txt file found under the folder (searched recursively).
loader = DirectoryLoader(path=cleaned_data_folder, glob="**/*.txt", show_progress=True, loader_cls=TextLoader)
documents = loader.load()

# Directory handed to the project `chunking` helper; created if missing.
chunk_folder = os.path.join(work_dir, 'chunks')
os.makedirs(chunk_folder, exist_ok=True)
# Split the loaded documents with an explicit chunk size and overlap.
# NOTE(review): units of 5000 / 100 (characters vs. tokens) depend on the helper - confirm in `chunking`.
chunk = chunking(chunk_folder,chunk_size=5000, chunk_overlap=100)
chunks = chunk.make_chunks(documents)
# Optional persistence steps, currently disabled:
# chunk.save_chunks_to_text(docs=chunks)
# chunk.save_chunk_for_later_loading(docs=chunks)


100%|██████████| 3/3 [00:00<00:00, 1943.01it/s]


In [21]:
# Display the chunking result for inspection (this renders the whole object, which can be long).
chunks

[Document(metadata={'source': '/Users/yushuhuang/Documents/research/dmbi_proj/work_dir/factors/accessibility.txt'}, page_content='"Build on favourite tracks" (Atreya et al., 2018)\n"Provide longer and shorter track options" (Atreya et al., 2018)\n"Select male or female voices" (Atreya et al., 2018)\n"Flexible and accessible interventions" (Lange, 2020)\n"Flexible and accessible intervention" (Llaneza et al., 2022)\n"Allow personalised individual schedule" (Llaneza et al., 2022)\n"Live support options" (Llaneza et al., 2022)\n"Flexible and accessible interventions" (Lunsky et al., 2021) - The flexibility and accessibility of the interventions make them easier to reach and utilize, enhancing accessibility.\n"More instructions to participate in an online forum" (Lunsky et al., 2021) - Providing more instructions can help users navigate and utilize the online forum more effectively, improving accessibility.\n"Flexible and accessible intervention" (Stjernsward and Hansson, 2020): The interv

### Separate the factors by outcome (adherence, dropout, accessibility)


In [4]:
def parse_factors(filename):
    """Split the factors in a response text file into one list per outcome.

    A line containing "The factors contributing to <Outcome> are" switches the
    current outcome (Adherence, Dropout or Accessibility). While an outcome is
    active, a factor is taken from:

    * a line starting with "-": the text after the dash;
    * any other line containing ":": the text after the first colon.

    Lines that match neither form, and lines whose extracted text is empty,
    contribute nothing. Blank lines are ignored.

    Args:
        filename: Path of the response text file to read.

    Returns:
        Tuple ``(adherence_factors, dropout_factors, accessibility_factors)``,
        each a list of factor strings in file order.
    """
    # Header text -> outcome; checked in this order.
    headers = {
        "The factors contributing to Adherence are": "Adherence",
        "The factors contributing to Dropout are": "Dropout",
        "The factors contributing to Accessibility are": "Accessibility",
    }
    factors_by_category = {"Adherence": [], "Dropout": [], "Accessibility": []}

    # Outcome the following lines belong to (None until the first header).
    current_category = None

    with open(filename, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:  # Skip empty lines
                continue

            # A header line switches the current outcome.
            for header, category in headers.items():
                if header in line:
                    current_category = category
                    break

            if current_category is None:
                continue

            if line.startswith("-"):
                # Bulleted factor: drop the dash.
                factor = line[1:].strip()
            elif ":" in line:
                # Factor on the same line, after a colon (e.g. on the header line).
                factor = line.split(":", 1)[1].strip()
            else:
                # Neither form: there is no factor on this line. Skipping it
                # avoids re-appending the factor left over from a previous line.
                continue

            if factor:
                factors_by_category[current_category].append(factor)

    return (
        factors_by_category["Adherence"],
        factors_by_category["Dropout"],
        factors_by_category["Accessibility"],
    )

# Use the function by providing the filename
filename = os.path.join(work_dir, 'raw_factors/response_49_clean.txt')  # Replace with your actual file name
adherence, dropout, accessibility = parse_factors(filename)

print("Adherence Factors:", adherence)
print("Dropout Factors:", dropout)
print("Accessibility Factors:", accessibility)


Adherence Factors: ['"Better support for the care recipient" (Atreya et al., 2018)', '"Desire to help with research" (Atreya et al., 2018)', '"Learn a new skill" (Atreya et al., 2018)', '"Help with sleep" (Atreya et al., 2018)', '"Promote relaxation" (Atreya et al., 2018)', '"Curiosity" (Atreya et al., 2018)', '"Assist to focus/train/organise thoughts" (Atreya et al., 2018)', '"Positive experience with meditation" (Atreya et al., 2018)', '"Assist to stay in the present" (Dragomanovich et al., 2021)', '"Help with sleep" (Dragomanovich et al., 2021)', '"Assist to focus/train/organise thoughts" (Dragomanovich et al., 2021)', '"Learn a new skill" (Dragomanovich et al., 2021)', '"Reduce anxiety" (Dragomanovich et al., 2021)', '"Flexible and accessible interventions" (Lange, 2020)', '"Improve coping skills" (Lange, 2020)', '"Opportunity for self-care" (Lange, 2020)', '"Positive experience with meditation" (Lange, 2020)', '"More mid-week practice reminders" (Lange, 2020)', '"Flexible and acce

In [7]:
# Write each outcome's factor list to its own text file, one factor per line.
response_folder = 'work_dir/raw_factors/'
for out_name, factors in (
    ('adherence.txt', adherence),
    ('dropout.txt', dropout),
    ('accessibility.txt', accessibility),
):
    with open(os.path.join(response_folder, out_name), 'w') as out_file:
        out_file.writelines(f"{factor}\n" for factor in factors)