In [None]:
import csv, json, glob, random
from datetime import datetime
import pandas as pd
from pathlib import Path

In [None]:
# Source directories of the GPT-4 generated transcripts. "DM" and "SM" appear to
# be the double-model and single-model runs (see the prompt paths and the
# single-model section further down).
_transcript_root = "./transcripts/gpt4"
DM_transcript_dir = f"{_transcript_root}/DM"
SM_transcript_dir = f"{_transcript_root}/SM"

In [None]:
# Root folder for every generated dataset.
training_data_dir = Path("./training_data")

# Pairs evaluated *during* training.
eval_data_dir = training_data_dir / "evaluation_data"

# Pairs held back to test the model *after* training.
test_data_dir = training_data_dir / "test_data"

# Create the root first so the two sub-folders always have an existing parent;
# exist_ok keeps the cell safe to re-run.
for _data_dir in (training_data_dir, eval_data_dir, test_data_dir):
    _data_dir.mkdir(exist_ok=True)

In [None]:
# Prompt templates for the double-model setup, plus the question bank file.
patient_prompt_path = 'prompts/double_model_prompts/patient_prompt.txt'
asst_prompt_path = 'prompts/double_model_prompts/assistant_prompt.txt'
questions_path = 'prompts/questionbank.txt'

# Single-line form of the question bank: every real newline is replaced by the
# two characters backslash + "n".
questions_str1 = Path(questions_path).read_text().replace("\n", "\\n")

print(questions_str1)

In [None]:
# Question bank text exactly as stored on disk (real newlines kept).
questions_file = Path(questions_path)
questions_str = questions_file.read_text()
print(questions_str)

In [None]:
# Thoughts about using previous history in the training itself:
# - Need to account for context size: only a limited number of past messages
#   should be appended, or the context window will be too large.
# - Need to account for the chat history: maybe store a list of past messages
#   and write some other code that transforms it into the desired chat model's
#   past-history format.
#     - To do this, the textgen formatting dictionary should also be updated.
def process_data_with_history(input_files):
    '''Build user/assistant training pairs from transcript JSON files.

    History-aware pairing is not implemented yet (see the TODOs), so for now
    each pair contains only a single user message and the assistant reply
    that follows it.

    :input_files: (list) paths of JSON files, each holding a list of
        ``{"role": ..., "content": ...}`` dictionaries. The first entry of
        every file is skipped.
    :returns: (list) dictionaries of the form
        ``{"user": <content>, "assistant": <content>}`` in file order.
    '''
    def append_history(msg_list):
        '''Placeholder for gradually adding past messages to each pair.'''
        pass ##TODO: implement gradual addition

    new_dataset = []
    for input_file in input_files:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(input_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # append_history(msg_list) ##TODO: implement

        # Pairing state is reset for every file so a trailing, unanswered user
        # message can never be matched with a reply from the next transcript.
        pending_user = None
        for item in data[1:]:
            role = item['role']
            if role == 'user':
                # A later user message replaces an unanswered earlier one.
                pending_user = item['content']
            elif role == 'assistant' and pending_user is not None:
                # Only emit a pair when a user message is waiting for a reply.
                new_dataset.append({'user': pending_user, 'assistant': item['content']})
                pending_user = None

    return new_dataset

In [None]:
def process_data(input_files):
    '''Build independent user/assistant training pairs from transcript files.

    :input_files: (list) paths of JSON files, each holding a list of
        ``{"role": ..., "content": ...}`` dictionaries. The first entry of
        every file (the assistant intro message) is skipped.
    :returns: (list) dictionaries of the form
        ``{"user": <content>, "assistant": <content>}`` in file order. A user
        message is paired with the next assistant message of the SAME file;
        assistant messages with no pending user message are ignored.
    '''
    new_dataset = []
    for input_file in input_files:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(input_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Pairing state is reset for every file so a trailing, unanswered user
        # message can never be matched with a reply from the next transcript.
        pending_user = None
        for item in data[1:]:  # do not include the assistant intro message
            role = item['role']
            if role == 'user':
                # A later user message replaces an unanswered earlier one.
                pending_user = item['content']
            elif role == 'assistant' and pending_user is not None:
                # Only emit a pair when a user message is waiting for a reply.
                new_dataset.append({'user': pending_user, 'assistant': item['content']})
                pending_user = None

    return new_dataset

In [None]:
# Collect the transcript files of both setups. glob.glob returns matches in
# arbitrary (filesystem-dependent) order, so the lists are sorted to give a
# stable file order across machines and runs.
dm_json_files = sorted(glob.glob(DM_transcript_dir + "/*.json"))
print(dm_json_files)

sm_json_files = sorted(glob.glob(SM_transcript_dir + "/*.json"))
print(sm_json_files)

Do we want to use a fraction of all transcripts as the evaluation set (i.e. an evaluation set made up of complete, full transcripts)?
- to do this: randomize the order of the json files -> select a fraction -> call process_data separately on each group of files
Or do we want to use a fraction of all json pairings (i.e. an evaluation set made up of random pairings)?
- to do this: call process_data -> randomize -> select fractions

The notebook currently implements the second option, with the test and evaluation sets each comprising 15% of the pairings. My logic is that each pairing is trained independently anyway, so there shouldn't be as much of an influence...

In [None]:
# Sort the paths first so the order of pairs fed to the shuffle does not
# depend on how the file list happened to be produced.
dm_dataset = process_data(sorted(dm_json_files))

HOLDOUT_FRACTION = 0.15  # share of all pairs given to EACH of the eval and test sets
SPLIT_SEED = 42  # fixed seed so re-running the notebook reproduces the same split

divider = int(len(dm_dataset) * HOLDOUT_FRACTION)  # size of each held-out set
print(divider)

# A dedicated, seeded generator keeps the split reproducible and independent of
# any other use of the global `random` state.
random.Random(SPLIT_SEED).shuffle(dm_dataset)

eval_dataset = dm_dataset[:divider]  # first 15%: evaluated during training
test_dataset = dm_dataset[divider:2 * divider]  # second 15%: tested after training
training_dataset = dm_dataset[2 * divider:]  # the remaining ~70%: training data

# Sanity check: the three splits must partition the dataset exactly.
assert len(eval_dataset) + len(test_dataset) + len(training_dataset) == len(dm_dataset)

In [None]:
# Sizes of the evaluation, test and training splits, in that order.
print(*map(len, (eval_dataset, test_dataset, training_dataset)))

In [None]:
# Output paths follow <directory>/<transcript set>_<YYYYMMDD>_<N>i.json, where
# N is the number of pairs in the split. The stem and the date stamp are
# computed once so all three names agree, even if the cell runs across
# midnight.
dm_stem = Path(DM_transcript_dir).stem
date_stamp = datetime.now().strftime("%Y%m%d")

output_file = f'{training_data_dir}/{dm_stem}_{date_stamp}_{len(training_dataset)}i.json'

eval_file = f'{eval_data_dir}/{dm_stem}_{date_stamp}_{len(eval_dataset)}i.json'

test_file = f'{test_data_dir}/{dm_stem}_{date_stamp}_{len(test_dataset)}i.json'

In [None]:
# Persist the training split as pretty-printed JSON.
with open(output_file, mode='w') as training_fp:
    json.dump(training_dataset, fp=training_fp, indent=4)

In [None]:
# Save the evaluation split, then the test split, each to its own JSON file.
for split_path, split_pairs in ((eval_file, eval_dataset), (test_file, test_dataset)):
    with open(split_path, 'w') as split_fp:
        json.dump(split_pairs, split_fp, indent=4)

## do the same for single model transcripts

In [None]:
# Sorted input keeps the order of pairs in the saved file stable across runs.
sm_dataset = process_data(sorted(sm_json_files))

# Same naming scheme as the double-model files:
# <directory>/<transcript set>_<YYYYMMDD>_<N>i.json, with one shared date stamp.
sm_stem = Path(SM_transcript_dir).stem
date_stamp = datetime.now().strftime("%Y%m%d")

output_file = f'{training_data_dir}/{sm_stem}_{date_stamp}_{len(sm_dataset)}i.json'
# Derived from the single-model transcripts, like output_file, so this cell no
# longer mixes in the double-model directory and dataset size.
# NOTE(review): nothing in this cell writes to eval_file and sm_dataset is not
# split into train/eval/test -- confirm whether an SM evaluation set is intended.
eval_file = f'{eval_data_dir}/{sm_stem}_{date_stamp}_{len(sm_dataset)}i.json'

# Save the complete single-model dataset to a new JSON file
with open(output_file, 'w') as file:
    json.dump(sm_dataset, file, indent=4)