In [1]:
import argparse
import os
import pandas as pd
import sys

# Put the neighbouring '2codes' folder at the front of the import path (only
# once) so that the local `utils` import that follows can be found.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../2codes'))
if sys.path.count(parent_dir) == 0:
    sys.path.insert(0, parent_dir)

# load local function
from utils import generate_response, get_prompt_by_type, data_processor, save_results_to_folder, build_index, load_index_from_storage

def _str2bool(value):
    """Convert a command-line token such as "false" or "1" into a real bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in {"true", "t", "yes", "y", "1"}:
        return True
    if token in {"false", "f", "no", "n", "0"}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def parse_args(argv=None):
    """Build the experiment configuration namespace.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments (e.g. ``["--top_k", "5"]``). When ``None``
            (the default) the source depends on the environment, see below.

    Returns:
        argparse.Namespace with one attribute per option declared below.
        ``api_key`` defaults to the ``OPENAI_API_KEY`` environment variable
        (``None`` when it is not set).

    Behaviour when ``argv`` is None:
        * inside a Jupyter kernel the defaults are returned, because
          ``sys.argv`` then holds the kernel launcher's own flags;
        * otherwise the real command line is parsed and unknown flags are
          ignored rather than aborting the run.
    """
    parser = argparse.ArgumentParser()
    # Numbering matches the loader dispatch in this notebook (3 -> ImgQ folder, 4 -> MathQ folder).
    parser.add_argument("--QAType", default=2, type=int, help="Which Type of Question to be asked, we have 1.MCQ, 2.ResponseQ, 3.ImageQ, 4.MathQ")
    parser.add_argument("--verbose", default=True, type=_str2bool, help="Print the output or not")
    parser.add_argument("--evidence", default=False, type=_str2bool, help="Print and Plot the source of the answer")
    parser.add_argument("--api_key", default=os.getenv("OPENAI_API_KEY"), type=str, help="OpenAI API Key")
    parser.add_argument("--top_k", default=3, type=int, help="Top k answer to be generated")
    parser.add_argument("--image_top_k", default=3, type=int, help="Top k images to be generated")
    # NOTE: the flag name keeps its original spelling ("retrive") so existing callers still work.
    parser.add_argument("--retrive", default=False, type=_str2bool, help="Retrieve the document or answer_question")
    parser.add_argument("--interactive", default=False, type=_str2bool, help="User can input the question or use default")
    parser.add_argument("--model", default="gpt-4o", type=str, help="The model to be used, currently supports gpt-4o")
    parser.add_argument("--rebuild_index", default=False, type=_str2bool, help="Rebuild the index or not")
    parser.add_argument("--document", default="1Book/3Book_txt_images", type=str, help="The document (Book) to be used")
    parser.add_argument("--chunk_size", default=600, type=int, help="The chunk size of the document")
    parser.add_argument("--chunk_overlap", default=100, type=int, help="The chunk overlap of the document")

    # An explicit argument list always wins (useful for tests and scripted runs).
    if argv is not None:
        return parser.parse_args(args=list(argv))

    # Filter out the Jupyter notebook arguments: in a kernel, sys.argv belongs
    # to the kernel launcher, so fall back to the declared defaults.
    if "ipykernel" in sys.modules:
        return parser.parse_args(args=[])

    # Plain script run: honour the command line, tolerating unrelated flags.
    known_args, _unknown = parser.parse_known_args()
    return known_args

# Parsing arguments
args = parse_args()

# Show the configuration, but never echo the API key: cell outputs are stored
# inside the notebook file and travel with it when it is shared or committed.
printable_args = {**vars(args), "api_key": "***REDACTED***" if args.api_key else args.api_key}
print(argparse.Namespace(**printable_args))


Namespace(QAType=2, verbose=True, evidence=False, api_key='***REDACTED***', top_k=3, image_top_k=3, retrive=False, interactive=False, model='gpt-4o', rebuild_index=False, document='1Book/3Book_txt_images', chunk_size=600, chunk_overlap=100)


In [2]:
import os

# Switch to the main project directory so the relative data paths used by the
# following cells resolve. The target is resolved only once per kernel session:
# a bare relative chdir would climb two more levels every time the cell is re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../../")
os.chdir(PROJECT_ROOT)  # change to main directory
print(os.getcwd())  # portable replacement for the `!pwd` shell call

/Users/zihan/Desktop/Manufacturing_QA


In [3]:
# Folders holding the question CSV files, one per question type
RQfolder, MCQfolder = "2QuestionsData/1RQ", "2QuestionsData/2MCQ"
ImgQfolder, MathQfolder = "2QuestionsData/3ImgQ/", "2QuestionsData/4MathQ/"

# VectorDB path, kept on the shared args namespace
args.MMVectorDB_path = "1Book/4.High_Quality_WholeBook_storage"

In [4]:
def load_csv_files_from_folder(folder_path, namespace=None):
    """Load every ``.csv`` file in a folder into one DataFrame.

    The concatenated frame is stored as ``questions_df`` on the target
    namespace, which is also returned.

    Args:
        folder_path: Directory to scan (not recursive) for files whose name
            ends with ``.csv``.
        namespace: Object that receives the ``questions_df`` attribute.
            Defaults to the notebook-level ``args`` namespace, which is what
            the original one-argument call relied on.

    Returns:
        The namespace that was populated.

    Raises:
        ValueError: if the folder contains no ``.csv`` files.
    """
    # os.listdir returns names in arbitrary order; sort so the row order of the
    # combined frame (and any "first N rows" subset) is reproducible.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if not csv_files:
        raise ValueError(f"No CSV files found in folder: {folder_path}")

    if namespace is None:
        namespace = args  # notebook-level namespace created by parse_args()

    # Load all CSVs into dataframes (first row of each file is the header)
    frames = [pd.read_csv(os.path.join(folder_path, f), header=0) for f in csv_files]

    # Concatenate into one dataframe with a fresh 0..n-1 index
    namespace.questions_df = pd.concat(frames, ignore_index=True)
    return namespace

In [8]:
# Easy handle for quick testing
args.QAType = 4

# QAType -> folder whose CSV files hold that question type
folder_by_qatype = {
    1: MCQfolder,
    2: RQfolder,
    3: ImgQfolder,
    4: MathQfolder,
}
if args.QAType in folder_by_qatype:
    # Populates args.questions_df from the selected folder
    load_csv_files_from_folder(folder_by_qatype[args.QAType])
else:
    raise ValueError("Invalid QAType. Please specify a valid QAType from 1 to 4.")


Preview the data and keep only a small test set for the demo.

In [9]:
# Keep only the first three questions for the demo, then display them.
demo_subset = args.questions_df.head(3)
args.questions_df = demo_subset
args.questions_df

Unnamed: 0,chapter,question_number,Question,Answer
0,23,23.01,23.01 Flank wear data were collected in a seri...,Solution: (a) and (b) Student exercises. For p...
1,23,23.02,23.02 Solve Problem 23.1 except that the tool ...,Solution: (a) and (b) Student exercises. For p...
2,23,23.03,23.03 A series of turning tests were conducted...,"Solution: (a) Using the graph, at 350 ft/min ..."


In [10]:
# Fail early with an actionable message: assigning None into os.environ raises
# an opaque "TypeError: str expected, not NoneType" when no key was configured.
if not args.api_key:
    raise ValueError(
        "No OpenAI API key found. Set the OPENAI_API_KEY environment variable or pass --api_key."
    )
os.environ["OPENAI_API_KEY"] = args.api_key

# Result lists are reset on every run, so re-running the cell does not duplicate answers.
args.GPT_results = []
args.GPT_cot_results = []

# Used below as a str.format template with {context_str} and {query_str} placeholders.
prompt_data = get_prompt_by_type(args)

for question in args.questions_df['Question']:
    # GPT (no prompt design): the raw question is sent as the query
    GPT_result = generate_response(args, query=question, image_paths="")

    # GPT_cot (with prompt design): the question is wrapped in the prompt
    # template; no retrieved context is supplied, so context_str stays empty
    prompt = prompt_data.format(context_str="", query_str=question)
    GPT_cot_result = generate_response(args, query=prompt, image_paths="")

    # Append results to args
    args.GPT_results.append(GPT_result)
    args.GPT_cot_results.append(GPT_cot_result)

# Define a dictionary to loop over results and process/save them
results_mapping = {
    "GPT_results": args.GPT_results,
    "GPT_cot_results": args.GPT_cot_results,
}

# Process and save each result type
for result_type, result_list in results_mapping.items():
    if result_list:  # Only process if the result list is not empty
        # NOTE(review): data_processor / save_results_to_folder come from the local
        # utils module; presumably they build a DataFrame and write it to a
        # results CSV. Confirm against utils.
        results_df = data_processor(result_type, result_list, args)
        save_results_to_folder(results_df, args, result_type)


{'type': 'MathQ', 'index': 4, 'content': 'You are a top student in the manufacturing major, and you are taking an exam. You need to solve the following math question based on the context provided and your knowledge of manufacturing calculations. Make sure to clearly show all your steps and explain your reasoning for each step. Provide the final answer at the end.\n\n{context_str}\n\nSolve the following question:\n{query_str}\nAnswer:'}
Results saved to: 4Results/MathQ/GPT_results_2024_10_12_20_55.csv
Results saved to: 4Results/MathQ/GPT_cot_results_2024_10_12_20_55.csv
