# In [None]:
import os, sys, time
import llama3

def process_row(z, tableA, tableB, label, tableA_df, tableB_df, columns, prompt_techniques, extra_correction, dataset_name, save_folder, csv_name, llama3_model):
    """Query the model about one candidate record pair under every prompt setup.

    The pair at position ``z`` is looked up in both tables, each record is
    rendered to a sentence, and the model is asked once per combination of
    prompt technique and "force" flag. The prompt and the raw response are
    printed. Persisting the parsed result is currently disabled (see the
    commented-out ``save_predictions`` call at the end of the loop).

    Args:
        z: Positional index (used with ``.iloc``) of the pair to process.
        tableA: Series-like of left-table ids, one per pair.
        tableB: Series-like of right-table ids, one per pair.
        label: Series-like of ground-truth labels, one per pair.
        tableA_df: DataFrame of left-table records; must have an ``id`` column.
        tableB_df: DataFrame of right-table records; must have an ``id`` column.
        columns: Column names, unpacked into ``llama3.format_columns_string``
            to build the template that each record is formatted with.
        prompt_techniques: Iterable of prompt identifiers; compared against
            ``llama3.general_simple`` / ``general_complex`` /
            ``domain_simple`` / ``domain_complex``.
        extra_correction: Iterable of flags passed to
            ``llama3.generate_prompt_sentence``; truthy values are recorded as 1.
        dataset_name: Passed to ``llama3.determine_domain`` for non-general prompts.
        save_folder: Output folder; only used by the disabled save call.
        csv_name: Output file stem; only used by the disabled save call.
        llama3_model: Object exposing ``llama_chat_get_response(prompt)``.

    Returns:
        None.

    Raises:
        IndexError: If no row in ``tableA_df`` / ``tableB_df`` matches the id.
    """
    idA, idB, single_label = tableA.iloc[z], tableB.iloc[z], label.iloc[z]
    rowA = tableA_df[tableA_df['id'] == idA].drop(columns='id')
    rowB = tableB_df[tableB_df['id'] == idB].drop(columns='id')

    # Both records share the same column template, so build it only once.
    template = llama3.format_columns_string(*columns)
    sentenceA = template.format(**rowA.to_dict('records')[0])
    sentenceB = template.format(**rowB.to_dict('records')[0])

    general_prompts = [llama3.general_simple, llama3.general_complex]
    domain_prompts = [llama3.domain_simple, llama3.domain_complex]

    for prompt in prompt_techniques:
        # These depend only on the prompt, not on the force flag.
        domain = llama3.determine_domain(dataset_name) if prompt not in general_prompts else None
        general_or_domain = 'domain' if prompt in domain_prompts else 'general'

        for force in extra_correction:
            prompt_sentence = llama3.generate_prompt_sentence(sentenceA, sentenceB, force, prompt, domain)
            print(prompt_sentence)

            # Time only the model call; console output is kept outside the
            # measured window so it does not inflate the recorded latency.
            start = time.perf_counter()
            response = llama3_model.llama_chat_get_response(prompt_sentence)
            time_taken = time.perf_counter() - start
            print(response)

            # The values below feed the (currently disabled) save call.
            pred = llama3.parse_response(response)
            simple_or_complex = llama3.determine_complexity(prompt)
            yes_or_no = 1 if force else 0

            # llama3.save_predictions(f"{save_folder}/{csv_name}.csv", general_or_domain, simple_or_complex, yes_or_no, idA, idB, pred, single_label, time_taken)


# Extend the module search path before importing `config`
# (presumably config.py lives in the parent directory -- confirm).
sys.path.append("..")
import config

# Prompt techniques to evaluate for every record pair.
prompt_techniques = [llama3.general_simple, llama3.general_complex, llama3.domain_simple, llama3.domain_complex]

# 1 = force a yes/no response, 0 = do not force.
extra_correction = [1]

# Folders (data variants) and datasets to run in this session.
folders = [config.STRUCTURED_DIR, config.DIRTY_DIR]
datasets = [config.DBLP_GOOGLESCHOLAR_DIR]

# NOT DONE: config.DBLP_GOOGLESCHOLAR_DIR

# Datasets listed as already done; kept for reference and not used by the
# setup code here.
done_dir = [config.AMAZON_GOOGLE_DIR, config.BEER_DIR, config.FODORS_ZAGATS_DIR,
            config.ITUNES_AMAZON_DIR, config.WALMART_AMAZON_DIR, config.ABT_BUY_DIR,
            config.DBLP_ACM_DIR]

# Output folder for prediction CSVs. exist_ok avoids the check-then-create
# race of a separate os.path.exists() test.
save_folder = 'llama3_predictions'
os.makedirs(save_folder, exist_ok=True)

# Count how many model calls the full run will make, as a progress reference.
# Each test pair is queried once per (prompt technique, force flag) combination,
# so derive the multiplier from the configured lists instead of hard-coding it.
total_preds = 0
for folder_name in folders:
    for dataset_name in datasets:
        try:
            train, val, test = config.load_datasets(folder_name, dataset_name)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so Ctrl-C and
            # SystemExit still stop the run; include the cause for diagnosis.
            print(f"Dataset {folder_name}_{dataset_name} does not exist. ({type(exc).__name__}: {exc})")
            continue
        total_preds += len(test) * len(prompt_techniques) * len(extra_correction)
print(f"Total predictions: {total_preds}")

# Run every test pair of every (folder, dataset) combination through the model.
llama3_model = llama3.LLama3()
for folder_name in folders:
    print(f"Starting: {folder_name}...")
    for dataset_name in datasets:
        print(f"    Processing {dataset_name}...")
        csv_name = f"{folder_name}_{dataset_name}"

        # CSV header creation is disabled together with the save call in process_row.
        # if not os.path.exists(f"{save_folder}/{csv_name}.csv"):
        #     with open(f"{save_folder}/{csv_name}.csv", 'w') as f:
        #         f.write("general_or_domain,simple_or_complex,force_or_not,tableA_id,tableB_id,pred,label,time\n")

        # Loading is guarded on its own so that only genuine load failures are
        # reported as a missing dataset. Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit abort the run.
        try:
            train, val, test = config.load_datasets(folder_name, dataset_name)
            tableA_df, tableB_df = config.tableA_tableB(folder_name, dataset_name)
        except Exception as exc:
            print(f"Dataset {folder_name}_{dataset_name} does not exist. ({type(exc).__name__}: {exc})")
            continue

        # Stay best-effort: a failure while processing abandons the rest of this
        # dataset and moves on to the next one, but is reported as what it is.
        try:
            columns = tableA_df.columns
            if 'id' in columns:
                columns = columns.drop('id')

            tableA, tableB, label = test['ltable_id'], test['rtable_id'], test['label']

            for z in range(len(tableA)):
                process_row(z, tableA, tableB, label, tableA_df, tableB_df, columns, prompt_techniques, extra_correction, dataset_name, save_folder, csv_name, llama3_model)
        except Exception as exc:
            print(f"Error while processing {csv_name}: {type(exc).__name__}: {exc}")

# Expected call counts for reference:
# 5743 * 4 = 22972 for dirty      DBLP_GOOGLE_SCHOLAR
# 5743 * 4 = 22972 for structured DBLP_GOOGLE_SCHOLAR
