## Generate Synthetic Data

Use this notebook to generate the synthetic data for the datasets offline.

The synthetic data will be stored locally in files named `sync_data/ClaudeFewShot_{dataset}-{group}.csv` and also uploaded to the S3 bucket (a failed upload is reported but does not stop the run).

In [1]:
import os
# Work from the directory that contains the `src` package so that the
# `src.*` imports and relative paths such as "sync_data/" resolve.
# The guard makes this cell idempotent: re-running it no longer climbs one
# more directory each time.
# NOTE(review): assumes the notebook sits one level below the directory that
# holds `src/` — confirm against the repository layout.
if not os.path.isdir("src"):
    os.chdir("..")

# Process-wide settings; they must be set before the libraries that read them
# are imported in the cells below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only GPU 0 to CUDA
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # presumably read by the HF tokenizers library — confirm
os.environ["AWS_REGION_NAME"] = "us-east-1"  # presumably consumed by the project's AWS clients — confirm

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd

In [4]:
from src.sync_data.evaluators import NLIFinetuningEvaluation
from src.sync_data.compute_entailments import EntailmentCheckModel
from src.utils.script_utils import get_datasets
from src.sync_data.initial_generators import FewShotPromptingGenerator

src.cross_encoder_model.my_evaluator


In [5]:
from src.utils.s3 import upload_csv_to_s3
from src.utils.constants import _BUCKET_NAME, _BUCKET_ROOTPATH

In [6]:
# Local output directory for the generated CSV files; exist_ok=True keeps
# this cell safe to re-run when the directory already exists.
os.makedirs("sync_data", exist_ok=True)

In [None]:
# --- Configuration -----------------------------------------------------------
lfqa_group = ["all"]  # specify groups here
dataset = "lfqa-veri"
model_name = "claude3-haiku"  # or use "gpt-4o-mini"

# Loop-invariant: "expertqa" and "lfqa-veri" use the "qa2" prompt mode,
# every other dataset uses "summ".
prompt_mode = "qa2" if dataset in ["expertqa", "lfqa-veri"] else "summ"

# --- Generation: one CSV per group, saved locally and (best effort) to S3 ----
for group in lfqa_group:
    print(group)
    group_train, _, _ = get_datasets(dataset, group=group)

    # Hook for appending to earlier results: it is always None here, so the
    # concat below is skipped. Assign a DataFrame to prepend earlier samples.
    existing_df = None

    mygen = FewShotPromptingGenerator(
        EntailmentCheckModel("vectara_v2"),
        model_name=model_name,
        min_req_examples=1,
        prompt_mode=prompt_mode,
    )
    # NOTE(review): the meaning of the positional `8` is defined by
    # generate_samples — presumably a sample/iteration count; confirm.
    res = mygen.generate_samples(group_train, 8)
    res = res.to_dataframe()
    if existing_df is not None:
        res = pd.concat((existing_df, res), axis=0, ignore_index=True)

    target_filename = f"sync_data/ClaudeFewShot_{dataset}-{group}.csv"
    res.to_csv(target_filename, index=False)  # Save locally

    # The upload is best effort: the local CSV is already written, so a failed
    # upload is reported but does not stop the loop. Catch Exception rather
    # than using a bare except, so Ctrl-C (KeyboardInterrupt) still interrupts
    # the run, and print the error so the cause is visible.
    try:
        upload_csv_to_s3(res, _BUCKET_NAME, os.path.join(_BUCKET_ROOTPATH, target_filename))  # Save in S3
    except Exception as exc:
        print(f"unable to upload to S3: {exc!r}")