In [None]:
from google.cloud import bigquery, storage
import pandas as pd
from datetime import datetime
import json
from sklearn.model_selection import train_test_split

In [None]:
# BigQuery and GCS settings for this notebook -- replace each value with your own.

# Google Cloud project ID
project_id = "health-ai-agent-sjsu"

# BigQuery table ID (interpolated into the FROM clause of the export query)
bq_table_id = "health-ai-agent-sjsu.transformed_data.all_merged"

# GCS bucket name that receives the uploaded JSONL files
bucket_name = "llm-prepared-data"


In [None]:



# Create a BigQuery client bound to the configured project
bq_client = bigquery.Client(project=project_id)

# Select the three text columns from the source table and load them into a DataFrame
query = f"""
SELECT Title, Question, Answer
FROM `{bq_table_id}`
"""
query_job = bq_client.query(query)
df = query_job.to_dataframe()

# Map a source row onto the 'context' / 'input' / 'output' record layout
def prepare_jsonl_entry(row):
    """Build one JSONL record from a single row.

    Args:
        row: Object indexable by "Title", "Question" and "Answer"
            (invoked per DataFrame row via ``df.apply(..., axis=1)``).

    Returns:
        dict with the keys "context", "input" and "output", holding the
        row's Title, Question and Answer values respectively.
    """
    field_sources = (
        ("context", "Title"),
        ("input", "Question"),
        ("output", "Answer"),
    )
    return {field: row[column] for field, column in field_sources}

# Turn every DataFrame row into a JSONL-ready dict
jsonl_records = df.apply(prepare_jsonl_entry, axis=1)
jsonl_data = jsonl_records.tolist()

# 80/10/10 split: hold out 20% first, then halve the hold-out into validation and test
train_data, holdout_data = train_test_split(jsonl_data, test_size=0.2, random_state=42)
valid_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

# Date stamp (YYYYMMDD) shared by all three output filenames
current_date = datetime.now().strftime("%Y%m%d")

# Date-stamped local filenames, one per split
train_jsonl_file = f"llm_training_data_train_{current_date}.jsonl"
valid_jsonl_file = f"llm_training_data_valid_{current_date}.jsonl"
test_jsonl_file = f"llm_training_data_test_{current_date}.jsonl"

# Save each split as a JSONL file
def save_jsonl(data, filename):
    """Write ``data`` to ``filename`` as JSON Lines (one JSON document per line).

    Args:
        data: Iterable of JSON-serializable entries.
        filename: Path of the file to write; an existing file is overwritten.

    The file is always written as UTF-8 with LF line endings, so the bytes
    produced do not depend on the platform's locale encoding or on text-mode
    newline translation.
    """
    # newline="\n" turns off the LF -> os.linesep translation that text mode
    # would otherwise apply (e.g. CRLF on Windows).
    with open(filename, 'w', encoding='utf-8', newline='\n') as f:
        for entry in data:
            json.dump(entry, f)
            f.write('\n')

# Write the three splits to their local JSONL files
for split_records, split_path in (
    (train_data, train_jsonl_file),
    (valid_data, valid_jsonl_file),
    (test_data, test_jsonl_file),
):
    save_jsonl(split_records, split_path)

# Create the GCS client used by the upload step
# NOTE(review): unlike the BigQuery client, no project is passed here -- confirm the default is intended.
storage_client = storage.Client()

# Upload helper: send one local file to an object path inside a GCS bucket
def upload_file_to_gcs(local_file_path, bucket_name, destination_blob_name):
    """Upload a local file to GCS and print a confirmation line.

    Args:
        local_file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name (including any folder prefix)
            inside the bucket.

    Relies on the module-level ``storage_client``.
    """
    target_blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(local_file_path)
    print(f"File {local_file_path} uploaded to gs://{bucket_name}/{destination_blob_name}")

# Pair each local split file with its destination object name, then upload them in order.
# NOTE(review): the object prefix "llm-prepared-data/" repeats the bucket name, so files land
# under gs://llm-prepared-data/llm-prepared-data/... -- confirm this nesting is intended.
upload_plan = [
    (train_jsonl_file, f"llm-prepared-data/train/{train_jsonl_file}"),
    (valid_jsonl_file, f"llm-prepared-data/valid/{valid_jsonl_file}"),
    (test_jsonl_file, f"llm-prepared-data/test/{test_jsonl_file}"),
]
for local_path, blob_path in upload_plan:
    upload_file_to_gcs(local_path, bucket_name, blob_path)


File llm_training_data_train_20241105.jsonl uploaded to gs://llm-prepared-data/llm-prepared-data/train/llm_training_data_train_20241105.jsonl
File llm_training_data_valid_20241105.jsonl uploaded to gs://llm-prepared-data/llm-prepared-data/valid/llm_training_data_valid_20241105.jsonl
File llm_training_data_test_20241105.jsonl uploaded to gs://llm-prepared-data/llm-prepared-data/test/llm_training_data_test_20241105.jsonl
