In [3]:
import os
import numpy as np
from src.utils.query_bigquery import query_bigquery
from src.utils.sample import get_stratified_sample
from src.utils.jsonify_data import jsonify_data
from src.utils.async_call_openai import gather_responses

# Runtime configuration, read from environment variables.
# A variable that is not set in the environment yields None here.

# API credential
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Project identifiers
FEEDBACK_PROJECT_ID = os.environ.get("FEEDBACK_PROJECT_ID")
PUBLISHING_PROJECT_ID = os.environ.get("PUBLISHING_PROJECT_ID")

# Table names
FEEDBACK_TABLE = os.environ.get("FEEDBACK_TABLE")
PUBLISHING_TABLE = os.environ.get("PUBLISHING_TABLE")
LABELLED_FEEDBACK_TABLE = os.environ.get("LABELLED_FEEDBACK_TABLE")
# NOTE(review): the Python name says LABEL while the environment variable
# says LABELLED; both spellings are kept as they were.
OPENAI_LABEL_FEEDBACK_TABLE = os.environ.get("OPENAI_LABELLED_FEEDBACK_TABLE")

# Dataset name
LABELLED_FEEDBACK_DATASET = os.environ.get("LABELLED_FEEDBACK_DATASET")

ModuleNotFoundError: No module named 'src'

In [None]:
# Pull the human-labelled feedback data from BigQuery, then draw a small
# stratified sample from it.

# os.getenv returns None for an unset variable, and str(None) would silently
# build the query "SELECT * FROM  None". Fail fast with a clear message instead.
if not LABELLED_FEEDBACK_TABLE:
    raise ValueError("LABELLED_FEEDBACK_TABLE environment variable is not set")

# The table name is spliced in by plain text replacement of the
# @feedback_sample_table placeholder.
query_read = """
SELECT * FROM  @feedback_sample_table
"""
query_read = query_read.replace("@feedback_sample_table", str(LABELLED_FEEDBACK_TABLE))

# Call the function to execute the query
labelled_sample = query_bigquery(
    PUBLISHING_PROJECT_ID,
    LABELLED_FEEDBACK_DATASET,
    query_read,
)

# Get a stratified sample of the labelled feedback data.
# Records are identified by "feedback_record_id" and their labels are read
# from "labels"; presumably the sample is stratified on those labels —
# confirm in src.utils.sample.
stratified_sample = get_stratified_sample(
    records=labelled_sample,
    total_sample_size=20,  # total number of labelled records requested
    id_key="feedback_record_id",
    label_key="labels",
)

stratified_sample

In [None]:
# Pull a handful of unlabelled feedback records from BigQuery.

# os.getenv returns None for an unset variable, and str(None) would silently
# put the table name "None" into the query. Fail fast with a clear message instead.
if not PUBLISHING_TABLE:
    raise ValueError("PUBLISHING_TABLE environment variable is not set")

# For rows created on or after 2024-01-01, the query groups by
# feedback_record_id and joins each group's response_value entries with a
# space, ordered by created. It then keeps 5 groups in rand() order.
# NOTE: because of rand(), every run returns a different set of records.
unlabelled_data_query = """
SELECT
          feedback_record_id,
          STRING_AGG(response_value, ' '
          ORDER BY
            created) AS concatenated_response_value,
            rand() as r
        FROM
          @publishing_table
          WHERE DATE(created) >= "2024-01-01"
        GROUP BY
          feedback_record_id
        ORDER BY
          r
      LIMIT (5)"""

# The table name is spliced in by plain text replacement of the
# @publishing_table placeholder.
unlabelled_data_query = unlabelled_data_query.replace(
    "@publishing_table",
    str(PUBLISHING_TABLE),
)

# NOTE(review): this passes the same project/dataset arguments as the labelled
# query even though it reads the publishing table — confirm that is intended.
unlabelled_data = query_bigquery(
    PUBLISHING_PROJECT_ID,
    LABELLED_FEEDBACK_DATASET,
    unlabelled_data_query,
)
unlabelled_data

In [None]:
# Run both record sets through jsonify_data: the stratified labelled sample
# is flagged labelled=True, the freshly queried records labelled=False.
labelled_subs_json = jsonify_data(
    labelled=True,
    records=stratified_sample,
)
new_subs_json = jsonify_data(
    labelled=False,
    records=unlabelled_data,
)

In [None]:
# Hand the labelled examples, the new records and the API key to
# gather_responses. Presumably this calls the OpenAI API (the helper lives in
# src.utils.async_call_openai) — confirm there.
responses = gather_responses(
    api_key=OPENAI_API_KEY,
    new_subs_json=new_subs_json,
    labelled_subs_json=labelled_subs_json,
)

In [None]:
# Collect the token counts stored on each response.
prompt_tokens = [item["prompt_tokens"] for item in responses]
completion_tokens = [item["completion_tokens"] for item in responses]

# Average each count across all responses.
mean_prompt_tokens = np.mean(prompt_tokens)
mean_completion_tokens = np.mean(completion_tokens)

# One value per line: prompt mean first, completion mean second.
print(mean_prompt_tokens, mean_completion_tokens, sep="\n")

In [None]:
# Print the "open_labelled_records" entry of every response, one per line.
for response in responses:
    labelled_records = response["open_labelled_records"]
    print(labelled_records)