In [1]:
import os
import numpy as np
from src.utils.query_bigquery import query_bigquery
from src.utils.sample import get_stratified_sample
from src.utils.jsonify_data import jsonify_data
from src.utils.async_call_openai import gather_responses

# --- Configuration, read from environment variables ---------------------
# Every value is None when the corresponding variable is not set.

# Credentials
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Project identifiers
FEEDBACK_PROJECT_ID = os.environ.get("FEEDBACK_PROJECT_ID")
PUBLISHING_PROJECT_ID = os.environ.get("PUBLISHING_PROJECT_ID")

# Source tables
FEEDBACK_TABLE = os.environ.get("FEEDBACK_TABLE")
PUBLISHING_TABLE = os.environ.get("PUBLISHING_TABLE")

# Labelled-feedback tables and the dataset passed to query_bigquery.
# NOTE(review): the Python name below says LABEL while the environment
# variable says LABELLED; the name is kept as-is for existing references.
LABELLED_FEEDBACK_TABLE = os.environ.get("LABELLED_FEEDBACK_TABLE")
OPENAI_LABEL_FEEDBACK_TABLE = os.environ.get("OPENAI_LABELLED_FEEDBACK_TABLE")
LABELLED_FEEDBACK_DATASET = os.environ.get("LABELLED_FEEDBACK_DATASET")

In [2]:
# Query BQ to pull the human labelled feedback data.
#
# Fail fast when the table name is not configured: os.getenv returns None for
# an unset variable and str(None) would be substituted into the SQL below,
# turning the query text into "SELECT * FROM None".
if not LABELLED_FEEDBACK_TABLE:
    raise ValueError("LABELLED_FEEDBACK_TABLE environment variable is not set")

# Number of records requested from get_stratified_sample.
STRATIFIED_SAMPLE_SIZE = 20

query_read = """
SELECT * FROM  @feedback_sample_table
"""
# "@feedback_sample_table" is a plain-text placeholder replaced in Python
# before the query is sent.
query_read = query_read.replace("@feedback_sample_table", str(LABELLED_FEEDBACK_TABLE))

# Call the function to execute the query
labelled_sample = query_bigquery(
    PUBLISHING_PROJECT_ID,
    LABELLED_FEEDBACK_DATASET,
    query_read,
)

# Get a stratified sample of the labelled feedback data
stratified_sample = get_stratified_sample(
    records=labelled_sample,
    total_sample_size=STRATIFIED_SAMPLE_SIZE,
    id_key="feedback_record_id",
    label_key="labels",
)

# NOTE(review): this displays raw feedback text; check the rendered output
# for personal data before committing or sharing the notebook.
stratified_sample

[{'feedback_record_id': '237482944',
  'concatenated_response_value': 'Attendance allowance This needs to be a form that can be submitted online, aswell as posting.',
  'labels': ['benefits claim', 'accessibility'],
  'labelling_method': 'human',
  'created': datetime.date(2024, 2, 12)},
 {'feedback_record_id': '237461690',
  'concatenated_response_value': 'To report change of phone number Cannot proceed with clain re gift aid until wrong phone number corrected',
  'labels': ['change details', 'gift aid'],
  'labelling_method': 'human',
  'created': datetime.date(2024, 2, 12)},
 {'feedback_record_id': '237509915',
  'concatenated_response_value': "To get my photocard driving licence renewed but I was not happy because he renewed it took the money for another licence that was it.then he rang back later and said it can't be renewed without applying at a post office. All.my details are on file .but I assume that I have to get a photo for a licence more than ten years old. But it seems tha

In [3]:
# Pull a small set of unlabelled feedback records from the publishing table.
#
# Fail fast when the table name is not configured: os.getenv returns None for
# an unset variable and str(None) would be substituted into the SQL below,
# so the query would reference a table named "None".
if not PUBLISHING_TABLE:
    raise ValueError("PUBLISHING_TABLE environment variable is not set")

# The query concatenates every response_value of a feedback record (ordered by
# created) into one string, keeps records created on or after 2024-01-01, and
# returns 5 of them ordered by a rand() column, so the selection is expected
# to differ between runs. The helper column `r` is part of each returned row.
unlabelled_data_query = """
SELECT
          feedback_record_id,
          STRING_AGG(response_value, ' '
          ORDER BY
            created) AS concatenated_response_value,
            rand() as r
        FROM
          @publishing_table
          WHERE DATE(created) >= "2024-01-01"
        GROUP BY
          feedback_record_id
        ORDER BY
          r
      LIMIT (5)"""
# "@publishing_table" is a plain-text placeholder replaced in Python before
# the query is sent.
unlabelled_data_query = unlabelled_data_query.replace(
    "@publishing_table",
    str(PUBLISHING_TABLE),
)
unlabelled_data = query_bigquery(
    PUBLISHING_PROJECT_ID,
    LABELLED_FEEDBACK_DATASET,
    unlabelled_data_query,
)
# NOTE(review): this displays raw feedback text; check the rendered output
# for personal data before committing or sharing the notebook.
unlabelled_data

[{'feedback_record_id': '236165404',
  'concatenated_response_value': 'To seek if I need a NINO-Card at 79yrs of age?',
  'r': 5.683694935303314e-05},
 {'feedback_record_id': '236499624',
  'concatenated_response_value': 'Trying to find out if this site has been updated',
  'r': 0.0001324155422643731},
 {'feedback_record_id': '236257986',
  'concatenated_response_value': "I live in South Africa and have taken a lump sum payout of a pension i had and j believe the tax is too much and need to do a tax refund query however I left the UK in 2012 so all my ID doesn't match your online system and doing it via post doesn't work from SA so I would like a consultant to contact me to d8scuss how I can go through this process less painfully. [PHONE_NUMBER] Please can someone call me on [PHONE_NUMBER] to discuss my small pension payout tax which j believe is do to be refunded to me \r\n[EMAIL_ADDRESS] \r\nI have expires drivers lic so your system doesn't allow me to get anywhere \r\n[PERSON_NAME]",

In [4]:
# Run both record sets through jsonify_data: the stratified human-labelled
# sample with labelled=True, then the unlabelled query results with
# labelled=False. The tuple is evaluated left to right, so the call order
# is unchanged.
labelled_subs_json, new_subs_json = (
    jsonify_data(records=stratified_sample, labelled=True),
    jsonify_data(records=unlabelled_data, labelled=False),
)

In [6]:
# Arguments handed to gather_responses (imported from
# src.utils.async_call_openai): the labelled examples, the new records and
# the OpenAI API key read from the environment.
gather_kwargs = {
    "labelled_subs_json": labelled_subs_json,
    "new_subs_json": new_subs_json,
    "open_api_key": OPENAI_API_KEY,
}
responses = gather_responses(**gather_kwargs)

In [None]:
# Average token usage across all responses. Each response is indexed by
# "prompt_tokens" and "completion_tokens"; the two lists are built in that
# order and the two means are printed one per line.
token_fields = ("prompt_tokens", "completion_tokens")
prompt_tokens, completion_tokens = (
    [response[field] for response in responses] for field in token_fields
)
mean_prompt_tokens = np.mean(prompt_tokens)
mean_completion_tokens = np.mean(completion_tokens)
print(mean_prompt_tokens, mean_completion_tokens, sep="\n")

In [None]:
# Print the "open_labelled_records" entry of every response, one per line.
# map() is lazy, so each entry is looked up just before it is printed.
for labelled_records in map(lambda item: item["open_labelled_records"], responses):
    print(labelled_records)