In [None]:
from google.cloud import bigquery
import pandas as pd
import json

In [None]:
def query_bigquery(project_id, dataset_id, query):
    """
    Run a SQL query against BigQuery and return the rows as a DataFrame.

    :param project_id: Google Cloud project used to create the BigQuery client
    :param dataset_id: Not used when running the query (the SQL names its own
        tables); kept in the signature so existing callers keep working
    :param query: SQL text to execute
    :return: DataFrame containing the rows returned by the query
    """
    # Initialize a BigQuery client bound to the requested project
    client = bigquery.Client(project=project_id)

    # Submit the query; to_dataframe() blocks until the job has finished and
    # then downloads the result, so no separate wait on the job is needed.
    query_job = client.query(query)
    return query_job.to_dataframe()

In [None]:
# NOTE(review): dataset_id repeats the project ID, while the query below reads
# from the `analysis` dataset — confirm which value is intended.
project_id = "govuk-user-feedback-dev"
dataset_id = "govuk-user-feedback-dev"
query_read = """
SELECT * FROM `govuk-user-feedback-dev.analysis.feedback_metrics` 
"""

# Pull the feedback metrics table into a DataFrame and report how many rows came back
feedback_df = query_bigquery(
    project_id=project_id, dataset_id=dataset_id, query=query_read
)
print(len(feedback_df))

In [None]:
# Preview the first rows of the query result
feedback_df.head()

In [None]:
# Order the rows in place by record ID and then by prompt value (ascending),
# so rows belonging to the same record sit together in prompt order.
sort_keys = ["feedback_record_id", "prompt_value"]
feedback_df.sort_values(by=sort_keys, ascending=True, inplace=True)

In [None]:
# Create (or refresh) the labels file that is filled in by hand

feedback_record_id = feedback_df["feedback_record_id"].tolist()

# Define the path for the output JSON file
output_json_file = "../data/labels.json"

# Re-running this cell must not wipe labels that were already entered, so
# start from whatever the file currently holds (nothing on the first run).
try:
    with open(output_json_file, "r") as json_file:
        existing_labels = json.load(json_file)
except FileNotFoundError:
    existing_labels = {}

# One entry per feedback record: keep a label that already exists, otherwise
# None (not labelled yet).
# Assumes feedback_record_id values are strings, like the JSON keys — TODO confirm.
data_dict = {key: existing_labels.get(key) for key in feedback_record_id}

# Keep labels for records that are absent from the current query result rather
# than silently discarding earlier manual work.
for key, label in existing_labels.items():
    data_dict.setdefault(key, label)

# Write the dictionary to a JSON file
with open(output_json_file, "w") as json_file:
    json.dump(data_dict, json_file, indent=4)

In [None]:
# Notebook display settings: never truncate cell text, show up to 200 rows
for option_name, option_value in (
    ("display.max_colwidth", None),
    ("display.max_rows", 200),
):
    pd.set_option(option_name, option_value)

In [None]:
# Remove any records with PII
import csv

# Collect every ID in the exclusions file, whether the IDs are laid out on one
# comma-separated row or one per line. Surrounding whitespace and empty cells
# are ignored, and an empty file simply yields no exclusions.
with open("../data/pii_exclusions.csv", "r", newline="") as file:
    pii_ids = [
        value.strip()
        for row in csv.reader(file, delimiter=",")
        for value in row
        if value.strip()
    ]

print(pii_ids)

# Drop, in place, every row whose record ID is listed in the exclusions file
feedback_df.drop(
    feedback_df[feedback_df.feedback_record_id.isin(pii_ids)].index, inplace=True
)

In [None]:
# Check records have been dropped: this is the row count after the PII
# exclusions, to compare with the count printed right after the query.
len(feedback_df)

In [None]:
# Combine all responses belonging to the same feedback record into a single
# space-separated string, to make prompting easier. Responses are joined in
# the frame's current row order, and every row of a record receives the same
# combined string.
# Missing responses are skipped and values are cast to str, so a null
# response_value does not make the join raise a TypeError.
feedback_df["concatenated_response_value"] = feedback_df.groupby("feedback_record_id")[
    "response_value"
].transform(lambda responses: " ".join(responses.dropna().astype(str)))

# Show a sample instead of printing the whole frame
feedback_df.head()

In [None]:
# Reduce to one row per distinct (record ID, combined response) pair
distinct_columns = ["feedback_record_id", "concatenated_response_value"]
feedback_distinct_df = feedback_df.loc[:, distinct_columns].drop_duplicates()
print(len(feedback_distinct_df))

In [None]:
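# Manually label samples: fill in the label for each feedback_record_id in
# ../data/labels.json, then run the cell below, which loads that file.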

In [None]:
# Apply labels

# Load the labels JSON file into a dictionary. The context manager closes the
# file even when parsing fails.
with open("../data/labels.json") as labels_file:
    labels_data = json.load(labels_file)

print(labels_data)

# One row per dictionary entry: the keys become the feedback_record_id column
# and the values become the labels column.
labels_df = (
    pd.DataFrame.from_dict(labels_data, orient="index", columns=["labels"])
    .reset_index()
    .rename(columns={"index": "feedback_record_id"})
)

In [None]:
# Attach a label to every distinct feedback record. A left join keeps records
# that have no entry in the labels frame.
output_columns = ["feedback_record_id", "concatenated_response_value", "labels"]
merged_df = pd.merge(
    feedback_distinct_df, labels_df, how="left", on="feedback_record_id"
)[output_columns]
print(len(merged_df))

In [None]:
# Preview the first rows of the merged feedback and labels
merged_df.head()

In [None]:
# Write labels data to new table
def write_to_bigquery(table_id: str, df: pd.DataFrame):
    """
    Load a DataFrame into a BigQuery table, replacing the table's contents.

    :param table_id: Destination table passed to the load job, e.g.
        "project.dataset.table"
    :param df: DataFrame to load
    :return: None; blocks until the load job has finished
    """
    # Initialize a BigQuery client (no project given, so the client default applies)
    client = bigquery.Client()

    # Define schema for the table
    # NOTE(review): a `labels` column is not listed here — confirm how the
    # client types DataFrame columns that are missing from the schema.
    schema = [
        bigquery.SchemaField("feedback_record_id", "STRING"),
        bigquery.SchemaField("concatenated_response_value", "STRING"),
    ]

    # WRITE_TRUNCATE overwrites any data already in the destination table
    job_config = bigquery.LoadJobConfig(
        schema=schema, write_disposition="WRITE_TRUNCATE"
    )

    # Write DataFrame to BigQuery
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)

    # Wait for the job to complete
    job.result()

    # Report the dataset the job wrote to, taken from the job itself rather
    # than from a notebook-level variable that is not a parameter here.
    print(f"Table {table_id} created in dataset {job.destination.dataset_id}")

In [None]:
table_id = "govuk-user-feedback-dev.analysis.feedback_metrics_distinct_labelled"

# write_to_bigquery accepts only the destination table and the DataFrame;
# passing project_id and dataset_id as well raises a TypeError.
write_to_bigquery(table_id, df=merged_df)