# Dataset creation

## Pre-processing

In [None]:
"""
End-to-end workflow to create a dataset in Argilla with text measurements as metadata.
This aids in quickly identifying and improving potential dataset issues.
"""
import pandas as pd
import textdescriptives as td
from datasets import load_dataset
import re

# --- Functions ---
def clean_column_name(col_name):
    """Normalise a column name to lowercase letters, digits and underscores.

    The name is lowercased first; every remaining character outside
    ``[a-z0-9_]`` (spaces, punctuation, non-ASCII letters, ...) is then
    replaced by a single underscore, so the length of the name is preserved.
    """
    lowered = col_name.lower()
    # One underscore per offending character; runs are not collapsed.
    return re.sub(r'[^a-z0-9_]', '_', lowered)

def create_metadata_properties(df, prefix):
    """Build one `rg` metadata property per supported column of ``df``.

    Each property is named ``<prefix>_<cleaned column name>`` (see
    ``clean_column_name``) and titled with that name, underscores turned into
    spaces and title-cased. The property class is picked from the column dtype:

    - ``object`` and ``bool``  -> ``rg.TermsMetadataProperty``
    - ``int64``                -> ``rg.IntegerMetadataProperty``
    - ``float64``              -> ``rg.FloatMetadataProperty``

    Columns of any other dtype are reported with ``print`` and skipped.

    Returns:
        list: the created properties, in dataframe column order.
    """
    collected = []
    for column, column_dtype in df.dtypes.items():
        prop_name = f"{prefix}_{clean_column_name(column)}"
        prop_title = prop_name.replace('_', ' ').title()

        # Pick the property class first, then instantiate it in one place.
        if column_dtype == 'object' or column_dtype == 'bool':
            factory = rg.TermsMetadataProperty
        elif column_dtype == 'int64':
            factory = rg.IntegerMetadataProperty
        elif column_dtype == 'float64':
            factory = rg.FloatMetadataProperty
        else:
            print(f"Unhandled data type for column {column}: {column_dtype}")
            continue

        collected.append(factory(name=prop_name, title=prop_title))
    return collected

def cast_to_python_types(df):
    """
    Re-cast integer columns with ``int`` and boolean columns with ``str``.

    The dataframe is modified in place and also returned. Columns are picked
    with ``select_dtypes`` (``'int64'`` and ``'boolean'``); every other column
    is left untouched.
    """
    # Both column sets are resolved up front, before any column is rewritten.
    conversions = (
        (df.select_dtypes(include=['int64']).columns, int),
        (df.select_dtypes(include=['boolean']).columns, str),
    )

    for columns, caster in conversions:
        for column in columns:
            # Element-wise conversion through the Python builtin.
            df[column] = df[column].apply(caster)

    return df

def detect_language(text):
    """
    Detect the language of a given text, falling back to "unknown".

    Args:
    - text (str): Input text.

    Returns:
    - str: Detected language (ISO 639-1 code) as returned by `detect`,
      or "unknown" when detection raises.
    """
    # NOTE(review): `detect` is not imported anywhere in the visible code
    # (presumably `langdetect.detect` -- confirm and add the import). Until it
    # is in scope, the NameError is caught below and every call returns
    # "unknown".
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback when detection fails. `Exception` rather than a
        # bare `except:` so KeyboardInterrupt / SystemExit still propagate.
        return "unknown"



# --- Data Collection ---
# NOTE(review): `rg` is used below but never imported in the visible code;
# presumably `import argilla as rg` ran earlier in the session -- confirm.
dataset = load_dataset("totally-not-an-llm/sharegpt-hyperfiltered-3k", split="train")
# Keep conversations that open with a human turn and have a second turn to
# use as the response (indexing [1] below would fail otherwise).
dataset = dataset.filter(
    lambda x: len(x["conversations"]) >= 2 and x["conversations"][0]["from"] == "human"
)
dataset = dataset.map(lambda x: {"prompt": x["conversations"][0]["value"], "response": x["conversations"][1]["value"]})

# Extract metrics (one row per text, one column per measurement)
df_prompt = td.extract_metrics(text=dataset["prompt"], lang="en").drop(columns=['text'])
df_response = td.extract_metrics(text=dataset["response"], lang="en").drop(columns=['text'])

# Identify integer and boolean columns for prompts and responses
int_cols_prompts = df_prompt.select_dtypes(include=['int64']).columns.tolist()
bool_cols_prompts = df_prompt.select_dtypes(include=['boolean']).columns.tolist()

int_cols_responses = df_response.select_dtypes(include=['int64']).columns.tolist()
bool_cols_responses = df_response.select_dtypes(include=['boolean']).columns.tolist()

# Combine column lists for prompts and responses
int_cols = list(set(int_cols_prompts + int_cols_responses))
bool_cols = list(set(bool_cols_prompts + bool_cols_responses))

# --- Metadata Preparation ---
metadata_prompt = create_metadata_properties(df_prompt, 'prompt')
metadata_response = create_metadata_properties(df_response, 'response')

all_metadata = metadata_prompt + metadata_response

ds = rg.FeedbackDataset.for_supervised_fine_tuning(context=True, use_markdown=True, guidelines=None)
for m in all_metadata:
    ds.add_metadata_property(m)

# --- Record Preparation ---

# Keep only columns that carry information for every record: not entirely
# zero/NaN, and without any missing value.
cols_with_values_other_than_zeros_or_nan_prompt = df_prompt.columns[~(df_prompt.fillna(0) == 0).all() & ~df_prompt.isnull().any()].tolist()
cols_with_values_other_than_zeros_or_nan_response = df_response.columns[~(df_response.fillna(0) == 0).all() & ~df_response.isnull().any()].tolist()


def _record_metadata(row, prefix, int_columns, bool_columns):
    """Turn one row of metrics into a metadata dict for a feedback record.

    Keys are built exactly like the property names in
    ``create_metadata_properties`` (``<prefix>_<cleaned column name>``) so each
    value lands on its declared property. Integer columns are cast with
    ``int`` and boolean columns with ``str`` (matching the terms property used
    for booleans); all other values are passed through unchanged.
    """
    prepared = {}
    for col, value in row.items():
        if col in int_columns:
            # Selecting a row across mixed dtypes can upcast integers.
            value = int(value)
        elif col in bool_columns:
            value = str(value)
        prepared[f"{prefix}_{clean_column_name(col)}"] = value
    return prepared


# Slice the informative columns once instead of once per record.
prompt_values = df_prompt[cols_with_values_other_than_zeros_or_nan_prompt]
response_values = df_response[cols_with_values_other_than_zeros_or_nan_response]

# Cast according to each frame's own dtypes rather than the combined lists.
prompt_int_set, prompt_bool_set = set(int_cols_prompts), set(bool_cols_prompts)
response_int_set, response_bool_set = set(int_cols_responses), set(bool_cols_responses)

# Prepare feedback records with metadata and suggestions
records = []
for i, record in enumerate(dataset):
    # Combine prompt and response metadata into one dictionary
    metadata = {
        **_record_metadata(prompt_values.iloc[i], "prompt", prompt_int_set, prompt_bool_set),
        **_record_metadata(response_values.iloc[i], "response", response_int_set, response_bool_set),
    }

    records.append(
        rg.FeedbackRecord(
            fields={"prompt": record["prompt"]},
            metadata=metadata,
            suggestions=[{"question_name": "response", "value": record["response"]}]
        )
    )

# Add records to the dataset and push to Argilla
ds.add_records(records)
ds.push_to_argilla(name="share-gpt-descriptives", workspace="admin")

## Questions

You can reuse the dataset https://huggingface.co/datasets/argilla/sharegpt-text-descriptives