# Converts Parquet data to JSONL format

### HuggingFace dataset
Data prepared from the following dataset:

https://huggingface.co/datasets/Arsive/toxicity_classification_jigsawD output IS NOT NULL;

### Cohere console for Fine Tuning
https://dashboard.cohere.com/fine-tuning/create?endpoint=classify

### Cohere fine-tuning data requirements
https://docs.cohere.com/docs/classify-preparing-the-data


### HuggingFace SQL console
https://huggingface.co/blog/sql-console


### SQL used for downloading the data
PS: Deliberately removed the rows that have *obscene*=1

### Code
PS: This can also be done with a DuckDB struct pack feature

* Used SQL console to download 100 rows from the dataset
* Used the code below to convert the dataset from Parquet to JSONL format
* Renamed the attributes, as per Cohere's multi-label dataset requirement

In [13]:
import pandas as pd
import json

# Load the parquet file into a pandas DataFrame
parquet_file_train = "c:\\Users\\raj\\Downloads\\multi_label_comment_classification_train.parquet"
parquet_file_test = "c:\\Users\\raj\\Downloads\\multi_label_comment_classification_test.parquet"



## Cohere dataset

In [2]:
# List of labels to check
label_columns = ["toxic", "severe_toxic", "threat", "insult", "identity_hate"]

# Function to process each row
def generate_output_cohere(row):
       
    # Extract the comment text
    text = row['comment_text']
    
    # Create a list of labels where the value is 1
    labels = [label for label in label_columns if row[label] == 1]
    
    # Format as desired output
    return {"text": text, "label": labels}

def read_parquet_generate_jsonl_cohere(parquet_file, output_file):
    # Generate the training set
    df = pd.read_parquet(parquet_file)
    output_df = df.apply(generate_output_cohere, axis=1)
    # output = output_df.to_list()

    # Convert the DataFrame to a JSONL file (one JSON object per line)
    
    output_df.to_json(output_file, orient='records', lines=True)

    print(f"Successfully converted {parquet_file_train} to {output_file}")

# Print the output in the desired format
# for entry in output:
#     print(entry)

#### Training set

* Run the following against the **train** set
* Download the parquet file
* Rename file to : multi_label_comment_classification_train.parquet
 
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM train  where obscene=0 and toxic=1 LIMIT 10
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM train  where obscene=0 and severe_toxic=1 LIMIT 10
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM train  where obscene=0 and threat=1 LIMIT 10
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM train  where obscene=0 and identity_hate=1 LIMIT 10
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM train  where obscene=0 and insult=1 LIMIT 10
);

In [3]:
jsonl_file_train = "c:\\Users\\raj\\Downloads\\multi_label_comment_classification_train_cohere.jsonl"
read_parquet_generate_jsonl_cohere(parquet_file_train, jsonl_file_train)

Successfully converted c:\Users\raj\Downloads\multi_label_comment_classification_train.parquet to c:\Users\raj\Downloads\multi_label_comment_classification_train_cohere.jsonl


#### Validation

* Run the following against the **test** set
* Download the parquet file
* Rename file to : multi_label_comment_classification_test.parquet

SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM test  where obscene=0 and toxic=1 LIMIT 5
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM test  where obscene=0 and severe_toxic=1 LIMIT 5
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM test  where obscene=0 and threat=1 LIMIT 5
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM test  where obscene=0 and identity_hate=1 LIMIT 5
)
UNION
SELECT comment_text, toxic, severe_toxic, threat, insult, identity_hate  FROM (
   SELECT * FROM test  where obscene=0 and insult=1 LIMIT 5
);

In [5]:
jsonl_file_test = "c:\\Users\\raj\\Downloads\\multi_label_comment_classification_test_cohere.jsonl"
read_parquet_generate_jsonl_cohere(parquet_file_test, jsonl_file_test)

Successfully converted c:\Users\raj\Downloads\multi_label_comment_classification_train.parquet to c:\Users\raj\Downloads\multi_label_comment_classification_test_cohere.jsonl


## OpenAI GPT 

https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

* Requires the dataset to be in chat message format
* For non-chat use cases such as classification, use single-turn format with 3 messages ["system", "user", "assistant"]
  e.g., ```{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}```

In [20]:
# Function to process each row
def generate_output_openai(row):
       
    # Extract the comment text
    text = row['comment_text']
    
    # Create a list of labels where the value is 1
    labels = [label for label in label_columns if row[label] == 1]

    system_message = "you will categorize the user's input into one or more categories: "+str(["toxic", "severe_toxic", "threat", "insult", "identity_hate"])

    # The output must be in string format i.e., can't be array so use json.dumps to convert to 
    json_l = {
        "messages":[
            {
                "role": "system",
                "content": system_message
            },{
                "role": "user",
                "content": text
            },{
                "role": "assistant",
                "content": json.dumps(labels)
            }
        ]
    }
    
    # Format as desired output
    # return json.dumps(json_l)
    return json_l


def read_parquet_generate_jsonl_openai(parquet_file, output_file):
    # Generate the training set
    df = pd.read_parquet(parquet_file)
    output_df = df.apply(generate_output_openai, axis=1)
    # output = output_df.to_list()

    # Convert the DataFrame to a JSONL file (one JSON object per line)
    
    output_df.to_json(output_file, orient='records', lines=True)

    print(f"Successfully converted {parquet_file_train} to {output_file}")

## Training data

In [21]:
jsonl_file_train = "c:\\Users\\raj\\Downloads\\multi_label_comment_classification_train_openai.jsonl"
read_parquet_generate_jsonl_openai(parquet_file_train, jsonl_file_train)

Successfully converted c:\Users\raj\Downloads\multi_label_comment_classification_train.parquet to c:\Users\raj\Downloads\multi_label_comment_classification_train_openai.jsonl


## Validation data

In [22]:
jsonl_file_test = "c:\\Users\\raj\\Downloads\\multi_label_comment_classification_test_openai.jsonl"
read_parquet_generate_jsonl_openai(parquet_file_test, jsonl_file_test)

Successfully converted c:\Users\raj\Downloads\multi_label_comment_classification_train.parquet to c:\Users\raj\Downloads\multi_label_comment_classification_test_openai.jsonl
