# Extracting the required data from the dataset

The dataset should be located at **'data/Personal_Annotation_{annotator_number}.csv'** and must contain at least the following three columns:

1. **question_id**: This column identifies each entry based on its corresponding policy.

2. **english**: This column contains the data entries translated into English.

3. **Arguments**: This column contains annotations associated with each entry.

After running this notebook, the extracted data will be saved to a JSON file located at **'data_extraction/personal_annotation_{annotator_number}.json'**. The JSON file will have the following format:

```json
{
  "question_id": [],
  "english_text": [],
  "labels": []
}
```
Make sure to replace {annotator_number} in both file paths (the input CSV and the output JSON) with the number of the annotator whose data is being extracted.

In [None]:
import pandas as pd
import re
import math
import json
import os

In [None]:
# notebook settings
annotator_number = 1  # adjust as needed
number_of_annotations = 50  # adjust as needed

# read the personal annotation CSV file of the selected annotator
df = pd.read_csv(f'data/Personal_Annotation_{annotator_number}.csv')

# keep only the first `number_of_annotations` rows, then pull out each
# required column as a plain Python list
first_rows = df.head(number_of_annotations)
question_id = first_rows['question_id'].tolist()
english_text = first_rows['english'].tolist()
labels = first_rows['Arguments'].tolist()

# show the raw annotation values so they can be inspected
print(labels)

In [None]:
def get_personal_annotator_data(question_ids=None, texts=None, raw_labels=None):
    """
    Formats the extracted annotation data for export.

    Every raw annotation is scanned for parenthesized segments; each segment
    becomes one label, stripped of surrounding whitespace and lower-cased.
    Missing annotations (None or NaN) are encoded as ["None"], while
    annotations without any parenthesized segment yield an empty list.

    Args:
        question_ids: ids to export; defaults to the notebook-level `question_id`.
        texts: English texts to export; defaults to the notebook-level `english_text`.
        raw_labels: raw annotation values; defaults to the notebook-level `labels`.

    Returns: a dictionary with the keys 'question_id', 'english_text' and 'labels'.
    """
    # fall back to the lists prepared by the loading cell when no explicit
    # data is passed, so existing zero-argument calls keep working
    if question_ids is None:
        question_ids = question_id
    if texts is None:
        texts = english_text
    if raw_labels is None:
        raw_labels = labels

    # non-greedy match of everything between a '(' and the next ')'
    label_pattern = re.compile(r'\((.*?)\)')

    parsed_labels = []
    for value in raw_labels:
        is_missing = value is None or (isinstance(value, float) and math.isnan(value))
        if is_missing:
            parsed_labels.append(["None"])
            continue
        # str() guards against non-text cells (e.g. numbers), which would
        # otherwise make the regex search raise a TypeError
        row_labels = label_pattern.findall(str(value))
        parsed_labels.append([r.strip().lower() for r in row_labels])

    return {'question_id': question_ids, 'english_text': texts, 'labels': parsed_labels}

In [None]:
# build the formatted annotator data and the path it will be stored at
output_data = get_personal_annotator_data()
file_path = f'data_extraction/personal_annotation_{annotator_number}.json'

# make sure the target folder exists before writing
output_dir = os.path.dirname(file_path)
os.makedirs(output_dir, exist_ok=True)

# serialize the data and write it to the desired file_path
with open(file_path, 'w') as json_file:
    json_file.write(json.dumps(output_data))