# Convert the Galaxy Zoo CSV dataset to a model-friendly JSON dataset

In [6]:
import http.client
import json
import os
import urllib
import urllib.parse
import urllib.request
from multiprocessing import Pool

import pandas as pd
from tqdm import tqdm

### Load dataset

In [2]:
# Folder holding the Galaxy Zoo files (machine-specific absolute path)
basename = "/home/michael/Workspace/datasets/galaxy_zoo/"

# The csv export to read; stop right away when it is not where expected
input_path = os.path.join(
    basename,
    "GZ_talk_comments_notes_urls_AISSAI.csv",
)
assert os.path.exists(input_path)

# The json file this notebook writes once the conversion is done
output_path = os.path.join(
    basename,
    "GZ_talk_comments_notes_urls_AISSAI.json",
)

# Load the whole csv into a dataframe
data = pd.read_csv(filepath_or_buffer=input_path)

In [3]:
# Bucket the rows of the dataframe by their "subject_id" value
grouped_data = data.groupby(by="subject_id")

# Will hold one dictionary per subject
grouped_data_list = []

In [4]:
def fetch_info_by_group(subject_id: float, group: pd.DataFrame) -> dict:
    """Build the training-friendly record for one subject.

    Args:
        subject_id: Identifier of the subject; it is truncated to an int and
            rendered as a string in the result.
        group: Rows belonging to that subject. The ``comment_body`` column
            supplies the conversation turns; the ``locations`` column holds a
            JSON object whose ``"0"`` entry is used as the image (only the
            first row is read).

    Returns:
        A dict with the keys ``"id"``, ``"image"`` and ``"conversations"``.
    """
    # Every comment of the subject, in row order
    sentences = group['comment_body'].tolist()

    # The image comes from the "0" entry of the first row's locations JSON
    first_location = json.loads(group['locations'].iloc[0])
    image_url = first_location["0"]

    # Drop the fractional part and store the identifier as text
    subject_key = str(int(subject_id))

    # Each comment becomes one turn attributed to "human"
    turns = []
    for sentence in sentences:
        turns.append({"from": "human", "value": sentence})

    record = {}
    record["id"] = subject_key
    record["image"] = image_url
    record["conversations"] = turns
    return record

In [5]:
# Wrap the existing groupby object in a progress bar; iterating it directly
# avoids grouping the dataframe a second time, and tqdm advances and closes
# the bar by itself as the groups are consumed
progress_bar = tqdm(grouped_data, total=len(grouped_data), desc="Processing")

# Rebuild the list from scratch so that re-running this cell does not
# append every subject a second time
grouped_data_list = [
    fetch_info_by_group(subject_id, group) for subject_id, group in progress_bar
]

# Convert the grouped data list to JSON
json_data = json.dumps(grouped_data_list, indent=4)

# Write the JSON data to a file
with open(output_path, 'w') as file:
    file.write(json_data)

Processing: 100%|██████████| 99591/99591 [00:04<00:00, 23328.66it/s]


### Download images

In [18]:
# Peek at the first entry to check its id and image url.
# The id is stored as `sample_id` so the built-in `id()` stays usable
# in the rest of the session.
url = grouped_data_list[0]['image']
sample_id = grouped_data_list[0]['id']
print(sample_id, url)

16215288 https://panoptes-uploads.zooniverse.org/subject_location/7653a6ff-ea1f-4cb0-bb86-9a6f5ce0c857.jpeg


In [25]:
def download_image(group_dict: dict, output_folder_image: str):
    """Download the image of one subject into ``output_folder_image``.

    The file is saved as ``<id><extension>``. The extension is taken from the
    path component of the url, so a query string or fragment does not end up
    in the file name. The destination path is printed before the download
    starts. The download is best effort: a failure is reported on stdout and
    the function returns normally.

    Args:
        group_dict: Mapping with an ``"image"`` url and a string ``"id"``.
        output_folder_image: Folder the image is written to.
    """
    url = group_dict["image"]
    subject_id = group_dict["id"]
    # Only the url path decides the extension (ignores "?query" / "#fragment")
    extension = os.path.splitext(urllib.parse.urlparse(url).path)[1]
    destination = os.path.join(output_folder_image, subject_id + extension)
    print(destination)
    try:
        urllib.request.urlretrieve(url, destination)
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError, socket and file-system errors;
        # ValueError covers malformed urls; HTTPException covers protocol
        # errors raised while the response is read. KeyboardInterrupt and
        # SystemExit are deliberately left alone so a batch can be stopped.
        print("Could not download image for subject ID " + subject_id)

In [26]:
# Try the download on the first entry, saving next to the dataset files
download_image(
    group_dict=grouped_data_list[0],
    output_folder_image=basename,
)

/home/michael/Workspace/datasets/galaxy_zoo/16215288.jpeg
