## Imports

In [None]:
import subprocess
import pyzenodo3
import os
import pandas as pd
from pyunpack import Archive

## Set-Up Downloads

In [None]:
# Zenodo client; used below to look up the dataset records by DOI.
zen = pyzenodo3.Zenodo()

In [None]:
# DOI of each Zenodo record this notebook pulls files from.
links = dict(
    images="10.5281/zenodo.14223624",
    annotations="10.5281/zenodo.14673658",
)

In [None]:
# Resolve the image DOI to its record and keep only the record's file listing.
images_record = zen.find_record_by_doi(links["images"])
record_images = images_record.data['files']
print(f'Found Records for Images, is of type {type(record_images)}')

# Same lookup for the annotation DOI.
annotations_record = zen.find_record_by_doi(links["annotations"])
record_annotations = annotations_record.data['files']
print(f'Found Records for Annotations, is of type {type(record_annotations)}')

In [None]:
# Sub-folder (under "data/") used for each kind of download; every kind is
# stored in a folder that carries its own name.
naming_convention = {
    kind: kind for kind in ("nodules", "images", "annotations")
}

## Downloads the Record Images
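- Warning: downloading all files takes ~212GB of disk space.
- Set the limiter below to cap how many image files are downloaded; the nodule files are always downloaded regardless of the limiter.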

In [None]:
# Maximum number of non-nodule image files to download. If -1, download all files.
# Files whose name contains "nodule" are always downloaded, whatever this value is.
limit = 1

In [None]:
# Order the image records by file name so that `limit` always selects the
# same leading files, regardless of the order the records were returned in.
# (Plain sorted() also copes with an empty record list.)
record_images = sorted(record_images, key=lambda entry: entry['key'])

# Number of non-nodule files selected so far; compared against `limit`.
image_count = 0

for record in record_images:
    # File name within the record and its direct download URL.
    name = record['key']
    download_url = record['links']['self']

    if "nodule" in name:
        # Nodule files are always downloaded; `limit` does not apply to them.
        download_dir = naming_convention["nodules"]
    else:
        if limit != -1 and image_count >= limit:
            # Limit reached: skip the remaining image files.
            continue
        download_dir = naming_convention["images"]
        # Counted even if the file turns out to exist already (checked below).
        image_count += 1

    # Create the download directory
    download_dir = os.path.join("data", download_dir)
    os.makedirs(download_dir, exist_ok=True)

    # Check if the file already exists
    target_path = os.path.join(download_dir, name)
    if os.path.exists(target_path):
        print(f'{name} already exists. Skipping...')
        continue

    # Download into a temporary ".part" file and only move it into place once
    # wget reports success. `wget -O` creates the output file even when the
    # transfer fails, so writing straight to the final name would leave a
    # truncated file that the "already exists" check above skips forever.
    partial_path = target_path + ".part"
    try:
        # check=True raises CalledProcessError on a non-zero wget exit status.
        subprocess.run(["wget", "-O", partial_path, download_url], check=True)
        os.replace(partial_path, target_path)
    finally:
        # Never leave a partial file behind (failure or interruption); later
        # cells treat every file in this folder as an archive to extract.
        if os.path.exists(partial_path):
            os.remove(partial_path)

print(f'\n\n#############################')
print(f'##    Images Downloaded    ##')
print(f'#############################')

## Downloads Annotations

In [None]:
# Folder the annotation files are downloaded into.
download_dir = f'{naming_convention["annotations"]}'
download_dir = os.path.join("data", download_dir)
os.makedirs(download_dir, exist_ok=True)

for record in record_annotations:
    # File name within the record and its direct download URL.
    name = record['key']
    download_url = record['links']['self']

    # Check if the file already exists
    target_path = os.path.join(download_dir, name)
    if os.path.exists(target_path):
        print(f'{name} already exists. Skipping...')
        continue

    # Download into a temporary ".part" file and only move it into place once
    # wget reports success. `wget -O` creates the output file even when the
    # transfer fails, so writing straight to the final name would leave a
    # truncated file that the "already exists" check above skips forever.
    partial_path = target_path + ".part"
    try:
        # check=True raises CalledProcessError on a non-zero wget exit status.
        subprocess.run(["wget", "-O", partial_path, download_url], check=True)
        os.replace(partial_path, target_path)
    finally:
        # Never leave a partial file behind, on failure or interruption.
        if os.path.exists(partial_path):
            os.remove(partial_path)

print(f'\n\n############################')
print(f'## Annotations Downloaded ##')
print(f'############################')

## Unzip the Files

In [None]:
# Folder names configured for the two kinds of archive.
images_name = naming_convention["images"]
nodules_name = naming_convention["nodules"]

# Folders the archives were downloaded into. The trailing "/" is kept because
# later cells build file paths from these strings.
image_location = os.path.join("data", images_name) + "/"
nodule_location = os.path.join("data", nodules_name) + "/"

# Sub-folders whose contents are moved back up into the folders above once
# the archives have been extracted.
image_extracted_location = os.path.join("data", images_name, "luna25_images") + "/"
nodule_extracted_location = os.path.join("data", nodules_name, "luna25_nodule_blocks") + "/"

# Make sure both extraction sub-folders exist.
for extracted_dir in (image_extracted_location, nodule_extracted_location):
    os.makedirs(extracted_dir, exist_ok=True)

# Show all four locations, one per line.
print(
    f'Images: {image_location}',
    f'Nodules: {nodule_location}',
    f'Extracted Images: {image_extracted_location}',
    f'Extracted Nodules: {nodule_extracted_location}',
    sep="\n",
)

In [None]:
# Collect the names of the regular files sitting directly in each download
# folder; sub-directories (such as the extraction folders) are left out.
# NOTE: every regular file is treated as an archive by the cells below, so
# this cell should be run before anything has been extracted.
image_list = [entry.name for entry in os.scandir(image_location) if entry.is_file()]
nodule_list = [entry.name for entry in os.scandir(nodule_location) if entry.is_file()]

# Report how many files were found in each folder.
print(f'There are {len(image_list)} images and {len(nodule_list)} nodules.')

### Unzipping

In [None]:
# Extract every listed archive, using the folder the archive itself sits in
# as the extraction destination.
for image_name in image_list:
    archive_path = os.path.join(image_location, image_name)
    Archive(archive_path).extractall(image_location)
    print(f'Unzipped {image_name}')

# Same for the nodule archives.
for nodule in nodule_list:
    archive_path = os.path.join(nodule_location, nodule)
    Archive(archive_path).extractall(nodule_location)
    print(f'Unzipped {nodule}')

### Moving Files Back

In [None]:
# Move everything out of each extraction sub-folder into its parent download
# folder: images first, then nodules.
for extracted_dir, target_dir in (
    (image_extracted_location, image_location),
    (nodule_extracted_location, nodule_location),
):
    for entry in os.listdir(extracted_dir):
        source_path = os.path.join(extracted_dir, entry)
        destination_path = os.path.join(target_dir, entry)
        os.rename(source_path, destination_path)

### Deleting Old Zip Files

In [None]:
# Remove every file recorded in image_list from the image folder.
for image in image_list:
    archive_path = os.path.join(image_location, image)
    os.unlink(archive_path)
    print(f'Deleted {image}')

# Remove every file recorded in nodule_list from the nodule folder.
for nodule in nodule_list:
    archive_path = os.path.join(nodule_location, nodule)
    os.unlink(archive_path)
    print(f'Deleted {nodule}')