In [40]:
from astropy.io import fits

# Path to your FITS file
# fits_file_path = '../../../projects/k-pop/catalogues/galahdr3-apogeedr17.fits'
fits_file_path = 'data/test/combined.fits'
# Open the FITS file
with fits.open(fits_file_path) as hdul:
    # Access the first extension (HDU 1) which contains the table
    data = hdul[1].data
    
    # Extract the 'APOGEE_ID' and 'sobject_id' columns
    apogee_ids = data['APOGEE_ID']
    galah_ids = data['sobject_id']
    # print(data)
    # Print the 'APOGEE_ID' and 'sobject_id' data for the first five entries
    print("Matched IDs:")
    for i in range(3):  # Ensure we don't go out of range if fewer than 5 entries
        print(f"APOGEE ID: {apogee_ids[i]}, GALAH ID: {galah_ids[i]}")

# This script will display the first five pairs of APOGEE and GALAH identifiers.


Matched IDs:
APOGEE ID: 2M04394779-5753520, GALAH ID: 131118002401015
APOGEE ID: 2M04390586-5752435, GALAH ID: 131118002401016
APOGEE ID: 2M04372258-5755132, GALAH ID: 131118002401023


In [36]:
from astropy.io import fits
import pandas as pd

# Load the FITS file
fits_path = '../../../projects/k-pop/catalogues/galahdr3-apogeedr17.fits'  # Replace with the path to your FITS file
with fits.open(fits_path) as hdul:
    data = hdul[1].data  # Assuming the data of interest is in the first extension

# Convert to a pandas DataFrame
df = pd.DataFrame(data)

# Print the first 5 rows of the DataFrame
print(df.head())


            star_id       sobject_id           APOGEE_ID   GAIAEDR3_SOURCE_ID  \
0  04394779-5753520  131118002401015  2M04394779-5753520  4774350796594075008   
1  04390586-5752435  131118002401016  2M04390586-5752435  4774349937600609664   
2  04372258-5755132  131118002401023  2M04372258-5755132  4774340797910201984   
3  04391862-5802387  131118002401025  2M04391862-5802387  4774324614473456640   
4  04384845-5813280  131118002401038  2M04384845-5813280  4774315852740197376   

          RA        DEC  
0  69.949141 -57.897781  
1  69.774443 -57.878754  
2  69.344086 -57.920361  
3  69.827605 -58.044109  
4  69.701901 -58.224445  


In [39]:
from astropy.io import fits
import pandas as pd

# Load the FITS file
fits_path = '../../../projects/k-pop/spectra/galah/dr3/1902250027013731.fits'  # Replace with the path to your FITS file
with fits.open(fits_path) as hdul:
    data = hdul[1].data  # Assuming the data of interest is in the first extension

# Convert to a pandas DataFrame
df = pd.DataFrame(data)

# Print the first 5 rows of the DataFrame
print(df.head())


          0
0  0.052054
1  0.050679
2  0.050785
3  0.050299
4  0.048695


In [41]:
from astropy.io import fits
from astropy.table import Table

# Path to the original and the new FITS file
original_fits_path = '../../../projects/k-pop/catalogues/galahdr3-apogeedr17.fits'
new_fits_path = 'data/test/combined.fits'

# Open the original FITS file
with fits.open(original_fits_path) as hdul:
    # Access the data in the first extension (HDU 1)
    data = hdul[1].data
    
    # Convert FITS data to an Astropy Table for easier manipulation
    table = Table(data)
    
    # Copy the first five rows
    new_table = table[:100]

    # Create a new HDU from the table
    hdu = fits.BinTableHDU(new_table)

    # Write the new FITS file with only the first five entries
    hdu.writeto(new_fits_path, overwrite=True)

print(f"Created new FITS file with  entries at {new_fits_path}")


Created new FITS file with  entries at data/test/combined.fits


In [42]:
import os
import shutil
from astropy.io import fits

# Path to the original FITS file
fits_file_path = 'data/test/combined.fits'
# Directory containing the APOGEE FITS files
source_dir = '../../../projects/k-pop/spectra/apogee/dr17'
# Destination directory
dest_dir = 'data/test/apogee/'

# Ensure the destination directory exists
os.makedirs(dest_dir, exist_ok=True)

# Open the FITS file and read APOGEE_IDs
with fits.open(fits_file_path) as hdul:
    apogee_ids = hdul[1].data['APOGEE_ID']

# Copy the first five corresponding FITS files
for apogee_id in apogee_ids:
    src_file_path = os.path.join(source_dir, f'aspcapStar-dr17-{apogee_id.strip()}.fits')
    dest_file_path = os.path.join(dest_dir, f'aspcapStar-dr17-{apogee_id.strip()}.fits')
    # Check if source file exists before copying
    if os.path.exists(src_file_path):
        shutil.copy(src_file_path, dest_file_path)
        print(f'Copied: {src_file_path} to {dest_file_path}')
    else:
        print(f'Source file not found: {src_file_path}')

print("Files have been copied.")


Copied: ../../../projects/k-pop/spectra/apogee/dr17/aspcapStar-dr17-2M04394779-5753520.fits to data/test/apogee/aspcapStar-dr17-2M04394779-5753520.fits
Copied: ../../../projects/k-pop/spectra/apogee/dr17/aspcapStar-dr17-2M04390586-5752435.fits to data/test/apogee/aspcapStar-dr17-2M04390586-5752435.fits
Copied: ../../../projects/k-pop/spectra/apogee/dr17/aspcapStar-dr17-2M04372258-5755132.fits to data/test/apogee/aspcapStar-dr17-2M04372258-5755132.fits
Copied: ../../../projects/k-pop/spectra/apogee/dr17/aspcapStar-dr17-2M04391862-5802387.fits to data/test/apogee/aspcapStar-dr17-2M04391862-5802387.fits
Copied: ../../../projects/k-pop/spectra/apogee/dr17/aspcapStar-dr17-2M04384845-5813280.fits to data/test/apogee/aspcapStar-dr17-2M04384845-5813280.fits
Copied: ../../../projects/k-pop/spectra/apogee/dr17/aspcapStar-dr17-2M04373299-5814393.fits to data/test/apogee/aspcapStar-dr17-2M04373299-5814393.fits
Copied: ../../../projects/k-pop/spectra/apogee/dr17/aspcapStar-dr17-2M04362390-5807107.f

In [7]:
import os
import shutil
from astropy.io import fits

# Path to the original FITS file
fits_file_path = 'data/test/combined.fits'
# Directory containing the APOGEE FITS files
source_dir = '../../../projects/k-pop/spectra/galah/dr3'
# Destination directory
dest_dir = 'data/test2/galah'

# Ensure the destination directory exists
os.makedirs(dest_dir, exist_ok=True)

# Open the FITS file and read APOGEE_IDs
with fits.open(fits_file_path) as hdul:
    galah_ids = hdul[1].data['sobject_id']

# Copy the first corresponding FITS file that is found
for galah_id in galah_ids:
    file_found = False
    for i in range(1, 5): 
        src_file_path = os.path.join(source_dir, f'{galah_id}{i}.fits')
        dest_file_path = os.path.join(dest_dir, f'{galah_id}{i}.fits')
        if os.path.exists(src_file_path):
            shutil.copy(src_file_path, dest_file_path)
            print(f'Copied: {src_file_path} to {dest_file_path}')
  

print("Files have been copied.")


Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010151.fits to data/test2/galah/1311180024010151.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010152.fits to data/test2/galah/1311180024010152.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010153.fits to data/test2/galah/1311180024010153.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010154.fits to data/test2/galah/1311180024010154.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010161.fits to data/test2/galah/1311180024010161.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010162.fits to data/test2/galah/1311180024010162.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010163.fits to data/test2/galah/1311180024010163.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010164.fits to data/test2/galah/1311180024010164.fits
Copied: ../../../projects/k-pop/spectra/galah/dr3/1311180024010231.fits to data/test2/ga

In [6]:
import os
import shutil
import random
from astropy.io import fits

# Path to the original FITS file
fits_file_path = 'data/test/combined.fits'
# Directory containing the GALAH FITS files
source_dir = '../../../projects/k-pop/spectra/galah/dr3'
# Destination directory
dest_dir = 'data/test2/galah'
# Ensure the destination directory exists
os.makedirs(dest_dir, exist_ok=True)

# Open the FITS file and read GALAH IDs
with fits.open(fits_file_path) as hdul:
    galah_ids = set(hdul[1].data['sobject_id'])

# Get all FITS files in the source directory
all_files = [f for f in os.listdir(source_dir) if f.endswith('.fits')]

# Filter files that are not in galah_ids
files_not_in_list = [f[:15] for f in all_files if f[:15] not in galah_ids]

# Randomly select 200 files (or all if less than 200)
files_to_copy = random.sample(files_not_in_list, min(200, len(files_not_in_list)))

# Prepare to copy all files that start with any of the selected filenames
files_to_copy_full = [f for f in all_files if any(f.startswith(prefix) for prefix in files_to_copy)]

# Copy the selected files
for file_name in files_to_copy_full:
    src_file_path = os.path.join(source_dir, file_name)
    dest_file_path = os.path.join(dest_dir, file_name)
    shutil.copy(src_file_path, dest_file_path)
    print(f'Copied: {src_file_path} to {dest_file_path}')

print(f"Copied {len(files_to_copy_full)} files that start with selected filenames.")


KeyboardInterrupt: 

In [4]:
import os
import shutil
import random
from astropy.io import fits

# Paths
fits_file_path = 'data/test/combined.fits'
source_dir = '../../../projects/k-pop/spectra/galah/dr3'
dest_dir = 'data/test2/galah'

# Make sure the destination directory exists
os.makedirs(dest_dir, exist_ok=True)

# Read the existing GALAH_IDs from the combined.fits file
with fits.open(fits_file_path) as hdul:
    existing_ids = set(hdul[1].data['sobject_id'])

# Function to get a batch of file names
def get_file_batch(directory, batch_size=1500):
    for i, filename in enumerate(os.listdir(directory)):
        if i % batch_size == 0 and i > 0:
            yield
        if filename.endswith('.fits'):
            yield filename

# Set to store unique GALAH IDs
new_ids = set()

# Process files in batches
for batch in get_file_batch(source_dir):
    if batch is None:  # This is just a yield to allow for breaking the loop
        if len(new_ids) >= 200:
            break
        continue
    
    galah_id = batch[:-5]  # Remove '.fits' and last character (channel number)
    if galah_id not in existing_ids:
        new_ids.add(galah_id)
    
    if len(new_ids) >= 200:
        break

# Randomly select 200 IDs (or all if less than 200 available)
selected_ids = random.sample(list(new_ids), min(200, len(new_ids)))

# Copy files for selected IDs
copied_count = 0
for galah_id in selected_ids:
    for i in range(1, 5):  # Channels 1 to 4
        source_file = os.path.join(source_dir, f"{galah_id}{i}.fits")
        if os.path.exists(source_file):
            dest_file = os.path.join(dest_dir, f"{galah_id}{i}.fits")
            shutil.copy2(source_file, dest_file)
            copied_count += 1

print(f"Copied {copied_count} files for {len(selected_ids)} unique GALAH IDs.")

Copied 0 files for 200 unique GALAH IDs.


In [None]:
import os
import shutil
from astropy.io import fits

# Paths
fits_file_path = 'data/test/combined.fits'
source_dir = '../../../projects/k-pop/spectra/apogee/dr17'
dest_dir = 'data/test2/apogee/'

# Ensure the destination directory exists
os.makedirs(dest_dir, exist_ok=True)

# Read the existing APOGEE_IDs from the combined.fits file to ensure they are not copied
existing_ids = set()
with fits.open(fits_file_path) as hdul:
    if 'APOGEE_ID' in hdul[1].columns.names:
        existing_ids = set(hdul[1].data['APOGEE_ID'])

# Initialize a counter for the number of files copied
file_count = 0

# Iterate through potential APOGEE_IDs and copy the first matching file
for root, dirs, files in os.walk(source_dir):
    for file in files:
        if file.endswith('.fits'):
            apogee_id = file.split('-')[-1].split('.')[0]  # Extract the APOGEE_ID from the filename
            # Check if the APOGEE_ID is not in the existing IDs and we have not reached 200 files yet
            if apogee_id not in existing_ids and file_count < 200:
                src_file_path = os.path.join(root, file)
                dest_file_path = os.path.join(dest_dir, file)  # Use original filename in the destination
                shutil.copy(src_file_path, dest_file_path)
                print(f'Copied: {src_file_path} to {dest_file_path}')
                file_count += 1
                existing_ids.add(apogee_id)  # Ensure no further files with this APOGEE_ID are copied
                # Break if 200 files have been copied
                if file_count >= 200:
                    break
    if file_count >= 200:
        break

print(f"Files copied: {file_count}")


In [10]:
import os
import shutil
import random
import re

def copy_files_based_on_galah_id(source_dir, destination_dir, num_samples=300):
    # Use a reservoir sampling algorithm to randomly select files from a large directory
    selected_files = []
    for i, file_name in enumerate(os.listdir(source_dir)):
        if os.path.isfile(os.path.join(source_dir, file_name)):
            if i < num_samples:
                selected_files.append(file_name)
            else:
                r = random.randint(0, i)
                if r < num_samples:
                    selected_files[r] = file_name

    # Now find and copy files with the matching galah_id
    for file in selected_files:
        # Extract the first 15 digits from the filename
        match = re.match(r"(\d{15})", file)
        if match:
            galah_id = match.group(1)
            # Copy files starting with this galah_id
            for f in os.listdir(source_dir):
                if f.startswith(galah_id):
                    src = os.path.join(source_dir, f)
                    dest = os.path.join(destination_dir, f)
                    shutil.copy2(src, dest)
                    print(f"Copied {f} to {destination_dir}")



if __name__ == "__main__":
    source_dir = '../../../projects/k-pop/spectra/galah/dr3'
    # Destination directory
    dest_dir = 'data/test2/galah'
    copy_files_based_on_galah_id(source_dir, dest_dir)


KeyboardInterrupt: 