In [None]:
import glob
import pandas as pd
import os
import re

from astropy.io.votable import parse_single_table
import shutil

Load coordinate data

In [11]:
# CSV of source coordinates, located relative to the notebook.
sources_file = (
    '../data/Gaia DR3/queries/'
    'impartial sources coord-result.csv'
)

In [12]:
# Read the coordinate table (source_id, ra, dec).
# source_id is pinned to int64: the IDs are 19-digit integers, and if pandas
# ever inferred float64 for the column (e.g. because of a blank cell) they
# would be rounded silently and stop matching the IDs parsed from the spectrum
# file names in the later merge. With an explicit dtype such a file fails
# loudly at read time instead.
sources_df = pd.read_csv(sources_file, dtype={'source_id': 'int64'})
sources_df.head()

Unnamed: 0,source_id,ra,dec
0,5937173300407375616,250.247043,-51.593826
1,5937081353758582016,251.645316,-51.613334
2,5937201200518016768,250.622534,-51.278547
3,5937201200518017024,250.61624,-51.279443
4,5937081388118319616,251.626566,-51.6255


Combine coordinates and spectra

In [14]:
# Directory holding one VOTable (*.vot) file per source.
impartial_sources = '../data/Gaia DR3/spectra/XP_temp'

# glob.glob returns paths in arbitrary, filesystem-dependent order. Sort them so
# that slices such as files[:500] and any index-based batching of this list
# select the same files on every run.
files = sorted(glob.glob(f'{impartial_sources}/*.vot'))

In [15]:
# Prototype on the first 500 files: take the source_id from the trailing digits
# of each file name and pair it with the 'flux' column of that VOTable.
pattern = re.compile(r'(\d+)(?=\.vot$)')

# Collect one record per file and build the DataFrame once at the end.
# Calling pd.concat inside the loop re-copies all previously collected rows on
# every iteration (quadratic cost), and concatenating onto an empty
# object-dtype frame can leave source_id as object rather than int64.
records = []

for f in files[:500]:
    match = pattern.search(os.path.basename(f))
    if match is None:
        # File name does not end in "<digits>.vot"; skip it.
        continue

    source_id = int(match.group())
    impartial_data = parse_single_table(f).to_table().to_pandas()
    flux_array = impartial_data['flux'].values
    records.append({'source_id': source_id, 'flux': flux_array})

# Passing `columns` keeps both columns present even when no file matched.
spectra_df = pd.DataFrame(records, columns=['source_id', 'flux'])

In [16]:
# Second prototype (first 500 files): gather one record per spectrum file in a
# plain list and build the DataFrame from it in a single call.

pattern = re.compile(r'(\d+)(?=\.vot$)')
data = []

for vot_path in files[:500]:
    id_match = pattern.search(os.path.basename(vot_path))
    if not id_match:
        # File name does not end in "<digits>.vot"; nothing to record.
        continue
    spectrum_id = int(id_match.group())
    spectrum = parse_single_table(vot_path).to_table().to_pandas()
    data.append({
        'source_id': spectrum_id,
        'flux': spectrum['flux'].values,
    })

spectra_df = pd.DataFrame(data)

In [22]:
# Convert every spectrum VOTable in `files` to Parquet in fixed-size batches so
# that an interrupted run can be resumed without re-reading finished batches,
# then combine all batches into './final_combined.parquet' and `combined_df`.
#
# NOTE: a batch is identified only by its position in `files`, so resuming is
# correct only if `files` has the same contents and order as in the earlier run.

# Ensure the './temp' directory exists
os.makedirs('./temp', exist_ok=True)

pattern = re.compile(r'(\d+)(?=\.vot$)')
batch_size = 500

# Batch files that belong to this run, in processing order. Combining exactly
# these (instead of every *.parquet found in './temp') keeps stale batches from
# earlier runs out of the result and makes the row order deterministic.
batch_filenames = []

# Process files in batches, skipping already processed files
for i in range(0, len(files), batch_size):
    batch_files = files[i:i + batch_size]
    batch_filename = f'./temp/{i}_{i + len(batch_files)}.parquet'
    batch_filenames.append(batch_filename)

    # Skip processing if the batch file already exists
    if os.path.exists(batch_filename):
        print(f"Skipping batch {i}-{i + len(batch_files)}, already processed.")
        continue

    data = []

    for f in batch_files:
        match = pattern.search(os.path.basename(f))
        if match:
            try:
                source_id = int(match.group())
                impartial_data = parse_single_table(f).to_table().to_pandas()
                flux_array = impartial_data['flux'].values
                data.append({'source_id': source_id, 'flux': flux_array})
            except Exception as exc:
                # Best effort: report the unreadable file and carry on.
                # Catching Exception rather than using a bare `except` lets
                # KeyboardInterrupt (kernel interrupt) stop the run, and the
                # message now includes the reason for the failure.
                print(f"The following file cannot be read: {os.path.basename(f)} ({exc!r})")

    # Create a DataFrame for the current batch
    spectra_df = pd.DataFrame(data)

    # Save the batch to a Parquet file. Write under a temporary name and rename
    # afterwards so an interrupted write never leaves a truncated file that the
    # "already processed" check above would accept on the next run. The '.tmp'
    # suffix also keeps half-written files from looking like finished batches.
    partial_filename = batch_filename + '.tmp'
    spectra_df.to_parquet(partial_filename, index=False)
    os.replace(partial_filename, batch_filename)

    # Clear the list to free memory
    del data
    del spectra_df

# Combine all batch Parquet files of this run into one final Parquet file
combined_df = pd.concat((pd.read_parquet(f) for f in batch_filenames), ignore_index=True)

# Save the final combined DataFrame as Parquet
combined_df.to_parquet('./final_combined.parquet', index=False)

# Delete the './temp' directory and its contents
#shutil.rmtree('./temp')

# The temp directory is intentionally left in place (rmtree above is disabled),
# so the message must not claim that it was deleted.
print("Batch processing completed. All files combined into 'final_combined.parquet'. Temp directory kept.")

The following file cannot be read: XP_SAMPLED_Gaia_DR3_5888623436756684288.vot
Batch processing completed. All files combined into 'final_combined.parquet'. Temp directory deleted.


In [23]:
# Attach each source's flux array to its coordinates; the inner join keeps only
# the source_ids that appear in both tables.
result_df = sources_df.merge(combined_df, how='inner', on='source_id')

In [24]:
# Display the merged table (coordinates plus flux array per source).
result_df

Unnamed: 0,source_id,ra,dec,flux
0,5937173300407375616,250.247043,-51.593826,"[2.1760128e-17, 2.8403983e-17, 2.5246224e-17, ..."
1,5937081353758582016,251.645316,-51.613334,"[8.443173e-19, -4.5855413e-20, 2.3197077e-18, ..."
2,5937201200518016768,250.622534,-51.278547,"[1.0169807e-17, 6.2980932e-18, 4.4465553e-18, ..."
3,5937201200518017024,250.616240,-51.279443,"[-1.9316115e-18, -4.0893435e-18, 9.551702e-19,..."
4,5937081388118319616,251.626566,-51.625500,"[-7.4537416e-19, 7.237867e-19, 3.7499377e-18, ..."
...,...,...,...,...
1204488,5616972905625943168,110.344556,-24.053983,"[9.998206e-18, 8.298211e-18, 8.1598654e-18, 8...."
1204489,5616734311599382144,112.142247,-24.760026,"[2.0003316e-18, 3.2607307e-18, 2.7548696e-18, ..."
1204490,5616734345959112576,112.171708,-24.753497,"[5.3145056e-18, 1.7505754e-18, 1.4582408e-19, ..."
1204491,5616973073121307648,110.322070,-24.064154,"[6.4910025e-18, 2.7324015e-18, 2.9787787e-18, ..."


In [26]:
# Persist the merged coordinates + flux table next to the notebook.
result_df.to_parquet(path='./impartial_xp_coord.parquet')