In [1]:
# imports
import pandas as pd
import numpy as np
import os
import sys

# directory layout: everything lives under ~/workdir3
home_dir_path = os.path.expanduser("~")
work_dir_path = os.path.join(home_dir_path, 'workdir3')
code_dir_path, data_dir_path = (
    os.path.join(work_dir_path, sub_dir) for sub_dir in ('code', 'data')
)
final_table_csv_path = os.path.join(data_dir_path, 'SDSS_DR16_all.csv')

# make the modules in the code folder importable (inserted at position 1 of sys.path)
sys.path.insert(1, code_dir_path)

In [2]:
# load the full object table, pinning an explicit dtype for the listed columns
# (specobjid is read as text; ids such as 10594794674823909376 do not fit in a signed 64-bit int)
gs = pd.read_csv(
    final_table_csv_path,
    header=0,
    dtype=dict(
        specobjid=str,
        z=float,
        z_noqso=float,
        snMedian=float,
        run2d=str,
        ra=float,
        dec=float,
        plate=int,
        mjd=int,
        fiberid=int,
        waveMin=float,
        waveMax=float,
    ),
)

In [3]:
# high-SNR selection: order by snMedian (highest first), then keep rows with snMedian > 20
gs = (
    gs.sort_values('snMedian', ascending=False)
      .loc[lambda table: table['snMedian'] > 20]
)

# wavelength grid stored next to the data
wl_grid = np.load(os.path.join(data_dir_path, 'wl_grid.npy'))

In [4]:
# show the last rows of the table; after the descending sort on snMedian these are
# the lowest-SNR rows that passed the cut
gs.tail()

Unnamed: 0,specobjid,snMedian,run2d,ra,dec,plate,mjd,fiberid,waveMin,waveMax,z
2973151,10594794674823909376,20.00016,v5_13_0,7.065789,0.713201,9410,58069,278,3601.637,10327.61,0.30597
849365,2656066114152327168,20.00014,26,163.312,26.887018,2359,53826,248,3815.927,9191.788,0.021212
73770,495559817868371968,20.00009,26,124.66097,49.222123,440,51885,596,3824.723,9215.099,0.132998
966090,2947700589712140288,20.00009,26,197.94213,20.641125,2618,54506,344,3822.082,9215.099,0.139639
247246,721848111839864832,20.00005,26,323.68371,-6.832992,641,52176,532,3809.781,9206.615,0.159956


In [5]:
# rough size estimate: one 8-byte value per (row, wavelength-grid point), reported in units of 1024**3 bytes
n_values = len(gs) * len(wl_grid)
print('Estimated dataset size: %.1fGB' % (n_values * 8 / 1024**3))

Estimated dataset size: 21.1GB


In [6]:
# trial run: keep only the first 12000 rows (positional slice) to exercise the code
gs = gs.iloc[:12000]

In [7]:
# create a wrapper that returns the index also
from pre_processing import download_spectrum
def download_spectrum_wrapper(i):
    """Run ``download_spectrum`` for row ``i`` of ``gs`` and tag the result with ``i``.

    The row is looked up positionally (``gs.iloc[i]``) and processed with
    ``pre_proc=True`` on the notebook-level ``wl_grid``.

    Returns:
        tuple: ``(i, spec, ivar)`` -- the first and third values returned by
        ``download_spectrum``; the middle value is discarded.
    """
    row = gs.iloc[i]
    spec_i, _, ivar_i = download_spectrum(row, pre_proc=True, wl_grid=wl_grid)
    return i, spec_i, ivar_i

In [8]:
# download and preprocess every row of gs through joblib, using threads
from itertools import compress
from joblib import Parallel, delayed

runner = Parallel(n_jobs=-1, verbose=5, prefer="threads")
res = runner(delayed(download_spectrum_wrapper)(i) for i in range(len(gs)))

# filter the good results only (an exception encountered during download returns empty arrays)
res = sorted(res, key=lambda item: item[0])   # order by the row index carried in each result
goodRes = [len(item[1]) > 0 for item in res]  # True where a non-empty spectrum came back
gs = gs[goodRes].reset_index(drop=True)       # keep matching rows, renumbered 0..n-1
res = list(compress(res, goodRes))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   50.2s
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 850 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1426 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 2146 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2560 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 3010 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 3496 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 4576 tasks    

In [9]:
# save the dataframe
autoencoder_dataset_dir = os.path.join(data_dir_path,'HighSNR_12K_V1')
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create race,
# safe to re-run, and missing parent directories are created as well
os.makedirs(autoencoder_dataset_dir, exist_ok=True)
# the filtered, re-indexed table is pickled next to the arrays saved by the following cells
dataset_dataframe_path = os.path.join(autoencoder_dataset_dir,'gs.pkl')
gs.to_pickle(dataset_dataframe_path)

In [10]:
# save the spectra: element 1 of every result, stacked along a new leading axis
spec = np.stack([entry[1] for entry in res])
spec_npy_path = os.path.join(autoencoder_dataset_dir, 'spec.npy')
np.save(spec_npy_path, spec)

In [11]:
# save the ivar: element 2 of every result, stacked along a new leading axis
ivar = np.stack([entry[2] for entry in res])
ivar_npy_path = os.path.join(autoencoder_dataset_dir, 'ivar.npy')
np.save(ivar_npy_path, ivar)

# Splitting data from res (Memory issues)

In [None]:
import pickle
# reload the pickled results list from disk
# NOTE: this rebinds autoencoder_dataset_dir to 'AutoEncoderDataset' (the cells above used 'HighSNR_12K_V1')
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by this pipeline
autoencoder_dataset_dir = os.path.join(data_dir_path, 'AutoEncoderDataset')
res_path = os.path.join(autoencoder_dataset_dir, 'res.pkl')
with open(res_path, mode='rb') as res_file:
    res = pickle.load(res_file)

In [None]:
# pull element 2 out of every result, converted to float32
ivar_list = [
    entry[2].astype(np.float32)
    for entry in res
]

In [None]:
# drop the notebook's reference to the full results list; the float32 copies in ivar_list are kept
del res

In [None]:
import gc
# force a garbage-collection pass right after `del res` (presumably to reclaim its memory);
# as the cell's last expression, the value returned by gc.collect() is displayed
gc.collect()

In [None]:
# combine the per-result float32 arrays into one array along a new leading axis
ivar = np.stack(ivar_list)

In [None]:
# the stacked `ivar` array now holds the data, so drop the list of per-result arrays
del ivar_list

In [None]:
import gc
# second garbage-collection pass, after `del ivar_list` (presumably to reclaim its memory)
gc.collect()

In [None]:
# write the stacked float32 ivar array as 'ivar.npy' inside the current autoencoder_dataset_dir
ivar_npy_path = os.path.join(autoencoder_dataset_dir,'ivar.npy')
np.save(ivar_npy_path, ivar)

In [None]:
import sys

# memory debugging aid: print each name in the current namespace with its sys.getsizeof() value
# (getsizeof reports the object's own size only, not the objects it refers to).
# The items are copied into a list first so the loop variables created below
# cannot change the namespace dict while it is being iterated.
local_vars = list(locals().items())
for var, obj in local_vars:
    print(f"{var} {sys.getsizeof(obj)}")

In [None]:
import psutil
# the 'available' field of psutil's virtual-memory report, divided by 1024**3
psutil.virtual_memory().available / 1024**3