In [1]:
import numpy as np
import pandas as pd
from glob import glob
import h5py
import pyarrow.parquet as pq
import pyarrow as pa
import re

from pyspark import SparkContext
from pyspark.sql import SparkSession
from itertools import chain
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [2]:
# Collect the paths of all session HDF5 files and keep them in sorted order.
my_dir = glob('project_data/LH014/hdf5/*hdf5')
my_dir.sort()
my_dir

['project_data/LH014/hdf5/101.hdf5',
 'project_data/LH014/hdf5/111.hdf5',
 'project_data/LH014/hdf5/121.hdf5',
 'project_data/LH014/hdf5/131.hdf5',
 'project_data/LH014/hdf5/141.hdf5',
 'project_data/LH014/hdf5/151.hdf5',
 'project_data/LH014/hdf5/181.hdf5',
 'project_data/LH014/hdf5/191.hdf5',
 'project_data/LH014/hdf5/221.hdf5',
 'project_data/LH014/hdf5/231.hdf5',
 'project_data/LH014/hdf5/241.hdf5',
 'project_data/LH014/hdf5/251.hdf5',
 'project_data/LH014/hdf5/261.hdf5',
 'project_data/LH014/hdf5/271.hdf5']

In [3]:
# Channel width = total bandwidth (19.6) divided by the number of channels (1024).
freq_range_per_channel = 19.600 / 1024

# Map each 1-based channel label to a frequency: the tuning's start frequency
# (20.0 for tuning 1, 28.0 for tuning 2) plus a whole number of channel widths.
channel_dict_tune1 = {
    f'channel_{number}': 20.0 + (number - 1) * freq_range_per_channel
    for number in range(1, 1025)
}
channel_dict_tune2 = {
    f'channel_{number}': 28.0 + (number - 1) * freq_range_per_channel
    for number in range(1, 1025)
}

In [10]:
# Peek at the 'V' polarization dataset of Tuning1 for session 111.
# The file is opened read-only inside a context manager so the HDF5 handle is
# closed as soon as the data has been copied into an in-memory numpy array.
with h5py.File('project_data/LH014/hdf5/111.hdf5', 'r') as hdf_file:
    tuning1_v = np.array(hdf_file['Observation1']['Tuning1']['V'])
tuning1_v

array([[-8.9263916e-04,  9.7102877e-03, -4.2862776e-03, ...,
        -6.5187383e-03, -2.2303991e-03,  3.9499071e-03],
       [ 2.2989910e-03,  1.2052207e-03, -2.6020512e-03, ...,
         9.8421508e-03, -3.3510795e-03,  5.3476426e-03],
       [ 5.8492024e-05, -9.9164061e-04,  3.3063006e-03, ...,
         3.6270628e-03, -3.5055906e-03, -6.6501158e-03],
       ...,
       [-9.6054077e-03, -3.8293405e-03, -4.6943733e-03, ...,
        -5.4656100e-03, -1.3781940e-03,  3.5422246e-03],
       [ 1.7267863e-03,  3.1976772e-03, -1.0957450e-03, ...,
         3.6524907e-03,  7.9040052e-03, -6.9124199e-04],
       [-5.2439370e-03, -6.2004286e-03, -7.1238508e-03, ...,
         4.2944537e-03,  1.7950051e-03, -3.4226263e-03]], dtype=float32)

In [4]:
def flatten_to_parquet(session_number):
    """Convert one session's HDF5 data into long-format parquet files.

    Reads ``project_data/LH014/hdf5/<session_number>.hdf5`` and, for every
    combination of tuning (``Tuning1``, ``Tuning2``) and polarization
    (``I``, ``V``), takes the first 1024 columns of
    ``Observation1/<tuning>/<polarization>`` together with
    ``Observation1/time``, melts them into one row per (time, channel) pair
    and writes the result, in batches of at most 50,000 time samples, to
    ``project_data/LH014/parquets/<session>_<tuning>_<pol>_<batch>.parquet``
    (``<batch>`` is 0-based).

    Every output file has the columns ``time``, ``channel``, ``value``,
    ``tuning``, ``polarization`` and ``frequency``. ``frequency`` is looked up
    from the module-level ``channel_dict_tune1`` (for ``Tuning1``) or
    ``channel_dict_tune2`` (for ``Tuning2``).

    Parameters
    ----------
    session_number : int or str
        Session identifier; used as the HDF5 file stem and as the prefix of
        the output file names.

    Returns
    -------
    None
        Results are written to disk. After each batch a
        ``(tuning, polarization, batches_written)`` tuple is printed.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    filename = 'project_data/LH014/hdf5/' + str(session_number) + '.hdf5'
    out_dir = 'project_data/LH014/parquets'
    # DataFrame.to_parquet does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    n_channels = 1024

    # Open the file once, read-only, and close it deterministically instead of
    # leaking a new handle for every dataset that is read.
    with h5py.File(filename, 'r') as hdf_file:
        observation = hdf_file['Observation1']

        # Column vector so it can be appended to the 2-D data array below.
        t = np.array(observation['time'])
        t = t[:, np.newaxis]

        for tune in ['Tuning1', 'Tuning2']:
            # The channel -> frequency mapping depends only on the tuning.
            channel_freqs = channel_dict_tune1 if tune == 'Tuning1' else channel_dict_tune2

            for pol in ['I', 'V']:
                data = np.array(observation[tune][pol])
                # Time becomes the last column; numpy promotes both inputs to a
                # common dtype, which is what the files written so far contain.
                np_arr = np.concatenate((data, t), axis=1)

                my_dict = {'channel_' + str(i + 1): np_arr[:, i] for i in range(n_channels)}
                my_dict['time'] = np_arr[:, -1]
                pa_table = pa.table(my_dict)

                # Melting multiplies the row count by the number of channels,
                # so the wide table is processed in slices of limited size.
                for counter, batch in enumerate(pa_table.to_batches(max_chunksize=50_000)):
                    df = batch.to_pandas()
                    # var_name must be a scalar for non-MultiIndex columns.
                    df = df.melt(id_vars=['time'], var_name='channel')
                    df['tuning'] = tune
                    df['polarization'] = pol
                    df['frequency'] = df.channel.map(channel_freqs)
                    df.to_parquet(f'{out_dir}/{session_number}_{tune}_{pol}_{counter}.parquet')
                    print((tune, pol, counter + 1))
                
                     

In [None]:
%%time
# Convert a single session (241) on its own; %%time reports how long it takes.
flatten_to_parquet(241)

In [6]:
# Sessions that already have parquet output. A session counts as done once a
# Tuning1/I file whose batch index starts with 1 exists for it.
# The directory name must be spelled exactly like the one the files are written
# to ('project_data', lower-case); otherwise nothing matches on a
# case-sensitive filesystem and every session is converted again.
done_files = glob('project_data/LH014/parquets/*_Tuning1_I_1*')
pattern_done = r"parquets/(\d+)_Tuning1"

# Extract the session number from each path, drop duplicates, sort for a stable display.
done_dir = sorted({int(re.findall(pattern_done, path)[0]) for path in done_files})
done_dir

[]

In [7]:
# Session numbers of every HDF5 input file, minus the sessions that were
# already converted. (`re` is imported in the notebook's first cell.)
pattern = r"hdf5\/(\d+)\.hdf5"
already_done = set(done_dir)
to_do = [int(re.findall(pattern, x)[0]) for x in my_dir]
to_do = [x for x in to_do if x not in already_done]
to_do

[101, 111, 121, 131, 141, 151, 181, 191, 221, 231, 241, 251, 261, 271]

In [8]:
%%time
# Convert every pending session, printing its number before the conversion starts.
for session in to_do:
    print(session)
    flatten_to_parquet(session)

101
('Tuning1', 'I', 1)
('Tuning1', 'I', 2)
('Tuning1', 'I', 3)
('Tuning1', 'I', 4)
('Tuning1', 'I', 5)
('Tuning1', 'I', 6)
('Tuning1', 'I', 7)
('Tuning1', 'I', 8)
('Tuning1', 'V', 1)
('Tuning1', 'V', 2)
('Tuning1', 'V', 3)
('Tuning1', 'V', 4)
('Tuning1', 'V', 5)
('Tuning1', 'V', 6)
('Tuning1', 'V', 7)
('Tuning1', 'V', 8)
('Tuning2', 'I', 1)
('Tuning2', 'I', 2)
('Tuning2', 'I', 3)
('Tuning2', 'I', 4)
('Tuning2', 'I', 5)
('Tuning2', 'I', 6)
('Tuning2', 'I', 7)
('Tuning2', 'I', 8)
('Tuning2', 'V', 1)
('Tuning2', 'V', 2)
('Tuning2', 'V', 3)
('Tuning2', 'V', 4)
('Tuning2', 'V', 5)
('Tuning2', 'V', 6)
('Tuning2', 'V', 7)
('Tuning2', 'V', 8)
111
('Tuning1', 'I', 1)
('Tuning1', 'I', 2)
('Tuning1', 'I', 3)
('Tuning1', 'I', 4)
('Tuning1', 'I', 5)
('Tuning1', 'I', 6)
('Tuning1', 'I', 7)
('Tuning1', 'I', 8)
('Tuning1', 'I', 9)
('Tuning1', 'V', 1)
('Tuning1', 'V', 2)
('Tuning1', 'V', 3)
('Tuning1', 'V', 4)
('Tuning1', 'V', 5)
('Tuning1', 'V', 6)
('Tuning1', 'V', 7)
('Tuning1', 'V', 8)
('Tuning1', 

In [None]:
# Start (or reuse) a Spark session with 2g of driver memory.
# Other resource settings noted during experimentation, currently not applied:
#   spark.executor.memory = 3g / 8g, spark.executor.cores = 3 / 7,
#   spark.cores.max = 3, spark.driver.memory = 8g
spark = SparkSession.builder.appName("Your App Name").config("spark.driver.memory", "2g").getOrCreate()

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Load every parquet file in the output directory whose name starts with "111"
# into a single Spark DataFrame, then print its column names and types.
df = spark.read.parquet('project_data/LH014/parquets/111*.parquet')
df.printSchema()

In [None]:
# Total number of rows across all parquet files loaded into df.
df.count()

In [None]:
# Print the first rows of df as a table for a quick visual check.
df.show()

In [None]:
# Number of distinct channel labels; flatten_to_parquet writes channel_1 .. channel_1024.
df.select('channel').distinct().count()

In [None]:
# Number of distinct frequency values (mapped from the channel label, per tuning).
df.select('frequency').distinct().count()

In [None]:
# Number of distinct polarization labels; flatten_to_parquet writes 'I' and 'V'.
df.select('polarization').distinct().count()

In [None]:
# Number of distinct tuning labels; flatten_to_parquet writes 'Tuning1' and 'Tuning2'.
df.select('tuning').distinct().count()

In [None]:
# Shut down the Spark session created earlier in the notebook.
spark.stop()