# Forming the Data set

In [5]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import torch
import time
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import SubsetRandomSampler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import optuna
import os
import time
import Pk_library as PKL
import density_field_library as DFL
import smoothing_library as SL

class data_gen(Dataset):
    """Dataset of power spectra measured from 2D Gaussian fields.

    For every sample ``i`` in ``range(n)`` a 64x64 field is generated with
    ``DFL.gaussian_field_2D`` from the input spectrum ``P(k) = A / sqrt(k)``
    (random seed ``i``), optionally clipped at a density threshold, and its
    power spectrum is measured with ``PKL.Pk_plane``.  The measured spectrum
    (times ``mult_factor``) is the feature vector and the amplitude ``A``,
    rescaled from [0.8, 1.2] to [0, 1], is the label.

    Parameters
    ----------
    n : int
        Number of fields to generate before the train/valid/test split.
    mode : str
        Which slice of the ``n`` samples to keep: ``'train'`` (first 70%),
        ``'valid'`` (next 15%), ``'test'`` (last 15%) or ``'all'``.
    case : str
        ``'original'`` leaves the field untouched; ``'min'`` raises every
        pixel below the threshold up to it; any other value lowers every
        pixel above the threshold down to it.
    dens_cut_str : str
        Threshold, converted with ``float``.  Ignored when
        ``case == 'original'``.
    mult_factor : float
        Factor applied to every measured spectrum.
    A_true : float or None
        Fixed amplitude for all samples; if ``None`` each sample draws its
        amplitude with ``np.random.uniform(0.8, 1.2)``.

    Raises
    ------
    Exception
        If ``mode`` is not one of the four names above.
    """

    # mode -> (offset fraction, size fraction) of the n generated samples.
    _SPLITS = {
        'train': (0.00, 0.70),
        'valid': (0.70, 0.15),
        'test':  (0.85, 0.15),
        'all':   (0.00, 1.00),
    }

    # Bounds of the amplitude prior; also used to rescale A to [0, 1].
    _A_MIN, _A_MAX = 0.8, 1.2

    # Fixed constants used to standardise the spectra.
    # NOTE(review): presumably precomputed from a reference data set - confirm.
    _PK_MEAN, _PK_STD = 3.2179966297277343, 1.9581409307305626

    # Length of each stored spectrum.
    # NOTE(review): must match the number of bins PKL.Pk_plane returns for
    # the 64x64 grid used below - confirm if the grid is ever changed.
    _N_BINS = 45

    def __init__(self, n, mode, case, dens_cut_str, mult_factor, A_true):
        super().__init__()

        # Validate the split name first so that a typo does not cost a full
        # data-generation pass before failing.
        if mode not in self._SPLITS:
            raise Exception('Wrong name!')
        offset_frac, size_frac = self._SPLITS[mode]
        offset, size_spec = int(offset_frac*n), int(size_frac*n)

        # Field-generation settings (identical for every sample).
        grid              = 64        #pixel dimensions (grid*grid)
        BoxSize           = 1000      #Mpc/h (Boxsize = field image boxsize)
        Rayleigh_sampling = 1         #whether sampling the Rayleigh distribution for modes amplitudes
        threads           = 1         #single OpenMP thread
        verbose           = False     #choosing not to display information
        MAS               = 'None'

        #generating set of k values : [3*kf, 4*kf, 5*kf........kmax]
        kf = 7e-03
        kmax = 0.9
        k = np.arange(3*kf, kmax, kf).astype(np.float32)
        # sqrt is taken in float32 (as k is float32); the division by the
        # amplitude is done in float64 and only the result is cast down.
        sqrt_k = np.sqrt(k).astype(np.float64)

        # The threshold is only parsed when clipping is actually requested.
        dens_cut = None if case == 'original' else float(dens_cut_str)

        data = np.zeros((n, self._N_BINS))
        A = np.zeros((n, 1))

        # Spectrum (times mult_factor) and k values of the most recently
        # generated sample; stay None only when n == 0.
        self.unnorm_Pk = None
        self.k = None

        for i in range(n):
            if A_true is None:
                A_1 = np.random.uniform(self._A_MIN, self._A_MAX)
            else:
                A_1 = A_true

            # Input spectrum P(k) = A / sqrt(k).
            Pk = (A_1/sqrt_k).astype(np.float32)

            # The sample index doubles as the random seed of the field.
            data_1 = DFL.gaussian_field_2D(grid, k, Pk, Rayleigh_sampling, i,
                    BoxSize, threads, verbose)

            if dens_cut is not None:
                if case == 'min':
                    data_1[data_1 < dens_cut] = dens_cut
                else:
                    data_1[data_1 > dens_cut] = dens_cut

            spectrum = PKL.Pk_plane(data_1, BoxSize, MAS, threads, verbose)
            Pk_final = np.asarray(spectrum.Pk)*mult_factor

            self.unnorm_Pk = Pk_final
            self.k = spectrum.k

            data[i, :] = Pk_final

            #normalising A wrt maximum and minimum
            A[i, :] = (A_1 - self._A_MIN)/(self._A_MAX - self._A_MIN)

        # Keep only the rows belonging to the requested split.
        data = data[offset:offset+size_spec, :]
        A = A[offset:offset+size_spec]

        data = (data - self._PK_MEAN)/self._PK_STD
        self.Pk = torch.from_numpy(data)
        self.A = torch.from_numpy(A)

        self.size = self.Pk.shape[0]

    def return_spectra(self):
        """Return ``(Pk, k)`` of the last generated sample.

        ``Pk`` is the measured spectrum times ``mult_factor`` before
        standardisation.  Both entries are ``None`` if no sample was
        generated (``n == 0``).
        """
        return self.unnorm_Pk, self.k

    def __len__(self):
        """Return the number of samples in the selected split."""
        return self.size

    def __getitem__(self, idx):
        """Return ``(spectrum, amplitude)`` for ``idx`` as float32 tensors."""
        return self.Pk[idx].to(torch.float32), self.A[idx].to(torch.float32)

    def full_data(self):
        """Return the standardised spectra of the whole split as one tensor."""
        return self.Pk