In [8]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.stats import entropy

In [9]:
x="/kaggle/input/arieldata/fgs2k/100468857_fgs.npy"
arr=np.load(x)
print(arr.shape)

(2700,)


In [22]:
def process_signal(signal, gain, offset, dead, flat, dark, linear_corr):
    # ADC conversion
    signal = signal * gain + offset
    
    # Clean flat, dark, dead
    flat = np.ma.masked_where(dead, flat)
    dark = np.ma.masked_where(dead, dark)
    flat = np.tile(flat, (signal.shape[0], 1, 1))
    dark = np.tile(dark, (signal.shape[0], 1, 1))
    dead = np.tile(dead, (signal.shape[0], 1, 1))
    
    signal = np.ma.masked_where(dead, signal)
    signal = (signal - dark) / (flat - dark)
    # Apply linear correction
#     linear_corr = np.flip(linear_corr, axis=0)
#     for x in range(signal.shape[1]):
#         for y in range(signal.shape[2]):
#             poli = np.poly1d(linear_corr[:, x, y])
#             signal[:, x, y] = poli(signal[:, x, y])
    # Get CDS
    signal = signal[1::2, :, :] - signal[::2, :, :]
    
    return signal

def bin_obs(cds_signal, binning):
    cds_transposed = cds_signal.transpose(1, 2, 0)
    binned_shape = (cds_transposed.shape[0], cds_transposed.shape[1], cds_transposed.shape[2] // binning)
    cds_binned = np.zeros(binned_shape)
    for i in range(binned_shape[2]):
        cds_binned[:, :, i] = np.sum(cds_transposed[:, :, i*binning:(i+1)*binning], axis=2)
    return cds_binned.transpose(2, 0, 1)


In [11]:
def vector(data, start, end , start1 , end1):
    
    region1= data[:start]  
    
    region2=data[start:end]
    region3=data[end:start1]
    region4=data[start1:end1]

    region5=data[end1:]

    uncovered = (  np.mean(region1) + np.mean(region5)  )/2
    
    reduction1= ( uncovered - np.mean(region2)    ) / uncovered
    reduction2= ( uncovered - np.mean(region3)   )  / uncovered
    reduction3= ( uncovered - np.mean(region4)   )  / uncovered
        
    
    slope=(data[end]-data[start])/(end-start)
    slope1=(data[end1]-data[start1])/(end1-start1)
    
    nr = np.mean(data)    / np.std(data)
    nr1= np.mean(region1) / np.std(region1)
    nr2= np.mean(region2) / np.std(region2)
    nr3= np.mean(region3) / np.std(region3)
    nr4= np.mean(region4) / np.std(region4)
    nr5= np.mean(region5) / np.std(region5)
   
    skewness= skew(data)

#     input_vector = np.array([ slope, slope1 , reduction1, reduction2, reduction3 ] )
    input_vector = np.array([ slope, slope1 , reduction1, reduction2, reduction3 ,nr,nr1,nr2,nr3,nr4,nr5 ,skewness] )

    return input_vector


In [12]:
from scipy.signal import savgol_filter

def smooth_data(data, window_size):
    return savgol_filter(data, window_size, 3)

def optimize_breakpoint(data, initial_breakpoint, window_size, buffer_size, smooth_window):
    best_breakpoint = initial_breakpoint
    best_score = float("-inf")
    midpoint = len(data) // 2
    smoothed_data = smooth_data(data, smooth_window)
#     smoothed_data=data
    for i in range(-window_size, window_size):
        new_breakpoint = initial_breakpoint + i
        if new_breakpoint > buffer_size and new_breakpoint < midpoint - buffer_size:
            region1 = data[: new_breakpoint - buffer_size]
            region2 = data[
                new_breakpoint
                + buffer_size : 2 * midpoint
                - new_breakpoint
                - buffer_size
            ]
            region3 = data[2 * midpoint - new_breakpoint + buffer_size :]

            breakpoint_region1 = smoothed_data[new_breakpoint - buffer_size: new_breakpoint + buffer_size]
            breakpoint_region2 = smoothed_data[new_breakpoint - buffer_size: new_breakpoint + buffer_size]

            mean_diff = abs(np.mean(region1) - np.mean(region2)) + abs(
                np.mean(region2) - np.mean(region3)
            )
            var_sum = np.var(region1) + np.var(region2) + np.var(region3)
            range_at_breakpoint1 = (np.max(breakpoint_region1) - np.min(breakpoint_region1))
            range_at_breakpoint2 = (np.max(breakpoint_region2) - np.min(breakpoint_region2))

            mean_range_at_breakpoint = (range_at_breakpoint1 + range_at_breakpoint2) / 2

            score = mean_diff - 0.5 * var_sum + mean_range_at_breakpoint

            if score > best_score:
                best_score = score
                best_breakpoint = new_breakpoint

                
    return best_breakpoint

In [13]:
initial_breakpoint=900
buffer_size=80 
smooth_window=200
window_size=300

In [14]:
class WavelengthPredictor(nn.Module):
    def __init__(self):
        super(WavelengthPredictor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(12, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            
            nn.Linear(256, 283)
        )
    
    def forward(self, x):
        return self.model(x)

In [15]:
# class WavelengthPredictor(nn.Module):
#     def __init__(self):
#         super(WavelengthPredictor, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(24, 48),
#             nn.ReLU(),
#             nn.BatchNorm1d(48),
            
#             nn.Linear(48, 96),
#             nn.ReLU(),
#             nn.BatchNorm1d(96),
            
#             nn.Linear(96, 192),
#             nn.ReLU(),
#             nn.BatchNorm1d(192),
            
#             nn.Linear(192, 256),
#             nn.ReLU(),
#             nn.BatchNorm1d(256),
            
#             nn.Linear(256, 283)
#         )
    
#     def forward(self, x):
#         return self.model(x)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = WavelengthPredictor()
model = nn.DataParallel(model)
model = model.to(device)

weights="/kaggle/input/arieldata/epoch180.pth"
# weights="/kaggle/input/arieldata/ariel3_335.pth"
checkpoint = torch.load(weights, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])



In [16]:
torch.set_printoptions(precision=10)

In [17]:
results=[]

In [19]:
import re
def extract_number(filename):
    number = re.search(r'\d+', filename).group()  
    return int(number)

mode="train"
num=15

start= 0
end  = 600
adc_info = pd.read_csv(f'/kaggle/input/ariel-data-challenge-2024/{mode}_adc_info.csv')                                                       
                                                       
planets = sorted(os.listdir(f"/kaggle/input/ariel-data-challenge-2024/{mode}"), key=extract_number)[start:end]

# planets= sorted(os.listdir("/kaggle/input/arieldata/airs-p3") , key=extract_number)[:num]
# planets = [extract_number(planet) for planet in planets]


print(planets[0])
print(len(planets))


785834
600


In [23]:
total_up   = []
total_down = []

for planet in planets:
    planet = int(planet) 

    selected_adc_info = adc_info.loc[adc_info['planet_id'] == planet]
    
    print(f"\nProcessing planet: {planet}")

    airs_dark        = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/AIRS-CH0_calibration/dark.parquet').values.reshape(32, 356)
    airs_dead        = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/AIRS-CH0_calibration/dead.parquet').values.reshape(32, 356)
    airs_flat        = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/AIRS-CH0_calibration/flat.parquet').values.reshape(32, 356)
    airs_linear_corr = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/AIRS-CH0_calibration/linear_corr.parquet').values.reshape(6, 32, 356)
    airs_ch0         = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/AIRS-CH0_signal.parquet').values
    airs_ch0_gain   = selected_adc_info['AIRS-CH0_adc_gain'].values[0]
    airs_ch0_offset = selected_adc_info['AIRS-CH0_adc_offset'].values[0]
    airs_ch0 = airs_ch0.reshape(11250, 32, 356)
        
    
    fgs_dark         = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/FGS1_calibration/dark.parquet').values.reshape(32, 32)
    fgs_dead         = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/FGS1_calibration/dead.parquet').values.reshape(32, 32)
    fgs_flat         = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/FGS1_calibration/flat.parquet').values.reshape(32, 32)
    fgs_linear_corr  = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/FGS1_calibration/linear_corr.parquet').values.reshape(6, 32, 32)
    fgs1             = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{mode}/{planet}/FGS1_signal.parquet').values
    fgs1_gain       = selected_adc_info['FGS1_adc_gain'].values[0]
    fgs1_offset     = selected_adc_info['FGS1_adc_offset'].values[0]
    fgs1 = fgs1.reshape(135000, 32, 32) 
    
    airs_ch0_processed = process_signal(airs_ch0, airs_ch0_gain, airs_ch0_offset, airs_dead, airs_flat, airs_dark, airs_linear_corr)
    fgs1_processed = process_signal(fgs1, fgs1_gain, fgs1_offset, fgs_dead, fgs_flat, fgs_dark, fgs_linear_corr)
    
    
    airs_frames = bin_obs(airs_ch0_processed, binning=2)
    fgs_frames  = bin_obs(fgs1_processed, binning=(25))
    print(airs_frames.shape , fgs_frames.shape)
    
    break
    
    binned_fgs_array=np.sum(airs_frames , axis=(1,2))
    binned_airs_array=np.sum(fgs1_frames , axis=(1,2))

    
    airs_arr = binned_airs_array
    airs_arr=(airs_arr-np.min(airs_arr))/(np.max(airs_arr)-np.min(airs_arr))
    
    fgs_arr  = binned_fgs_array
    fgs_arr=(fgs_arr-np.min(fgs_arr))/(np.max(fgs_arr)-np.min(fgs_arr))
    
    print(airs_arr.shape ,fgs_arr.shape )

    airsbp = optimize_breakpoint(airs_arr,initial_breakpoint,window_size=window_size,buffer_size=buffer_size,smooth_window=smooth_window)    
    fgsbp = optimize_breakpoint(fgs_arr,initial_breakpoint,window_size=window_size,buffer_size=buffer_size,smooth_window=smooth_window)

    midpoint1 = len(airs_arr) // 2
    bp1 = [airsbp, 2 * midpoint1 - airsbp]
    airs_start   =  bp1[0] - buffer_size
    airs_end     =  bp1[0] + buffer_size
    airs_start1  =  bp1[1] - buffer_size
    airs_end1    =  bp1[1] + buffer_size
    
    midpoint2 = len(fgs_arr) // 2
    bp2 = [fgsbp, 2 * midpoint2 - fgsbp]
    fgs_start  =    bp2[0] - buffer_size
    fgs_end    =    bp2[0] + buffer_size
    fgs_start1 =    bp2[1] - buffer_size
    fgs_end1   =    bp2[1] + buffer_size


    airs_vector=  vector( airs_arr,  airs_start ,  airs_end , airs_start1 , airs_end1 )
    fgs_vector =  vector( fgs_arr, fgs_start  ,   fgs_end , fgs_start1  , fgs_end1 )        
    
    
    input_vector=  np.concatenate((airs_vector , fgs_vector))
#     input_vector=  (airs_vector + fgs_vector )/2
#     input_vector = torch.tensor(np.array(input_vector))

    
#     labels        = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/test_labels.csv")
#     filtered_data = labels[labels["planet_id"] == planet].iloc[0, 1:].values
    
#     print(filtered_data[:10])
#     print(filtered_data.shape)
    
    input_vector  =  torch.tensor(np.array(airs_vector)).unsqueeze(0).float().to(device)
    print(input_vector.shape)
    
    model.eval()
    with torch.no_grad():
        out=model(input_vector)
        
    if mode=="train":

        labels  = pd.read_csv("/kaggle/input/ariel-data-challenge-2024/train_labels.csv")
        true    = labels[labels["planet_id"] == planet].iloc[0, 1:].values
        true    = torch.tensor(true).unsqueeze(0).to(device)
        l    = nn.MSELoss()
        loss = l(out , torch.tensor(true))
        print(f"Loss :{loss.item():.7f} , Out:{out[0][:5]} , True:{true[0][:5]}")
        
        up=[]
        down=[]
        for o ,t  in zip(out[0] , true[0]):
            if o>t:
                up.append(o/t)
            elif t>o:
                down.append(t/o)
            
        total_up.append(len(up))
        total_down.append(len(down))

#     if mode=="test":
#         print(f"Out:{out[0][:5]} ")
        
    count+=1
    print(f"{count} completed of {len(planets)} ")

    


Processing planet: 785834
(2812, 32, 356) (2700, 32, 32)
