# Tratamento dos dados

Este notebook contém todos os cortes e flags de qualidade aplicados aos dados resultantes do cross-matching espacial, para gerar o training set final.

## 1. Importando as bibliotecas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 2. Lendo os dados

In [None]:
df = pd.read_parquet('../dados_tcc/training_sets/xmatching_tcc.parquet')

In [None]:
df.columns

In [None]:
df.shape

## 3. Funções

In [None]:
def z_hist(catalog, sigma=0,pop='',save=0):
    """Plot the normalized distribution of the true redshifts in ``catalog``.

    Parameters
    ----------
    catalog : table indexable by column name (e.g. a pandas DataFrame)
        Must provide the column ``'redshiftdp02_truth'``.
    sigma, pop, save
        Accepted for backward compatibility; not used by this function.
    """
    plt.hist(catalog['redshiftdp02_truth'], bins=301, density=True,
             color='azure', edgecolor='blue', alpha=0.7)
    # density=True rescales the bars to unit area, so the y axis shows a
    # probability density and not raw object counts.
    plt.ylabel('density', fontsize=13)
    plt.xlabel('z', fontsize=13)
    plt.show()

In [None]:
def spatial_distribution(catalog, sigma=0, save=0, pop=''):
    """Draw a 2D histogram (100 bins per axis) of the object sky positions.

    Reads the columns ``'coord_radp02_object'`` (x axis) and
    ``'coord_decdp02_object'`` (y axis) from ``catalog``. The arguments
    ``sigma``, ``save`` and ``pop`` are accepted but not used.
    """
    right_ascension = catalog['coord_radp02_object']
    declination = catalog['coord_decdp02_object']

    plt.hist2d(right_ascension, declination, bins=100)
    plt.ylabel('Dec [deg]')
    plt.xlabel('RA [deg]')
    plt.colorbar()

In [None]:
def mag_histogram(catalog, title='DP0.2', sigma=0,pop='',save=0):
    """Plot one magnitude histogram per band (u, g, r, i, z, y) in a 2x3 grid.

    Each panel reads ``'mag_{band}dp02_object'`` from ``catalog`` and uses a
    logarithmic y axis. ``title`` becomes the figure super-title. The
    arguments ``sigma``, ``pop`` and ``save`` are accepted but not used.
    """
    band_colors = {
        'u': 'purple',
        'g': 'dodgerblue',
        'r': 'darkgreen',
        'i': 'deeppink',
        'z': 'orange',
        'y': 'red',
    }
    # 57 edges -> 56 equal-width bins covering magnitudes from 9 to 100.
    bin_edges = np.linspace(9, 100, 57)

    plt.figure(figsize=(13,9))
    for panel, (band, color) in enumerate(band_colors.items(), start=1):
        plt.subplot(2, 3, panel)
        plt.hist(
            catalog[f'mag_{band}dp02_object'],
            bins=bin_edges,
            label=f'{band} band',
            color=color,
            edgecolor='white',
            alpha=0.5,
        )
        plt.yscale('log')
        plt.xlabel('mag', fontsize=13)
        plt.ylabel('counts', fontsize=13)
        plt.legend(loc=2)

    plt.suptitle(title)
    plt.show()

In [None]:
def mag_error(catalog):
    """Plot magnitude error against magnitude for every band.

    One hexbin panel per band (u, g, r, i, z, y) on a 3x2 grid, reading
    ``'mag_{band}dp02_object'`` and ``'magerr_{band}dp02_object'`` from
    ``catalog``. Hexagons are drawn with ``bins='log'`` and only where at
    least one object falls (``mincnt=1``).
    """
    plt.figure(figsize=(20, 30))

    for panel, band in enumerate('ugrizy', start=1):
        plt.subplot(3, 2, panel)

        magnitudes = np.array(catalog[f'mag_{band}dp02_object'])
        errors = np.array(catalog[f'magerr_{band}dp02_object'])
        density = plt.hexbin(
            magnitudes, errors,
            gridsize=80, cmap='GnBu', bins='log', mincnt=1,
        )

        plt.xlabel(f"mag {band}", fontsize=20)
        # Only the first panel carries the y label.
        if panel == 1:
            plt.ylabel("error", fontsize=20)
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.colorbar(density, label='log(N)')

    # Leave the top 5% of the figure free.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

In [None]:
def mag_z(catalog, band = 'i'):
    """Plot magnitude in ``band`` versus true redshift as a log-scaled hexbin.

    Reads ``'redshiftdp02_truth'`` and ``'mag_{band}dp02_object'`` from
    ``catalog``. The x axis is limited to 0 <= z <= 3.

    Side effect: the figure is always written to ``mag_z.png`` (a path
    relative to the current working directory) before being shown.
    """
    redshift = catalog['redshiftdp02_truth']
    magnitude = catalog[f'mag_{band}dp02_object']

    plt.hexbin(
        redshift, magnitude,
        cmap='viridis', bins='log', mincnt=1, gridsize=[400, 200],
    )
    plt.xlabel("z", fontsize=13)
    plt.ylabel("mag " + band, fontsize=13)

    plt.xlim(0, 3)
    plt.grid(True)
    plt.colorbar()

    plt.savefig('mag_z.png')
    plt.show()

In [None]:
def color_color(catalog, sigma=0,pop='',save=0):
    """Plot colour-colour diagrams for consecutive band triplets.

    For each triplet (a, b, c) taken in order from u, g, r, i, z, y, a
    hexbin panel shows ``b - c`` on the x axis and ``a - b`` on the y axis,
    using the ``'mag_{band}dp02_object'`` columns of ``catalog``. This gives
    four panels on a 3x2 grid. The arguments ``sigma``, ``pop`` and ``save``
    are accepted but not used.
    """
    bands = ['u', 'g', 'r', 'i', 'z','y']
    plt.figure(figsize=(12,12))

    triplets = zip(bands, bands[1:], bands[2:])
    for panel, (first, middle, last) in enumerate(triplets, start=1):
        plt.subplot(3, 2, panel)

        first_mag = catalog[f'mag_{first}dp02_object']
        middle_mag = catalog[f'mag_{middle}dp02_object']
        last_mag = catalog[f'mag_{last}dp02_object']

        plt.hexbin(
            middle_mag - last_mag,
            first_mag - middle_mag,
            None,
            mincnt=1, cmap='RdPu', gridsize=[400,200], bins='log',
        )
        plt.xlabel(f'{middle}-{last}', fontsize=13)
        plt.ylabel(f'{first}-{middle}', fontsize=13)
        plt.colorbar()

    plt.show()

## 4. Região WFD 

Seleção da região da simulação DC2 onde foi aplicada a estratégia de observação WFD do LSST

In [None]:
df_wfd = df.copy()

In [None]:
from matplotlib.patches import Polygon

# (RA, Dec) corners, in degrees, of the trapezoid outlined on the map.
vertices = [
    (50, -44.5),
    (52.2, -27),
    (71.7, -27),
    (73.5, -44.5)
]

fig, ax = plt.subplots(figsize=(10, 6))

# Object density on the sky, with the trapezoid outlined in red on top.
ax.hist2d(df_wfd['coord_radp02_object'], df_wfd['coord_decdp02_object'], bins=100)
trap = Polygon(vertices, closed=True, edgecolor='red', facecolor='none', linewidth=2)
ax.add_patch(trap)

ax.set_xlabel("RA [graus]")
ax.set_ylabel("Dec [graus]")
ax.set_aspect('equal', adjustable='box')
ax.grid(True, linestyle='--', linewidth=0.5)

fig.tight_layout()
plt.show()

In [None]:
# Slanted edges of the trapezoid written as RA = m * Dec + b (degrees).
# Earlier trial values, kept for reference:
#   m_left = 0.1488,    b_left = 56.590
#   m_right = -0.11905, b_right = 68.226
m_left, b_left = 0.1257, 55.592
m_right, b_right = -0.1029, 68.9217

ra = df_wfd['coord_radp02_object'].to_numpy()
dec = df_wfd['coord_decdp02_object'].to_numpy()

# RA limits evaluated at each object's declination.
ra_min = m_left * dec + b_left
ra_max = m_right * dec + b_right

within_dec = (dec >= -44.5) & (dec <= -27)
within_ra = (ra >= ra_min) & (ra <= ra_max)
mask = within_dec & within_ra

df_trapezio = df_wfd[mask]



In [None]:
df_trapezio.shape

In [None]:
spatial_distribution(df_trapezio)

In [None]:
z_hist(df_trapezio)

## 5. Cortes de qualidade gerais

In [None]:
df_cuts = df_trapezio.copy()

In [None]:
df_cuts.shape

##### Seleção de galáxias

In [None]:
df_cuts['truth_typedp02_truth'].value_counts()

In [None]:
# Galaxy selection: keep the rows whose truth type equals 1.
is_truth_type_1 = df_cuts['truth_typedp02_truth'].eq(1)
df_cuts = df_cuts.loc[is_truth_type_1]

In [None]:
# Keep the rows whose reference extendedness equals 1.
is_extended = df_cuts['refExtendednessdp02_object'].eq(1)
df_cuts = df_cuts.loc[is_extended]
df_cuts.shape

##### Corte de objetos que foram ignorados pelo algoritmo de deblending

In [None]:
# Drop the objects flagged as skipped by the deblender.
not_skipped = df_cuts['deblend_skippeddp02_object'].eq(False)
df_cuts = df_cuts.loc[not_skipped]
df_cuts.shape

##### Corte de objetos que tiveram falhas no ajuste do cModel

In [None]:
# Drop the objects whose i-band cModel flag is set.
cmodel_ok = df_cuts['i_cModel_flagdp02_object'].eq(False)
df_cuts = df_cuts.loc[cmodel_ok]
df_cuts.shape

##### Corte de objetos que tiveram falhas na medição do centroide

In [None]:
# Drop the objects whose i-band centroid flag is set.
centroid_ok = df_cuts['i_centroid_flagdp02_object'].eq(False)
df_cuts = df_cuts.loc[centroid_ok]
df_cuts.shape

##### Corte de objetos com alta contaminação por vizinhos

In [None]:
# Keep objects with i-band blendedness strictly below 10**-0.375 (about 0.42).
blendedness_limit = 10 ** (-0.375)
low_blendedness = df_cuts['i_blendednessdp02_object'].lt(blendedness_limit)
df_cuts = df_cuts.loc[low_blendedness]
df_cuts.shape

##### Magnitudes limitantes

In [None]:
# Faint limit: keep objects with i-band magnitude strictly below 24.5.
brighter_than_limit = df_cuts['mag_idp02_object'].lt(24.5)
df_cuts = df_cuts.loc[brighter_than_limit]
df_cuts.shape

In [None]:
# Bright limit: keep objects with i-band magnitude strictly above 17.
fainter_than_limit = df_cuts['mag_idp02_object'].gt(17)
df_cuts = df_cuts.loc[fainter_than_limit]
df_cuts.shape

##### Restrição de S/N > 10 na banda i e S/N > 5 em pelo menos duas bandas entre g/r/z/y

In [None]:
# Signal-to-noise per band derived from the magnitude error:
#   SNR = 1 / (10**(0.4 * magerr) - 1),
# which is the inverse of magerr = 2.5 * log10(1 + 1/SNR).
#
# df_cuts is the result of boolean filtering, so writing columns into it with
# df_cuts[...] = ... can raise pandas' SettingWithCopyWarning. `assign`
# returns a new frame instead, and the band loop replaces five copy-pasted
# lines. Column order (g, r, z, y, i) matches the original cell.
df_cuts = df_cuts.assign(**{
    f'snr_{band}': 1 / (10 ** (0.4 * df_cuts[f'magerr_{band}dp02_object']) - 1)
    for band in ('g', 'r', 'z', 'y', 'i')
})

In [None]:
# Per object, the number of bands among g, r, z and y with S/N above 5.
other_band_snr = df_cuts[['snr_g', 'snr_r', 'snr_z', 'snr_y']]
snr_other_bands_gt5 = other_band_snr.gt(5).sum(axis=1)

In [None]:
snr_i_gt10 = df_cuts['snr_i'] > 10

In [None]:
# True where i-band S/N > 10 and at least two of g/r/z/y have S/N > 5.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning that
# in-place column assignment can raise on a boolean-filtered DataFrame.
df_cuts = df_cuts.assign(
    snr_i_gt10_and_two_more_gt5=snr_i_gt10 & (snr_other_bands_gt5 >= 2)
)

In [None]:
# Keep only the objects that satisfy the combined S/N requirement.
passes_snr = df_cuts['snr_i_gt10_and_two_more_gt5'].eq(True)
df_filtrado = df_cuts.loc[passes_snr]

In [None]:
df_filtrado.shape, df_cuts.shape

In [None]:
spatial_distribution(df_filtrado)

In [None]:
z_hist(df_filtrado)

# Flags de pixel

Aplicação das flags de pixel da banda i para remoção de objetos afetados por pixels ruins.

In [None]:
df_pix = df_filtrado.copy()

In [None]:
df_pix.shape

In [None]:
# i-band pixel flags: an object is kept only if every flag below is False.
# The original notebook repeated one cell per flag and listed 'offimage'
# twice; the second pass was a no-op, so it appears once here.
pixel_flags = [
    'clippedCenter',
    'crCenter',
    'edge',
    'interpolatedCenter',
    'offimage',
    'saturatedCenter',
    'suspectCenter',
    'bad',
    'clipped',
    'cr',
    'interpolated',
    'saturated',
    'suspect',
]

for flag in pixel_flags:
    flag_column = f'i_pixelFlags_{flag}dp02_object'
    df_pix = df_pix[df_pix[flag_column] == False]
    # Report the surviving shape after each cut, as the per-flag cells did.
    print(f'{flag}: {df_pix.shape}')

df_pix.shape

# Não detecções
Tratando magnitudes com alto blendedness como não detecções (NaN): valores de magnitude acima de 60 são substituídos por NaN.

In [None]:
df_nan = df_pix.copy()
df_nan.shape

In [None]:
# Pick the magnitude columns from the frame being modified (df_nan), not from
# the raw `df`, so this cell only depends on its direct input.
# The substring 'mag_' matches 'mag_<band>...' but not the 'magerr_<band>...'
# columns.
magnitude_cols = [col for col in df_nan.columns if 'mag_' in col]

# Values above 60 are replaced by NaN; values <= 60 are kept and existing NaN
# stay NaN.
df_nan[magnitude_cols] = df_nan[magnitude_cols].where(df_nan[magnitude_cols] <= 60, np.nan)

In [None]:
df_nan.shape

In [None]:
mag_histogram(df_nan)

In [None]:
mag_error(df_nan)

In [None]:
color_color(df_nan)

# Cores 
Adicionando cores e seus respectivos erros.

In [None]:
# One colour per pair of adjacent bands: u-g, g-r, r-i, i-z, z-y.
for bluer, redder in zip('ugriz', 'grizy'):
    df_nan[f'{bluer}-{redder}'] = (
        df_nan[f'mag_{bluer}dp02_object'] - df_nan[f'mag_{redder}dp02_object']
    )

In [None]:
# Colour error: the two magnitude errors of each pair added in quadrature.
for bluer, redder in zip('ugriz', 'grizy'):
    bluer_err = df_nan[f'magerr_{bluer}dp02_object']
    redder_err = df_nan[f'magerr_{redder}dp02_object']
    df_nan[f'{bluer}-{redder}_err'] = np.sqrt((bluer_err)**2 + (redder_err)**2)

# Training set

In [None]:
df_final = df_nan.copy()

##### Mantendo medidas de magnitudes e erros NaN em concordância

In [None]:
bands = ['u', 'g', 'r', 'i', 'z', 'y']

# Step 1: make each magnitude and its error agree on missingness. If only one
# of the pair is NaN, the other is set to NaN too.
for b in bands:
    mag = f"mag_{b}dp02_object"
    magerr = f"magerr_{b}dp02_object"

    df_final.loc[df_final[mag].notna() & df_final[magerr].isna(), mag] = np.nan
    df_final.loc[df_final[mag].isna() & df_final[magerr].notna(), magerr] = np.nan

# Step 2: the colour columns ('u-g', ...) and their errors ('u-g_err', ...)
# were computed before step 1, so they can still hold values for bands that
# were just set to NaN. Blank them wherever either band of the pair is missing
# so colours stay consistent with the magnitudes they derive from.
for bluer, redder in zip(bands[:-1], bands[1:]):
    color = f"{bluer}-{redder}"
    derived_cols = [c for c in (color, f"{color}_err") if c in df_final.columns]
    if not derived_cols:
        continue
    band_missing = (
        df_final[f"mag_{bluer}dp02_object"].isna()
        | df_final[f"mag_{redder}dp02_object"].isna()
    )
    df_final.loc[band_missing, derived_cols] = np.nan

In [None]:
df_final['mag_udp02_object'].isna().sum()

In [None]:
# Sanity check on the y band: rows where exactly one of (magnitude, error) is
# NaN. The exclusive-or of the two "is missing" masks selects those rows.
y_mag_missing = df_final["mag_ydp02_object"].isna()
y_err_missing = df_final["magerr_ydp02_object"].isna()
inconsistentes = y_mag_missing ^ y_err_missing

df_inconsistentes = df_final[inconsistentes]
df_inconsistentes

##### Selecionando ~100000 objetos para o training set final

In [None]:
# Fraction of df_final to draw; the section heading targets ~100000 objects.
# The fixed random_state makes the draw reproducible.
sample_fraction = 0.4827
df_training_set = df_final.sample(frac=sample_fraction, random_state=42)
df_training_set

In [None]:
colunas = ['mag_udp02_object', 'mag_gdp02_object', 'mag_rdp02_object', 'mag_idp02_object', 'mag_zdp02_object', 'mag_ydp02_object']
df_training_set[colunas].count()

In [None]:
mag_z(df_training_set)

In [None]:
z_hist(df_training_set)

In [None]:
mag_histogram(df_training_set)

In [None]:
spatial_distribution(df_training_set)

##### Salvando o training set em arquivo .parquet

In [None]:
df_training_set.to_parquet('../dados_tcc/training_sets/training_set.parquet')