In [3]:
import warnings
# Silence all warnings for the rest of the notebook (deprecations included).
warnings.filterwarnings('ignore')

# `display` and `HTML` are part of the public `IPython.display` API. The old
# `IPython.core.display` path is deprecated and no longer exports `display`
# in recent IPython releases, so import from the public module instead.
from IPython.display import display, HTML
# Widen the classic-notebook container so wide feature tables stay readable.
display(HTML("<style>.container { width:70% !important; }</style>"))

import faiss
import umap
import os

import tensorflow as tf
import pandas as pd
import numpy as np

from tensorboard.plugins import projector
from sklearn.manifold import TSNE
from sklearn.preprocessing import QuantileTransformer

In [4]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [13]:
# Notebook configuration, read by the cells below.
dict_info = {
    # Object ids (oid) whose features are fetched from the ALeRCE DB.
    # Original note: search one object at a time.
    # Alternative ids left by the author: ['ZTF22aadpscu'], ['ZTF18abgpzts'],
    # ['ZTF23aaxuvkn'], ['ZTF21aanfcmk']
    'oid_object': ['ZTF18aawfqax','ZTF21aaqqwsa'],
    # Parquet file holding the reference feature table.
    'data_file': 'features.parquet',
    # Number of nearest neighbours requested from the faiss search.
    'k_nn': 5000,  # alternative value left by the author: 200
}

## Load data

In [21]:
# Read the reference feature table and index it by object id (`oid`).
data = pd.read_parquet(dict_info['data_file']).set_index('oid')
data

Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,r-W2_0,r-W3_0,rb_0,sgscore1_0
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF22aabgrek,,0.033651,,0.809992,,1.0,,0.333333,,0.0,...,0.0,0.0,1.0,6.0,1.000000,1.000000,2.776421,3.666420,0.790000,0.979869
ZTF18abumkut,,0.035398,,0.616626,,1.0,,0.250000,,0.0,...,8.0,12.0,2.0,8.0,1.000000,1.000000,4.684831,7.393831,0.839286,0.993542
ZTF19abcejaa,0.567169,0.183057,1.000000,1.000000,5.0,2.0,0.175000,0.344828,0.000000,0.0,...,0.0,0.0,6.0,1.0,0.150000,0.034483,5.297417,8.109416,0.945714,0.803289
ZTF18acjvvot,0.061632,0.033853,0.687478,0.778749,1.0,2.0,0.357143,0.372093,0.000000,0.0,...,2.0,68.0,70.0,43.0,1.000000,1.000000,2.777639,3.667638,0.855714,0.979869
ZTF21aacomkz,0.053794,0.032420,0.363219,0.967365,1.0,2.0,0.300000,0.347826,0.000000,0.0,...,0.0,24.0,10.0,23.0,1.000000,1.000000,2.769856,3.659855,0.870000,0.979869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTF18accboxe,0.223578,0.412897,1.000000,1.000000,3.0,1.0,0.225000,0.176471,0.078947,0.0,...,0.0,3.0,33.0,15.0,0.825000,0.882353,5.048504,7.552504,0.816667,0.116780
ZTF18abvzlln,0.130467,0.116905,0.627308,0.748061,1.0,2.0,0.280000,0.266667,0.000000,0.0,...,8.0,11.0,25.0,15.0,1.000000,1.000000,5.425498,8.027498,0.623571,0.199893
ZTF18abhueor,0.279524,0.218180,0.999624,1.000000,1.0,1.0,0.333333,0.160714,0.000000,0.0,...,16.0,1.0,2.0,3.0,0.222222,0.053571,3.049909,5.577909,0.825714,1.000000
ZTF18abvpvhy,0.019740,0.025279,0.312601,0.617235,1.0,1.0,0.333333,0.166667,0.000000,0.0,...,40.0,70.0,0.0,0.0,0.000000,0.000000,2.353717,3.334717,0.717857,0.971333


## Object search from the ALeRCE DB

In [46]:
import psycopg2
import json
import requests

# Connection parameters for the ALeRCE database are downloaded from the
# alercebroker/usecases repository. Fail fast on network problems instead of
# hanging or trying to parse an error page as JSON.
url = "https://raw.githubusercontent.com/alercebroker/usecases/master/alercereaduser_v4.json"
response = requests.get(url, timeout=30)
response.raise_for_status()
params = response.json()['params']

conn = psycopg2.connect(
        dbname=params['dbname'],
        user=params['user'],
        host=params['host'],
        password=params['password'])

# One row per (object, feature): the feature label is "<name>_<fid>".
# The oid list is passed as a bound parameter (psycopg2 adapts a Python list
# to a PostgreSQL array) rather than being pasted into the SQL text, so ids
# are quoted by the driver and an empty list is still valid SQL.
base_query = """SELECT oid, CONCAT(name, '_', fid) AS feature, value
FROM feature
WHERE oid = ANY(%(oids)s)"""

try:
    df_features_q = pd.read_sql(
        base_query, conn, params={'oids': list(dict_info['oid_object'])}
    )
finally:
    # Always release the database connection, even if the query fails.
    conn.close()

# Long -> wide: one row per oid, one column per feature label.
# NOTE(review): pivot_table sorts its index by default, so row 0 of the result
# is the alphabetically first oid, not necessarily the first requested one.
# Later cells use `df_features_q.index[0]` as the query object.
df_features_q = df_features_q.pivot_table(values='value', index="oid", columns='feature', aggfunc='first')

In [47]:
df_features_q

feature,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb_0,sgscore1_0
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF18aawfqax,0.707719,0.451899,0.978339,0.61371,5.0,4.0,0.304348,0.4375,0.0,0.0,...,3.0,6.0,8.0,7.0,23.0,16.0,1.0,1.0,0.957143,0.046583
ZTF21aaqqwsa,0.123774,0.076891,0.845686,0.886404,4.0,4.0,0.35,0.372093,0.0,0.0,...,10.0,12.0,0.0,0.0,40.0,43.0,1.0,1.0,0.888571,0.131071


In [48]:
# Persist the pivoted query features to a local parquet file.
df_features_q.to_parquet("query.parquet")

## Unified dataset 

In [27]:
# Remove any copy of the query objects already in the reference table, then
# append the freshly fetched rows at the end. A single vectorised drop replaces
# the per-object in-place drop; `errors='ignore'` skips oids that are absent.
# Features missing from one of the two tables end up as NaN after the concat.
data = pd.concat([
    data.drop(index=df_features_q.index, errors='ignore'),
    df_features_q,
])

# Per-object metadata table sharing the index of `data`.
df_objs = data[[]].copy()
# Build one url per row. Iterating the index itself (not its unique values)
# keeps the list the same length as the frame even if an oid is repeated.
df_objs['url'] = ['https://alerce.online/object/{}'.format(oid) for oid in df_objs.index]
df_objs['outlier'] = 0
# Flag the query object: only the first row of `df_features_q` is marked.
df_objs.loc[df_features_q.index[0], 'outlier'] = 1
df_objs = df_objs[['url', 'outlier']]
df_objs

Unnamed: 0_level_0,url,outlier
oid,Unnamed: 1_level_1,Unnamed: 2_level_1
ZTF22aabgrek,https://alerce.online/object/ZTF22aabgrek,0
ZTF18abumkut,https://alerce.online/object/ZTF18abumkut,0
ZTF19abcejaa,https://alerce.online/object/ZTF19abcejaa,0
ZTF18acjvvot,https://alerce.online/object/ZTF18acjvvot,0
ZTF21aacomkz,https://alerce.online/object/ZTF21aacomkz,0
...,...,...
ZTF18abhueor,https://alerce.online/object/ZTF18abhueor,0
ZTF18abvpvhy,https://alerce.online/object/ZTF18abvpvhy,0
ZTF18abtdgyo,https://alerce.online/object/ZTF18abtdgyo,0
ZTF18aawfqax,https://alerce.online/object/ZTF18aawfqax,1


In [28]:
data

Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,r-W2_0,r-W3_0,rb_0,sgscore1_0
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF22aabgrek,,0.033651,,0.809992,,1.0,,0.333333,,0.0,...,0.0,0.0,1.0,6.0,1.000000,1.000000,2.776421,3.666420,0.790000,0.979869
ZTF18abumkut,,0.035398,,0.616626,,1.0,,0.250000,,0.0,...,8.0,12.0,2.0,8.0,1.000000,1.000000,4.684831,7.393831,0.839286,0.993542
ZTF19abcejaa,0.567169,0.183057,1.000000,1.000000,5.0,2.0,0.175000,0.344828,0.0,0.0,...,0.0,0.0,6.0,1.0,0.150000,0.034483,5.297417,8.109416,0.945714,0.803289
ZTF18acjvvot,0.061632,0.033853,0.687478,0.778749,1.0,2.0,0.357143,0.372093,0.0,0.0,...,2.0,68.0,70.0,43.0,1.000000,1.000000,2.777639,3.667638,0.855714,0.979869
ZTF21aacomkz,0.053794,0.032420,0.363219,0.967365,1.0,2.0,0.300000,0.347826,0.0,0.0,...,0.0,24.0,10.0,23.0,1.000000,1.000000,2.769856,3.659855,0.870000,0.979869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTF18abhueor,0.279524,0.218180,0.999624,1.000000,1.0,1.0,0.333333,0.160714,0.0,0.0,...,16.0,1.0,2.0,3.0,0.222222,0.053571,3.049909,5.577909,0.825714,1.000000
ZTF18abvpvhy,0.019740,0.025279,0.312601,0.617235,1.0,1.0,0.333333,0.166667,0.0,0.0,...,40.0,70.0,0.0,0.0,0.000000,0.000000,2.353717,3.334717,0.717857,0.971333
ZTF18abtdgyo,0.054506,,0.999999,,1.0,,0.111111,,0.0,,...,16.0,3.0,9.0,3.0,1.000000,1.000000,1.329315,1.330314,0.646429,0.992500
ZTF18aawfqax,0.707719,0.451899,0.978339,0.613710,5.0,4.0,0.304348,0.437500,0.0,0.0,...,8.0,7.0,23.0,16.0,1.000000,1.000000,,,0.957143,0.046583


In [29]:
# Sanity check: number of objects per value of the `outlier` flag.
df_objs.groupby('outlier').count()

Unnamed: 0_level_0,url
outlier,Unnamed: 1_level_1
0,2188495
1,1


## Normalize dataset

In [30]:
# Work on a copy so the raw feature table stays available.
data_qt = data.copy()
qt = QuantileTransformer(n_quantiles=10, output_distribution='uniform', random_state=0)

# Normalise each feature independently, using only its observed values.
# The uniform quantile output is rescaled by `* 0.9 + 0.1`, which leaves 0 free
# to encode "missing" in the final `fillna(0)`.
for column in data_qt.columns:
    observed = data_qt[column].notna()
    if not observed.any():
        # Nothing to fit on an all-NaN column (fit_transform would raise on an
        # empty array); it is filled with 0 below like any other missing value.
        continue
    observed_values = data_qt.loc[observed, column].to_numpy().reshape(-1, 1)
    data_qt.loc[observed, column] = qt.fit_transform(observed_values).ravel() * 0.9 + 0.1

data_qt = data_qt.fillna(0)
data_qt

Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,r-W2_0,r-W3_0,rb_0,sgscore1_0
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF22aabgrek,0.000000,0.200997,0.000000,0.430044,0.000000,0.100000,0.000000,0.600000,0.0,0.1,...,0.100000,0.100000,0.500000,0.600000,1.000000,1.000000,0.493566,0.382469,0.634146,0.514800
ZTF18abumkut,0.000000,0.211093,0.000000,0.331174,0.000000,0.100000,0.000000,0.354545,0.0,0.1,...,0.666667,0.660000,0.533333,0.666667,1.000000,1.000000,0.714187,0.796689,0.789361,0.710379
ZTF19abcejaa,0.903660,0.669300,0.900000,0.899998,0.903448,0.800000,0.215000,0.662069,0.1,0.1,...,0.100000,0.100000,0.650000,0.450000,0.475000,0.417488,0.773541,0.865118,0.965217,0.275523
ZTF18acjvvot,0.309538,0.202165,0.335680,0.411553,0.100000,0.800000,0.711688,0.754651,0.1,0.1,...,0.400000,0.900950,0.906706,0.901674,1.000000,1.000000,0.493848,0.382681,0.833333,0.514800
ZTF21aacomkz,0.279950,0.196411,0.190545,0.560283,0.100000,0.800000,0.500000,0.678261,0.1,0.1,...,0.100000,0.800000,0.725000,0.840000,1.000000,1.000000,0.492045,0.381325,0.870370,0.514800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTF18abhueor,0.762449,0.730063,0.662111,1.000000,0.100000,0.100000,0.600000,0.196429,0.1,0.1,...,0.807143,0.300000,0.533333,0.525000,0.509722,0.427168,0.539683,0.657941,0.748936,1.000000
ZTF18abvpvhy,0.148083,0.172184,0.172673,0.331468,0.100000,0.100000,0.600000,0.200000,0.1,0.1,...,0.901580,0.901096,0.100000,0.100000,0.100000,0.100000,0.394748,0.324657,0.392857,0.460651
ZTF18abtdgyo,0.283116,0.000000,0.786174,0.000000,0.100000,0.000000,0.162500,0.000000,0.1,0.0,...,0.807143,0.400000,0.712500,0.525000,1.000000,1.000000,0.188088,0.149973,0.192742,0.688372
ZTF18aawfqax,0.908257,0.902708,0.526083,0.329767,0.903448,0.901053,0.513043,0.900000,0.1,0.1,...,0.666667,0.533333,0.835000,0.805000,1.000000,1.000000,0.000000,0.000000,0.976812,0.199955


## Get the K nearest neighbors

In [31]:
# Exact (brute-force) L2 index over every normalised feature vector.
dim = data_qt.shape[1]
index = faiss.IndexFlatL2(dim)
# faiss works on C-contiguous float32 matrices, so convert explicitly instead
# of handing it a (float64, possibly column-major) DataFrame.
index.add(np.ascontiguousarray(data_qt.to_numpy(), dtype='float32'))

# Never ask for more neighbours than there are indexed vectors: faiss pads the
# missing results with -1, which `iloc` would silently read as "last row".
k = min(dict_info['k_nn'], index.ntotal)
# The query is the first row of `df_features_q`, looked up in the normalised table.
xq = data_qt.filter(items=[df_features_q.index[0]], axis=0)
# D: squared L2 distances, I: row positions in `data_qt`, nearest first.
D, I = index.search(np.ascontiguousarray(xq.to_numpy(), dtype='float32'), k)

In [32]:
I.shape

(1, 5000)

In [33]:
def print_link(index=None, classifier=None):
    """Display a clickable ALeRCE explorer link listing several objects.

    Parameters
    ----------
    index : iterable of str
        Object ids; each one is added to the url as an `oid` query parameter.
    classifier : str
        Classifier name; sent as `selectedClassifier=<classifier>_classifier`.

    The link is rendered in the notebook output; nothing is returned.
    """
    expr1 = 'https://alerce.online/?oid='
    expr2 = '&oid='.join(str(oid) for oid in index)
    expr3 = '&selectedClassifier=' + classifier + '_classifier&page=1'
    expr = expr1 + expr2 + expr3

    # The anchor must be closed with </a>; the previous markup emitted a second
    # opening <a> tag and left the element unclosed.
    display(HTML("<a href='%s' target=\"_blank\"> %s </a>" % (expr, expr)))
    
# Object ids of the first 199 search hits, in ranking order.
closest_oids = data_qt.index.values[I[0][:199]]
print('Link of the closest objects:')
print_link(index=closest_oids, classifier="stamp")

Link of the closest objects:


In [34]:
# Normalised feature rows of the search hits, in the order returned by faiss
# (`I` holds row positions in `data_qt`, hence the positional `iloc`).
df_data_knn = data_qt.iloc[I[0]]
df_data_knn

Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,r-W2_0,r-W3_0,rb_0,sgscore1_0
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF18aawfqax,0.908257,0.902708,0.526083,0.329767,0.903448,0.901053,0.513043,0.900000,0.1,0.1,...,0.666667,0.533333,0.8350,0.805000,1.0,1.0,0.0,0.0,0.976812,0.199955
ZTF21aazqkeq,0.919533,0.911420,0.277142,0.406153,0.903448,0.903158,0.685000,0.669231,0.1,0.1,...,0.633333,0.566667,0.8200,0.855000,1.0,1.0,0.0,0.0,0.951449,0.199955
ZTF23absguth,0.932574,0.929020,0.795410,0.383197,0.901724,0.900000,0.427778,0.451020,0.1,0.1,...,0.450000,0.400000,0.8100,0.783333,1.0,1.0,0.0,0.0,0.419048,0.200000
ZTF24aabsfyt,0.910236,0.901994,0.364423,0.398142,0.800000,0.900000,0.359091,0.225974,0.1,0.1,...,0.750000,0.466667,0.7000,0.733333,1.0,1.0,0.0,0.0,0.955072,0.200000
ZTF24aagiouv,0.917429,0.920460,0.452086,0.139486,0.800000,0.900000,0.300000,0.731818,0.1,0.1,...,0.633333,0.620000,0.7125,0.733333,1.0,1.0,0.0,0.0,0.951449,0.199953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTF21ablkfxg,0.901238,0.880631,0.586787,0.398925,0.800000,0.800000,0.411364,0.731818,0.1,0.1,...,0.800000,0.750000,0.7375,0.733333,1.0,1.0,0.0,0.0,0.284616,0.199951
ZTF20aadckor,0.740248,0.900359,0.226282,0.186597,0.100000,0.800000,0.600000,0.451020,0.1,0.1,...,0.500000,0.566667,0.6500,0.633333,1.0,1.0,0.0,0.0,0.634146,0.199950
ZTF21aapkbeu,0.900703,0.764385,0.587529,0.402903,0.800000,0.900000,0.260000,0.714286,0.1,0.1,...,0.600000,0.566667,0.7250,0.783333,1.0,1.0,0.0,0.0,0.780851,0.199955
ZTF21abhqhqb,0.715472,0.808124,0.450658,0.467443,0.800000,0.100000,0.285714,0.600000,0.1,0.1,...,0.857143,0.809677,0.7750,0.800000,1.0,1.0,0.0,0.0,0.969565,0.200000


## Visualize the K nearest neighbors

In [41]:
# Metadata (url, outlier flag) for the neighbours, aligned row by row with `df_data_knn`.
df_objs_knn = df_objs.reindex(df_data_knn.index)
df_objs_knn

Unnamed: 0_level_0,url,outlier
oid,Unnamed: 1_level_1,Unnamed: 2_level_1
ZTF18aawfqax,https://alerce.online/object/ZTF18aawfqax,1
ZTF21aazqkeq,https://alerce.online/object/ZTF21aazqkeq,0
ZTF23absguth,https://alerce.online/object/ZTF23absguth,0
ZTF24aabsfyt,https://alerce.online/object/ZTF24aabsfyt,0
ZTF24aagiouv,https://alerce.online/object/ZTF24aagiouv,0
...,...,...
ZTF21ablkfxg,https://alerce.online/object/ZTF21ablkfxg,0
ZTF20aadckor,https://alerce.online/object/ZTF20aadckor,0
ZTF21aapkbeu,https://alerce.online/object/ZTF21aapkbeu,0
ZTF21abhqhqb,https://alerce.online/object/ZTF21abhqhqb,0


In [42]:
# Directory that receives the metadata file, the checkpoint and the projector config.
log_dir = 'logs'
#data_name = 'ZTF_dataset'

# Per-point metadata columns, listed in the same row order as `df_objs_knn`.
dict_metadata = {
    'oid': list(df_objs_knn.index),
    'url': list(df_objs_knn['url'].values),
    'outlier': list(df_objs_knn['outlier'].values)
    # Add more columns here if needed
}

In [43]:
# Location of the metadata file; create the log directory if it is missing.
metadata_file_path = f'{log_dir}/metadata.tsv'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Assemble the .tsv content in memory: a header line with the column names,
# followed by one tab-separated line per metadata row.
metadata_columns = list(dict_metadata)
num_rows = len(dict_metadata[metadata_columns[0]])
tsv_lines = ['\t'.join(metadata_columns) + '\n']
tsv_lines.extend(
    '\t'.join(str(dict_metadata[column][row]) for column in metadata_columns) + '\n'
    for row in range(num_rows)
)

# Write the data to the .tsv file
with open(metadata_file_path, 'w') as file:
    file.writelines(tsv_lines)


# Create a checkpoint from the data: the neighbour feature matrix is stored
# under the attribute name `embedding`.
checkpoint_prefix = os.path.join(log_dir, "embedding.ckpt")
weights = tf.Variable(df_data_knn)
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(checkpoint_prefix)

'logs/embedding.ckpt-1'

In [44]:
# Inicializa un proyector
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add()

# Especifica los nombres de los tensors y la ruta a los metadatos
embedding_config.tensor_name = f"embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding_config.metadata_path = 'metadata.tsv'

# Guarda una proyección configurada
projector.visualize_embeddings(log_dir, config)

In [39]:
#!kill 4013354

In [40]:
#%tensorboard --logdir ./logs --host 0.0.0.0