# Cluster Characterization

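This notebook clusters the stars in the region of an open cluster, first with K-Means and then with a Deep Embedded Clustering (DEC) artificial neural network, for cluster characterization.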

## Initial settings

In [None]:
from cdalvaro.catalogues import OpenClust
from cdalvaro.data_base import DB
from cdalvaro import graphics
from cdalvaro.logging import Logger
from cdalvaro.ml import DEC
from cdalvaro.ml.utils import estimate_n_clusters
from IPython.display import Image
from keras.initializers import VarianceScaling
from keras.optimizers import SGD
from keras.utils import plot_model
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Restrict the project logger to ERROR level.
logger = Logger.instance()
logger.setLevel(logging.ERROR)

# Shared database handle; db.get_stars() is called on it in the stars selection cell.
db = DB.instance(host='gaia.cdalvaro.io', port=15432)

# Output locations: results directory and the figure helper (figure saving disabled).
save_dir = './results'
cdalvaro_plot = graphics.Plot(
    save_figs=False,
    figs_path="../../figures",
)

# Cluster catalogue, indexed by cluster name in later cells.
clusters = OpenClust.catalogue()

## Variable selection

In [None]:
# Columns requested from the database for every star. The first five
# quantities are each followed by their "_error" companion column.
variables = [
    "ra", "ra_error",
    "dec", "dec_error",
    "pmra", "pmra_error",
    "pmdec", "pmdec_error",
    "parallax", "parallax_error",
    "phot_g_mean_mag",
    "bp_rp",
]

# Every column except the "_error" ones; passed as filter_null_columns
# when the stars are queried.
non_null_columns = [name for name in variables if not re.search(r'_error', name)]

## Stars selection

In [None]:
# Query the stars in the region of the selected cluster, requesting the
# columns in `variables` and passing `non_null_columns` as the null filter.
cluster = clusters["Melotte 22"]
stars_df = db.get_stars(
    region=cluster,
    columns=variables,
    filter_null_columns=non_null_columns,
)
#stars_df.isnull().sum()

# Derived proper-motion columns: modulus and direction angle of (pmra, pmdec).
stars_df['pmmod'] = np.sqrt(stars_df['pmra'].pow(2) + stars_df['pmdec'].pow(2))
stars_df['pmang'] = np.arctan2(stars_df['pmdec'], stars_df['pmra'])

# Preview the first rows (rendered as the cell output).
stars_df.head()

## Correlation analysis

In [None]:
# Scale a copy, so stars_df itself keeps its original values.
pairplot_df = stars_df.copy()

# Correlate every column except the "_error" ones.
correlation_variables = [col for col in stars_df.columns if not re.search(r'_error', col)]

# Min-max scale the selected columns so they share a common range in the pair plot.
scaler = MinMaxScaler()
pairplot_df[correlation_variables] = scaler.fit_transform(pairplot_df[correlation_variables])
#sns.pairplot(pairplot_df[correlation_variables])

#plt.savefig(f"{save_dir}/pairplot.pdf")

## Feature selection

In [None]:
# Columns of stars_df used as input to the clustering models.
features = [
    'pmra',
    'pmdec',
    'pmmod',
    'parallax',
]

## Feature rescaling

In [None]:
# Fit a fresh min-max scaler on the selected features, then transform
# them into the matrix `x` that every model below consumes.
scaler = MinMaxScaler()
scaler.fit(stars_df[features])
x = scaler.transform(stars_df[features])

## K-Means

In [None]:
# Step 1 - Build and train the K-Means model.
# The cluster count is fixed by hand; estimate_n_clusters(x, verbose=True)
# is the alternative left disabled here.
n_clusters = 6 # estimate_n_clusters(x, verbose=True)
kmeans = KMeans(n_clusters=n_clusters).fit(x)

# Store the label assigned to each star during fitting.
stars_df['cluster_g'] = kmeans.labels_

In [None]:
# Proper-motion plot; at this point stars_df carries the K-Means labels in 'cluster_g'.
fig, ax, g = cdalvaro_plot.plot_cluster_proper_motion(
    stars_df,
    img_name=f"pm_{cluster.name}",
    xlim=(-30, 50),
    ylim=(-70, 40),
)

# Deep Embedded Clustering (DEC)

## Links

- https://towardsdatascience.com/deep-clustering-for-financial-market-segmentation-2a41573618cf
- https://github.com/yzzhang/machine-learning/tree/master/deep_learning/unsupervised_learning/dec_keras_clustering
- https://arxiv.org/pdf/1511.06335.pdf
- https://www.dlology.com/blog/how-to-do-unsupervised-clustering-with-keras/


In [None]:
# Reference:
#     Unsupervised Deep Embedding for Clustering Analysis - 4.3 Implementation
# Layer sizes: the input width taken from x, followed by 500, 500, 2000 and 10.
dims = [x.shape[-1], 500, 500, 2000, 10]

# Weight initializer, optimizer and the pair of losses handed to compile().
init = VarianceScaling(
    scale=1./3.,
    mode='fan_in',
    distribution='uniform',
)
optimizer = SGD(1, 0.9)
loss = ['kld', 'mse']

# DEC model
dec = DEC(
    dims=dims,
    n_clusters=n_clusters,
    initializer=init,
)
dec.compile(optimizer=optimizer, loss=loss)
dec.model.summary()

In [None]:
# Render the DEC architecture diagram to a PNG under save_dir and show it inline.
import os

# Nothing else in this notebook creates save_dir; make sure it exists so
# writing the image does not fail on a fresh checkout.
os.makedirs(save_dir, exist_ok=True)

model_png = f'{save_dir}/dec_model.png'
plot_model(dec.model, to_file=model_png, show_shapes=True)
Image(filename=model_png)

In [None]:
# Training parameters
epochs = 30            # passed to dec.pretrain()
batch_size = 128       # shared by pretrain() and fit()
maxiter = 1000         # passed to dec.fit()
update_interval = 50   # passed to dec.fit()
verbose = 1

# Pretrain first (reusing the optimizer built with the model), then run the DEC fit.
dec.pretrain(
    x,
    optimizer=optimizer,
    epochs=epochs,
    batch_size=batch_size,
)
dec.fit(
    x,
    batch_size=batch_size,
    maxiter=maxiter,
    update_interval=update_interval,
    verbose=verbose,
)

In [None]:
# Replace the K-Means labels in 'cluster_g' with the DEC prediction,
# then show how many stars fall under each label.
dec_labels = dec.predict(x)
stars_df['cluster_g'] = dec_labels
stars_df['cluster_g'].value_counts()

In [None]:
# Diagnostic plots drawn from stars_df, which now carries the DEC labels
# in 'cluster_g'. Each call rebinds fig, ax and g.

# Proper motion.
fig, ax, g = cdalvaro_plot.plot_cluster_proper_motion(
    stars_df,
    img_name=f"pm_{cluster.name}",
    xlim=(-30, 50),
    ylim=(-70, 40),
)

# Parallax histogram (density-normalised).
fig, ax, g = cdalvaro_plot.plot_cluster_parallax_histogram(
    stars_df,
    img_name=f"parallax_{cluster.name}",
    xlim=(-4, 10),
    stat='density',
)

# Isochrone curve.
fig, ax, g = cdalvaro_plot.plot_cluster_isochrone_curve(
    stars_df,
    img_name=f"isochrone_{cluster.name}",
    xlim=(-1, 4),
    ylim=(3, 21),
)

In [None]:
# Project the scaled feature matrix to two dimensions with t-SNE so the
# clustering can be inspected visually.
x_embedded = TSNE(n_components=2).fit_transform(x)

vis_x = x_embedded[:, 0]
vis_y = x_embedded[:, 1]

# Colour each point by its label in 'cluster_g', the column the clustering
# cells write to. No 'clusters' column is created anywhere in this notebook,
# so indexing by that name raised a KeyError.
plt.scatter(vis_x, vis_y, c=stars_df['cluster_g'], cmap=graphics.color_palette())
# NOTE(review): the tick range and colour limits are hard-coded rather than
# derived from n_clusters; presumably they match the project palette - confirm.
plt.colorbar(ticks=range(256))
plt.clim(-0.5, 9.5)
plt.show()