# Analysis GAIA M67 DR2
### Packages

In [1]:
import astropy.units as u  # Simplify unity manipulation
from astropy.coordinates import SkyCoord # To handle sky/space coordinates
from astroquery.gaia import Gaia
from astropy.io.votable import parse

from pylab import rcParams
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import pandas as pd

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

Created TAP+ (v1.0.1) - Connection:
	Host: gea.esac.esa.int
	Use HTTPS: False
	Port: 80
	SSL Port: 443


### Reading votable

In [2]:
## Reading the votable
voname = "n2516.vot"

# parse() raises FileNotFoundError when the file is not in the working directory.
votable = parse(voname)

# Print the column names of every table. `data` keeps the array of the LAST
# table iterated, and that is what the extractions below read from.
for table in votable.iter_tables():
    data = table.array
    print(data.dtype.names)

lgal = data['l']
bgal = data['b']
pmas = data['parallax']
# distance = 1000 / parallax. Masked parallaxes are filled with -9999 first,
# so they turn into small negative distances.
distance = 1000. / np.ma.filled(pmas, -9999.)
# Proper motions in right ascension and declination, masked entries filled
# with a large negative sentinel.
pmra = np.ma.filled(data['pmra'], -9999999.)
pmdec = np.ma.filled(data['pmdec'], -9999999.)
# 4.74 * proper motion / parallax; per the original author's note this yields
# km.s-1. NOTE(review): `pmas` is used unfilled here, unlike for `distance`.
vdec = 4.74 * pmdec / pmas
vra = 4.74 * pmra / pmas

FileNotFoundError: [Errno 2] No such file or directory: 'n2516.vot'

### Functions

In [None]:
#lgal,bgal,distance,vra,vdec
def filter_data(lgal, bgal, distance, vra, vdec, cartesian=False,
                dist_range=(0., 2000), vra_range=(-200, 200),
                vdec_range=(-200., 200)):
    """Select the entries lying inside all three ranges and pack them in an (n, 5) array.

    An entry is kept when ``distance``, ``vra`` and ``vdec`` each satisfy
    ``low <= value < high`` for their respective range (the upper bound is
    exclusive). NaN values fail every comparison and are therefore dropped.

    Parameters
    ----------
    lgal, bgal, distance, vra, vdec : 1-D arrays of identical length
        Per-entry quantities; the first three fill columns 0-2 of the
        output, ``vra`` and ``vdec`` fill columns 3 and 4.
    cartesian : bool
        When true, columns 0-2 hold the three values returned by
        ``convert_to_cartesian(lgal, bgal, distance)`` instead of the raw
        inputs. NOTE(review): that helper is not defined in the visible part
        of this file.
    dist_range, vra_range, vdec_range : sequences ``(low, high)``
        Acceptance interval of each filtered quantity.

    Returns
    -------
    numpy.ndarray of shape (n_selected, 5)
    """
    # One combined mask replaces the three index searches and intersections.
    in_range = (
        (distance >= dist_range[0]) & (distance < dist_range[1])
        & (vra >= vra_range[0]) & (vra < vra_range[1])
        & (vdec >= vdec_range[0]) & (vdec < vdec_range[1])
    )
    ifinal = np.where(in_range)[0]

    datask = np.zeros((len(ifinal), 5))

    if cartesian:
        xx, yy, zz = convert_to_cartesian(lgal[ifinal], bgal[ifinal], distance[ifinal])
        datask[:, 0] = xx
        datask[:, 1] = yy
        datask[:, 2] = zz
    else:
        datask[:, 0] = lgal[ifinal]
        datask[:, 1] = bgal[ifinal]
        datask[:, 2] = distance[ifinal]

    # The velocity columns are the same in both coordinate modes.
    datask[:, 3] = vra[ifinal]
    datask[:, 4] = vdec[ifinal]

    return datask


# Normalize every column of the data and also return the normalization vector
def normalization(data, choix_norm=1):
    """Divide each column of ``data`` by a per-column scale factor.

    Parameters
    ----------
    data : 2-D array, shape (n_rows, n_columns)
        Any number of columns is accepted.
    choix_norm : int
        ``0`` scales by the largest absolute value of the column; any other
        value scales by the column's Euclidean norm (``np.linalg.norm``).

    Returns
    -------
    (result, normalization_vector)
        ``result`` has the shape of ``data``; ``normalization_vector`` has
        one scale factor per column, so ``result * normalization_vector``
        gives back ``data``.

    Notes
    -----
    A column made only of zeros has a scale factor of 0 and produces NaN.
    """
    n_columns = data.shape[1]
    result = np.zeros(data.shape)
    normalization_vector = np.zeros(n_columns)

    for i in range(n_columns):
        column = data[:, i]
        if choix_norm == 0:
            normalization_vector[i] = np.max(abs(column))
        else:
            normalization_vector[i] = np.linalg.norm(column)
        result[:, i] = column / normalization_vector[i]

    return result, normalization_vector


# Normalize every column with a linear projection from [min, max] to [0, 1]
def normalization0_1(data):
    """Rescale each column of ``data`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    data : 2-D array, shape (n_rows, n_columns)
        Any number of columns is accepted.

    Returns
    -------
    (result, normalization_vector)
        ``result`` has the shape of ``data``. ``normalization_vector`` has
        shape (n_columns, 2): column 0 holds each column's maximum and
        column 1 its minimum.

    Notes
    -----
    A constant column has max == min and produces NaN (0 / 0).
    """
    n_columns = data.shape[1]
    result = np.zeros(data.shape)
    normalization_vector = np.zeros((n_columns, 2))  # [:, 0] = max, [:, 1] = min

    for i in range(n_columns):
        column = data[:, i]
        normalization_vector[i, 0] = np.max(column)
        normalization_vector[i, 1] = np.min(column)
        col_max, col_min = normalization_vector[i]
        result[:, i] = (column - col_min) / (col_max - col_min)

    return result, normalization_vector

# Undo the [0, 1] normalization: linear projection from [0, 1] back to [min, max]
def unnormalization0_1(data, normalization_vector):
    """Map columns from the [0, 1] scale back to their original [min, max] range.

    Parameters
    ----------
    data : 2-D array, shape (n_rows, n_columns)
        Values expressed on the normalized scale; any number of columns.
    normalization_vector : 2-D array, shape (n_columns, 2)
        Column 0 holds the maximum and column 1 the minimum of each original
        column (the layout returned by ``normalization0_1``).

    Returns
    -------
    numpy.ndarray with the shape of ``data``.
    """
    result = np.zeros(data.shape)
    for i in range(data.shape[1]):
        col_max = normalization_vector[i, 0]
        col_min = normalization_vector[i, 1]
        result[:, i] = data[:, i] * (col_max - col_min) + col_min

    return result

### Data processing

In [None]:
# Pipeline of this cell:
#   1. report how many stars were read,
#   2. drop the entries outside the default ranges -> (n, 5) array `datask`,
#   3. build three normalized copies of `datask`:
#        datanor0 : every column divided by max(abs(column))
#        datanor1 : every column divided by np.linalg.norm(column)
#        datanor2 : every column projected linearly from [min, max] to [0, 1]
#      (`datask` itself stays non-normalized).
n_total = len(lgal)
print("## Total stars: %d" % n_total)

datask = filter_data(lgal, bgal, distance, vra, vdec, cartesian=False)
n_selected = datask.shape[0]
print("## Stars selected: %d" % n_selected)

datanor0, normalization_vector0 = normalization(datask, choix_norm=0)
datanor1, normalization_vector1 = normalization(datask, choix_norm=1)
datanor2, normalization_vector2 = normalization0_1(datask)

In [None]:
## Fit one k-means model per normalization scheme
nclust = 12
print("## computing k-means...")

# Models are created and fitted one after the other, in the order
# non-normalized, norm 0, norm 2, [0, 1] projection. fit() returns the
# estimator itself, so each name is bound to its fitted model.
kmeans, kmeans_nor0, kmeans_nor1, kmeans_nor2 = (
    KMeans(n_clusters=nclust, max_iter=2000, n_init=50).fit(points)
    for points in (datask, datanor0, datanor1, datanor2)
)

# Centroids, mapped back onto the scale of the non-normalized data
centroid = kmeans.cluster_centers_
centroid_nor0 = kmeans_nor0.cluster_centers_ * normalization_vector0
centroid_nor1 = kmeans_nor1.cluster_centers_ * normalization_vector1
centroid_nor2 = unnormalization0_1(kmeans_nor2.cluster_centers_, normalization_vector2)

# Cluster label of every row, per normalization
labels, labels_nor0, labels_nor1, labels_nor2 = (
    kmeans.labels_,
    kmeans_nor0.labels_,
    kmeans_nor1.labels_,
    kmeans_nor2.labels_,
)

In [None]:
# Lookup tables shared by the plotting cells.
# colors: one matplotlib format string per cluster index.
colors = [
    "r.", "k.", "g.", "y.", "b.", "k.", "b.", "g.", "y.", "c.",
    "r.", "k.", "g.", "y.", "w.", "k.", "b.", "g.", "y.", "c.",
    "r.", "k.", "g.", "y.", "w.", "k.", "b.", "g.", "y.", "c.",
]
# string: human-readable name of each normalization, in the order
# non-normalized, norm 0, norm 2, [0, 1] projection.
string = ['non-normalized','normalized norm 0', 'normalized norm 2','normalized linearly between 0 and 1']
# data_name: name of each column of `datask`. filter_data stores vra in
# column 3 and vdec in column 4, so the names must follow that order.
data_name = ['lgal','bgal','distance','vra','vdec']

In [None]:
# One panel per normalization: column i2 of datask against column i1,
# colored by k-means cluster, with each centroid drawn as a large marker.
# (column indices: 0 = lgal; 1 = bgal; 2 = distance...)
i1 = 1
i2 = 2

centro = [centroid,centroid_nor0,centroid_nor1,centroid_nor2]
label_sets = (labels,labels_nor0,labels_nor1,labels_nor2)

plt.figure(figsize=(25,22))
j = 1  # 1-based panel number; incremented once per normalization
for lab, centers, norm_name in zip(label_sets, centro, string):
    plt.subplot(2,2,j)
    for i in range(nclust):
        ilabel = np.where(lab == i)[0]
        xs = datask[ilabel,i1]
        ys = datask[ilabel,i2]
        plt.plot(xs,ys,colors[i],markersize=1)
        plt.plot(centers[i,i1],centers[i,i2],colors[i],markersize=19)
    plt.xlabel(data_name[i1], fontsize=22)
    plt.ylabel(data_name[i2], fontsize=22)
    plt.title("clustering with variables " + norm_name, fontsize=22)
    j += 1

plt.show()

### Cluster plotting for data normalized linearly between 0 and 1

In [None]:
# One panel per cluster of the [0, 1]-normalized k-means run:
# column i2 of datask against column i1, plus the cluster centroid.
# (column indices: 0 = lgal; 1 = bgal; 2 = distance...)
i1 = 1
i2 = 2

plt.figure(figsize=(25,22))
for i in range(nclust):
    plt.subplot(4,3,i+1)
    members = datask[labels_nor2 == i]
    center = centroid_nor2[i]
    plt.plot(members[:,i1],members[:,i2],colors[i],markersize=1)
    plt.plot(center[i1],center[i2],colors[i],markersize=19)
    # No x label on these panels; only the y axis is named.
    plt.ylabel(data_name[i2], fontsize=22)
    panel_title = "num. " + str(i) + " " + data_name[i2] + "=" + str(round(center[i2],2))
    plt.title(panel_title, fontsize=22)

plt.show()

# -------------------------------------------------------------

### Description in pandas variable 

In [None]:
# Wrap each (n, 5) array in a DataFrame whose columns are named after the
# quantity stored in that column of the array.
column_names = ('lgal', 'bgal', 'distance', 'vra', 'vdec')

# Iterating over the transpose yields the columns one by one.
d1 = dict(zip(column_names, datask.T))
d2 = dict(zip(column_names, datanor0.T))
d3 = dict(zip(column_names, datanor1.T))
d4 = dict(zip(column_names, datanor2.T))

datapd = pd.DataFrame(data=d1)
datapd_nor0 = pd.DataFrame(data=d2)
datapd_nor1 = pd.DataFrame(data=d3)
datapd_nor2 = pd.DataFrame(data=d4)

### Description for one element

In [None]:
## Histogram of every variable of the non-normalized data, one panel each
plt.figure(figsize=(25,15))
# The loop variable is deliberately NOT called `string`: that name is the
# module-level list of normalization names used by other plotting cells.
for i, column in enumerate(('lgal','bgal','distance','vra','vdec'), start=1):
    plt.subplot(2,3,i)
    datapd[column].hist()
    plt.xlabel(column, fontsize=25)

In [None]:
# Box plot of the five variables, one figure per normalization.
string = ['non-normalized','normalized norm 0', 'normalized norm 2','normalized linearly between 0 and 1']
frames = (datapd,datapd_nor0,datapd_nor1,datapd_nor2)

for data_i, norm_name in zip(frames, string):
    # DataFrame.plot opens its own figure when no axes are passed, so no
    # separate plt.figure() call is made (it would only add an empty figure).
    data_i.plot(kind = "box",figsize=(15, 7))
    plt.title("Boxplot, variables " + norm_name, fontsize=18)

plt.show()

### Hierarchical Clustering Dendrogram

In [None]:
# Ward linkage with the Euclidean metric for each data set, in the order
# non-normalized, norm 0, norm 2, [0, 1] projection.
Z, Z0, Z1, Z2 = (
    linkage(frame, method='ward', metric='euclidean')
    for frame in (datapd, datapd_nor0, datapd_nor1, datapd_nor2)
)

In [None]:
# For every linkage, plot the 16 largest merge heights (third column of the
# linkage matrix) in decreasing order, to help choose the number of classes.
string = ['non-normalized','normalized norm 0', 'normalized norm 2','normalized linearly between 0 and 1']
Z_ = [Z,Z0,Z1,Z2]
t = np.zeros((16,4))  # column i keeps the 16 heights plotted for linkage i
plt.figure(figsize=(14,10))
x = np.arange(1, 17)
for i, (links, norm_name) in enumerate(zip(Z_, string)):
    plt.subplot(2,2,i+1)
    height = sorted(links[:,2], reverse=True)
    largest = height[0:16]
    # Raw heights are plotted (not percentages of their sum).
    plt.scatter(x, largest)
    t[:,i] = largest
    plt.xlabel('Index')
    plt.ylabel('Height')
    plt.title("Choice of the number of classes for " + norm_name)
plt.show()

In [None]:
# Dendrogram of each linkage, one panel per normalization; leaves are
# labelled with the row index of the matching DataFrame.
string = ['non-normalized','normalized norm 0', 'normalized norm 2','normalized linearly between 0 and 1']
Z_ = [Z,Z0,Z1,Z2]
data_ = [datapd, datapd_nor0, datapd_nor1, datapd_nor2]
plt.figure(figsize=(20,16))
x = np.arange(1, 17)  # not read by this cell; kept so the name stays defined
for i, (links, frame, norm_name) in enumerate(zip(Z_, data_, string)):
    plt.subplot(2,2,i+1)
    dendrogram(links, leaf_font_size=8., labels=frame.index)
    plt.xlabel('Individus')
    plt.ylabel('Distance')
    plt.title(norm_name)
plt.show()

In [None]:
# Dendrogram of the Ward linkage computed on the non-normalized data.
plt.figure(figsize=(15, 8))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Individus')
plt.ylabel('Distance')
# Z was built from datapd, so the leaves are labelled with datapd's index
# (the previous label source was a name that is never defined in this file).
dendrogram(Z,leaf_font_size=8.,labels=datapd.index)
plt.show()

In [None]:
# Cut the linkage of the non-normalized data with the 'distance' criterion
# at threshold 15, then show how many rows fall in each flat cluster.
cut_threshold = 15
classesCAH = fcluster(Z, cut_threshold, criterion='distance')
class_frame = pd.DataFrame(classesCAH)
class_frame.hist()
plt.show()

In [None]:
# Display the flat-cluster label that fcluster assigned to every row
print(classesCAH)

In [None]:
# Scatter of datask column 2 against column 1, colored by hierarchical class.
colors = ["r.","k.","g.","y.","b.","k.","b.","g.","y.","c.","r.","k.","g.","y.","w.","k.","b.","g.","y.","c.","r.","k.","g.","y.","w.","k.","b.","g.","y.","c."]
rcParams['figure.figsize'] = 8, 8

# fcluster numbers its flat clusters starting at 1, so iterate over the labels
# that are actually present rather than over a fixed 0-based range.
for class_id in np.unique(classesCAH):
    ilabel = np.where(classesCAH == class_id)[0]
    # Wrap around the color table when there are more classes than colors.
    fmt = colors[(class_id - 1) % len(colors)]
    plt.plot(datask[ilabel,1],datask[ilabel,2],fmt,markersize=1)

plt.show()

In [None]:
# One panel per cluster of the norm-0 k-means run: datask column i2 against
# column i1 for the members of the cluster, plus the cluster centroid.
colors = ["r.","k.","g.","y.","b.","k.","b.","g.","y.","c.","r.","k.","g.","y.","w.","k.","b.","g.","y.","c.","r.","k.","g.","y.","w.","k.","b.","g.","y.","c."]
rcParams['figure.figsize'] = 16, 8

i1 = 1
i2 = 2
plt.figure(figsize=(25,22))

for i in range(nclust):
    plt.subplot(4,3,i+1)
    # Row indices of the members of cluster i. These indices (not the label
    # array itself) select the rows to draw.
    ilabel = np.where(labels_nor0 == i)[0]
    plt.plot(datask[ilabel,i1],datask[ilabel,i2],colors[i],markersize=1)
    plt.plot(centroid_nor0[i,i1],centroid_nor0[i,i2],colors[i],markersize=19)
    plt.xlabel("bgal", fontsize=22)
    plt.ylabel("distance", fontsize=22)
    # Title carries the number of the cluster shown in this panel.
    plt.title("cluster num. " + str(i), fontsize=22)

plt.show()