<a href="https://colab.research.google.com/github/Yasminebenhamadi/NMA/blob/main/IncrementalPy/yasmine/KL_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from sklearn import mixture
import random
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import scipy.stats
from sklearn.datasets import make_spd_matrix,make_blobs
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture as GMM
from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal as mvn
from sklearn.cluster import KMeans
from sklearn.metrics import pair_confusion_matrix, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from numpy import linalg as la
from sklearn.preprocessing import MinMaxScaler
from scipy.stats.stats import kruskal
from sklearn.manifold import TSNE
from scipy.stats import zscore
import re
from sklearn.model_selection import GridSearchCV
import os
import csv


# Global plotting defaults for the notebook.
# The bundled 'seaborn-dark' style was renamed to 'seaborn-v0_8-dark' in
# Matplotlib 3.6 and the old alias was later removed, so use whichever name
# this installation actually provides.
_dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(_dark_style)
plt.rcParams['figure.figsize'] = 14, 6

  from scipy.stats.stats import kruskal


### utilities

In [None]:
#@title Figure Settings
# Notebook-only cell: the `%config` line below is an IPython magic and is not
# valid in a plain Python script.
import ipywidgets as widgets       # interactive display
# Request 'retina' (high-resolution) output from the inline figure backend.
%config InlineBackend.figure_format = 'retina'
# Apply the Neuromatch Academy Matplotlib style sheet, loaded from this URL at run time.
plt.style.use("https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle")

In [None]:
def visualize_components(component1, component2, labels, show=True):
  """
  Plots a 2D representation of the data for visualization with categories
  labelled as different colors.

  The colorbar is drawn with ticks 0..9 and colour limits (-0.5, 9.5), so the
  colouring is laid out for label values in that range.

  Args:
    component1 (numpy array of floats) : Vector of component 1 scores
    component2 (numpy array of floats) : Vector of component 2 scores
    labels (numpy array of floats)     : Vector corresponding to categories of
                                         samples
    show (bool)                        : When True, call plt.show() after
                                         drawing the figure

  Returns:
    Nothing.

  """

  plt.figure()
  # plt.cm.get_cmap is deprecated (and removed in recent Matplotlib releases);
  # pyplot.get_cmap is the supported way to look up a colormap by name.
  cmap = plt.get_cmap('tab10')
  plt.scatter(x=component1, y=component2, c=labels, cmap=cmap)
  plt.xlabel('Component 1')
  plt.ylabel('Component 2')
  plt.colorbar(ticks=range(10))
  plt.clim(-0.5, 9.5)
  if show:
    plt.show()

In [None]:
def gmm_bic_score(estimator, X):
    """Scorer for GridSearchCV based on the estimator's BIC.

    GridSearchCV maximises its score, so the value of ``estimator.bic(X)``
    is returned with its sign flipped.
    """
    bic_value = estimator.bic(X)
    return -bic_value

In [None]:
def ari(labels_true,labels_pred):
    '''Adjusted Rand index computed from the pair confusion matrix.

    The four pair counts are converted to Python ints before any arithmetic,
    so the products below are evaluated with arbitrary-precision integers.
    '''
    confusion = pair_confusion_matrix(labels_true, labels_pred)
    (tn, fp), (fn, tp) = [[int(count) for count in row] for row in confusion]

    # Special cases: empty data or full agreement
    if not (fn or fp):
        return 1.0

    numerator = tp * tn - fn * fp
    denominator = (tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)
    return 2. * numerator / denominator

In [None]:
def f1_score(labels_true,labels_pred):
    '''Pair-counting F1 score between two labelings.

    Precision and recall are computed over sample pairs taken from
    ``pair_confusion_matrix``: a "positive" is a pair counted in the
    bottom-right cell (tp) of that matrix.

    Returns:
        1.0 when there are no false positives and no false negatives
        (including the degenerate case with no pairs at all), 0.0 when there
        are no true-positive pairs, otherwise the harmonic mean of pair
        precision and pair recall.
    '''
    (_, fp), (fn, tp) = pair_confusion_matrix(labels_true, labels_pred)
    tp=int(tp)
    fp=int(fp)
    fn=int(fn)

    # Special cases: empty data or full agreement. This must be checked
    # before dividing, otherwise tp + fp (or tp + fn) can be zero.
    if fn == 0 and fp == 0:
        return 1.0

    # No correctly grouped pair at all: precision and recall are both zero
    # and the harmonic mean would divide by zero.
    if tp == 0:
        return 0.0

    precision= tp/(tp+fp)
    recall= tp/(tp+fn)

    return 2. * precision * recall / (precision+recall)

In [None]:
def unison_shuffled_copies(a, b):
    """Return shuffled copies of `a` and `b` using one shared random permutation.

    Both inputs must have the same length; element i of each output comes
    from the same original position in `a` and `b`.
    """
    length = len(a)
    assert length == len(b)
    order = np.random.permutation(length)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

In [None]:
def min_max(data):
  """Fit a MinMaxScaler on `data` and return the transformed data."""
  fitted_scaler = MinMaxScaler().fit(data)
  return fitted_scaler.transform(data)

## Plotting

In [None]:
def plot_bivariate_data(X, title):
  """Scatter the first two columns of `X` as grey dots and show the figure.

  The axes occupy the right-hand column of a 2x2 grid on an 8x4 figure.
  """
  figure = plt.figure(figsize=[8, 4])
  grid = figure.add_gridspec(2, 2)
  axis = figure.add_subplot(grid[:, 1])
  marker_style = dict(markerfacecolor=[.5, .5, .5], markeredgewidth=0)
  first_feature, second_feature = X[:, 0], X[:, 1]
  axis.plot(first_feature, second_feature, '.', **marker_style)
  axis.set_xlabel('Feature 1')
  axis.set_ylabel('Feature 2')
  axis.set_title(title)
  plt.show()

In [None]:
def plot_contours(data, means, covs,labels, title, xa=(-12,12),ya=(-12,12)):
    """Visualize the gaussian components over the data.

    Scatters the first two columns of `data` coloured by `labels`, then draws
    the density contours of one bivariate normal per (mean, covariance) pair.

    Args:
        data: 2D array; columns 0 and 1 are plotted.
        means: sequence of component means (may be empty to draw no contours).
        covs: sequence of component covariances, aligned with `means`.
        labels: per-sample values used to colour the scatter points.
        title: figure title.
        xa, ya: (low, high) extent of the contour grid along x and y.
    """
    plt.figure()
    # plt.cm.get_cmap is deprecated (removed in recent Matplotlib releases).
    cmap = plt.get_cmap('tab10')
    plt.scatter(data[:, 0], data[:, 1],c=labels, cmap=cmap, s=40 ,alpha=0.4)

    # len() works for both arrays and plain lists (including the empty list
    # callers pass when they only want the scatter plot).
    k = len(means)
    if k > 0:
        # Only build the evaluation grid when there is something to draw.
        delta = 0.025
        x = np.arange(xa[0], xa[1], delta)
        y = np.arange(ya[0], ya[1], delta)
        x_grid, y_grid = np.meshgrid(x, y)
        coordinates = np.array([x_grid.ravel(), y_grid.ravel()]).T

        col = ['cyan', 'red', 'indigo','blue','white']
        for i in range(k):
            density = multivariate_normal(means[i], covs[i])
            z_grid = density.pdf(coordinates).reshape(x_grid.shape)
            # Cycle through the palette so more than len(col) components
            # no longer raise IndexError.
            plt.contour(x_grid, y_grid, z_grid, colors = col[i % len(col)])

    plt.title(title)
    plt.tight_layout()

## Generate data

In [None]:
def generate_bivariate_data(nbr_components=1,min_mean=[-10],max_mean=[10],scale=[], size=1000):
  """Sample `nbr_components` bivariate Gaussians sharing one covariance.

  Component i has mean [5*i, u_i], with u_i drawn uniformly between
  `min_mean` and `max_mean`. When `scale` is empty a random SPD matrix is
  generated with make_spd_matrix.

  Returns:
    (samples, means, scale) where `samples` is the (size, 2) block that ends
    up first after the per-component blocks are shuffled (i.e. the samples
    of ONE component only), `means` lists every component mean in generation
    order, and `scale` is the covariance that was used.
  """
  first_coords = [5 * idx for idx in range(nbr_components)]
  second_coords = np.random.uniform(low=min_mean, high=max_mean, size=nbr_components)
  if len(scale) == 0:
    scale = make_spd_matrix(n_dim=2)

  means = [[a, b] for a, b in zip(first_coords, second_coords)]
  samples = np.zeros((nbr_components, size, 2))
  for idx, centre in enumerate(means):
    samples[idx] = np.random.multivariate_normal(mean=centre, cov=scale, size=size)

  # Shuffles the order of the component blocks (axis 0), not the samples.
  np.random.shuffle(samples)
  return samples[0], means, scale

In [None]:
def generate_bivariate_component(mean,scale,size=1000):
  """Draw `size` samples from a single multivariate normal and shuffle them.

  Args:
    mean: mean vector of the component.
    scale: covariance matrix; when None or empty a random 2x2 SPD matrix is
      generated with make_spd_matrix.
    size: number of samples to draw.

  Returns:
    Array of samples with the rows in shuffled order.
  """
  # The original pre-allocated a zeros array that was immediately
  # overwritten; it has been dropped.
  if scale is None or len(scale)==0:
    scale= make_spd_matrix(n_dim=2)
  data=np.random.multivariate_normal(mean=mean, cov=scale, size=size)
  np.random.shuffle(data)
  return data

In [None]:
def generate_bivariate_data_overlap(nbr_components=2,mean=[0,0],var_1=1,var_2=10, size=1000):
  """Sample overlapping bivariate Gaussians that share one mean.

  Two covariance matrices are built: `scale1` has variances (var_1, var_2)
  and `scale2` has them swapped. Odd-indexed components use `scale1`,
  even-indexed ones use `scale2`.

  Returns:
    (all_data, first, second, [mean, scale1, scale2]) where `first` and
    `second` are the first two component blocks AFTER the blocks have been
    shuffled, and `all_data` is their concatenation. Because of the shuffle
    the returned blocks are not in a fixed correspondence with
    scale1/scale2. Only the first two components are returned, so
    `nbr_components` must be at least 2.
  """
  data = np.zeros((nbr_components,size,2))
  # A covariance matrix must be symmetric: draw ONE off-diagonal value per
  # matrix and use it on both sides (the original drew two independent
  # values, producing an invalid, non-symmetric matrix).
  cov1 = random.uniform(0, 1.5)
  cov2 = random.uniform(0, 1.5)
  scale1= [[var_1,cov1],[cov1,var_2]]
  scale2= [[var_2,cov2],[cov2,var_1]]
  for i in range(nbr_components):
    s=np.random.multivariate_normal(mean=mean, cov=scale1 if i%2==1 else scale2, size=size)
    data[i] = s

  # Shuffles the order of the component blocks (axis 0), not the samples.
  np.random.shuffle(data)
  return np.concatenate((data[0], data[1])), data[0],data[1],[mean,scale1,scale2]

# KL Divergence

In [None]:
def kl_mvn(m0, S0, m1, S1):
    """
    Kullback-Leibler divergence KL( N(m0, S0) || N(m1, S1) ) between two
    multivariate Gaussians with full covariance matrices.

    From wikipedia
    KL( (m0, S0) || (m1, S1))
         = .5 * ( tr(S1^{-1} S0) + log |S1|/|S0| +
                  (m1 - m0)^T S1^{-1} (m1 - m0) - N )

    Args:
        m0, m1: mean vectors of length N.
        S0, S1: (N, N) covariance matrices.

    Returns:
        The divergence as a float.
    """
    m0 = np.asarray(m0, dtype=float)
    m1 = np.asarray(m1, dtype=float)
    S0 = np.asarray(S0, dtype=float)
    S1 = np.asarray(S1, dtype=float)

    N = m0.shape[0]
    diff = m1 - m0

    # kl is made of three terms
    # Solve linear systems instead of forming S1^{-1} explicitly (twice).
    tr_term   = np.trace(np.linalg.solve(S1, S0))
    # Work with log-determinants directly. The previous
    # log(det(S1) / (det(S0) + 1e-6)) was dominated by the 1e-6 offset as
    # soon as the determinants were small (e.g. scaled, high-dimensional
    # data), so that even KL(p || p) was far from zero.
    _, logdet0 = np.linalg.slogdet(S0)
    _, logdet1 = np.linalg.slogdet(S1)
    det_term  = logdet1 - logdet0
    quad_term = diff @ np.linalg.solve(S1, diff)
    return .5 * (tr_term + det_term + quad_term - N)

In [None]:
def KL_matrix(m0,S0,m1,S1):
  """Pairwise KL divergences between two sets of Gaussian components.

  Entry [i, j] of the returned (k0, k1) array is
  kl_mvn(m0[i], S0[i], m1[j], S1[j]).
  """
  n_rows, n_cols = m0.shape[0], m1.shape[0]
  divergences = np.zeros((n_rows, n_cols))
  for row, col in np.ndindex(n_rows, n_cols):
    divergences[row, col] = kl_mvn(m0[row], S0[row], m1[col], S1[col])
  return divergences

# Incremental EM for GMM

In [None]:
def trainGMM_lot(data_x,num,n_components,dim, random_state, covariance_type='full',max_iter = 100,tol=1e-03):
  """Fit `num` independent Gaussian mixtures, one per entry of `data_x`.

  Returns:
    (weights, means, covariances, models). The first three are arrays
    stacked over the `num` fitted models; when `num == 1` they are reshaped
    to (n_components,), (n_components, dim) and (n_components, dim, dim).
    `models` is the list of fitted estimators.
  """
  models = [
      GMM(n_components=n_components, covariance_type=covariance_type,
          random_state=random_state, max_iter=max_iter, tol=tol)
      for _ in range(num)
  ]
  for idx, model in enumerate(models):
    model.fit(data_x[idx])

  stacked_weights = np.array([model.weights_ for model in models])
  stacked_means = np.array([model.means_ for model in models])
  stacked_covs = np.array([model.covariances_ for model in models])

  if num == 1:
    # Drop the leading "model" axis when there is a single mixture.
    stacked_weights = stacked_weights.reshape(n_components)
    stacked_means = stacked_means.reshape(n_components, dim)
    stacked_covs = stacked_covs.reshape(n_components, dim, dim)
  return stacked_weights, stacked_means, stacked_covs, models

In [None]:
def trainGMM(data_x,n_components,dim, random_state,covariance_type='full',max_iter = 100,tol=1e-03):
  """Fit one Gaussian mixture on `data_x`.

  `dim` is accepted for signature compatibility but is not used here.

  Returns:
    (weights_, means_, covariances_, fitted_estimator)
  """
  settings = dict(
      n_components=n_components,
      covariance_type=covariance_type,
      random_state=random_state,
      max_iter=max_iter,
      tol=tol,
  )
  model = GMM(**settings)
  model.fit(data_x)
  return model.weights_, model.means_, model.covariances_, model


In [None]:
def merge_two(n0,w0,m0,s0,n1,w1,m1,s1):
    """Merge two weighted Gaussian components into a single one.

    Each component i contributes a mass n_i * w_i. The merged mean is the
    mass-weighted average of the means; the merged covariance is the
    mass-weighted average of the covariances plus the mass-weighted average
    of outer(m_i, m_i), minus outer(new_mean, new_mean).

    Returns:
        (new_weight, new_mean, new_cov) with
        new_weight = (n0*w0 + n1*w1) / (n0 + n1).
    """
    def second_moment(vec):
        return np.outer(np.transpose(vec), vec)

    mass0 = n0 * w0
    mass1 = n1 * w1
    total_mass = mass0 + mass1

    new_weight = total_mass / (n0 + n1)
    new_mean = (mass0 * m0 + mass1 * m1) / total_mass

    averaged_cov = (mass0 * s0 + mass1 * s1) / total_mass
    averaged_moment = (mass0 * second_moment(m0) + mass1 * second_moment(m1)) / total_mass
    new_cov = averaged_cov + averaged_moment - second_moment(new_mean)
    return new_weight, new_mean, new_cov


In [None]:
def delete_e(w,i):
  """Return a new array equal to `w` without its i-th entry along axis 0.

  The array is round-tripped through a Python list, so the result is
  rebuilt with np.array from the remaining items.
  """
  remaining = w.tolist()
  del remaining[i]
  return np.array(remaining)

In [None]:
def delete_all(w,m,s,i):
  """Remove entry `i` from the weights, means and covariances arrays.

  Returns the three reduced arrays as a tuple, in the same order.
  """
  reduced_w, reduced_m, reduced_s = (delete_e(values, i) for values in (w, m, s))
  return reduced_w, reduced_m, reduced_s

In [None]:
def merge(n0,w0,m0,s0,n1,w1,m1,s1):
  """Greedily pair and merge the components of two Gaussian mixtures.

  A KL matrix between the components of mixture 0 and mixture 1 is built
  once. Then, as many times as mixture 0 has components, the pair with the
  smallest remaining KL value is merged with merge_two and both members of
  the pair are removed from further consideration.

  Returns:
    (weights, means, covariances) arrays of the merged components, in the
    order the pairs were selected.
  """
  divergences = KL_matrix(m0, s0, m1, s1)
  merged = []
  for _ in range(divergences.shape[0]):
    row, col = np.unravel_index(divergences.argmin(), divergences.shape)
    merged.append(
        merge_two(n0, w0[row], m0[row], s0[row], n1, w1[col], m1[col], s1[col])
    )
    # Drop the matched row/column and the matching parameters so that the
    # indices of the KL matrix and of the parameter arrays stay aligned.
    divergences = np.delete(np.delete(divergences, row, 0), col, 1)
    w0, m0, s0 = delete_all(w0, m0, s0, row)
    w1, m1, s1 = delete_all(w1, m1, s1, col)

  new_weights = [item[0] for item in merged]
  new_means = [item[1] for item in merged]
  new_covs = [item[2] for item in merged]
  return np.array(new_weights), np.array(new_means), np.array(new_covs)

In [None]:
def incGMM(data,dim, n_components, increments_number, random_state, covariance_type='full',max_iter = 100,tol=1e-03, true_lables=[], incPrint=True):
  """Incrementally estimate a Gaussian mixture by merging per-increment fits.

  `data` is cut into `increments_number` consecutive slices (the last slice
  also takes any remainder). A GMM is fitted on each slice and merged into
  the running model with `merge`.

  Args:
    data: 2D array of samples (rows).
    dim: number of features, forwarded to trainGMM.
    n_components: number of mixture components per fit.
    increments_number: number of consecutive slices.
    random_state: seed forwarded to every per-increment fit.
    covariance_type: accepted for signature compatibility; per-increment
      fits always use 'full' because the merge formulas operate on full
      covariance matrices.
    max_iter, tol: EM settings forwarded to every per-increment fit.
    true_lables: optional per-sample labels used only for plotting.
    incPrint: when True and the data is 2-D, plot/save diagnostic figures.

  Returns:
    (weights, means, covariances, gmm) where `gmm` is the estimator fitted
    on the LAST increment only (not the merged model).
  """
  size_increments=int(len(data)/increments_number)
  increments=[]
  inc_labels=[]
  weights=[]
  means=[]
  covariances=[]
  n_seen=0  # number of samples already summarised by the running model
  gmm = 0
  for i in range(increments_number):
      start = i*size_increments
      # The last increment absorbs the remainder of the division.
      stop = data.shape[0] if i == (increments_number - 1) else start+size_increments
      inc = data[start:stop,:]
      increments.append(inc)
      # One label per sample actually in the increment, so the label vector
      # always has the same length as the data (also with a remainder).
      inc_labels.append(np.full(len(inc), i))
      # max_iter/tol are now honoured instead of being hard-coded.
      w,m,covs,gmm = trainGMM(inc,n_components,dim, covariance_type='full',random_state=random_state,max_iter = max_iter,tol=tol)
      n_inc=len(inc)
      if len(weights)>0:
        # Each sample count must accompany the parameters it describes: the
        # new fit is backed by n_inc samples, the running model by n_seen.
        # (These two counts were previously swapped.)
        w,m,covs=merge(n_inc,w,m,covs,n_seen,weights,means,covariances)
      weights,means,covariances=w,m,covs
      n_seen=n_seen+n_inc
      if incPrint and (len(true_lables)>0) and data.shape[1]==2:
        plot_contours(inc,means,covariances,true_lables[start:stop], "increments_true")
        plt.savefig("inc_true{}.png".format(i))
  if incPrint and data.shape[1]==2:
    # Concatenate instead of reshaping a (possibly ragged) array of increments.
    plot_contours(np.concatenate(increments),[],[],np.concatenate(inc_labels), "increments_partition")
  return np.array(weights), np.array(means), np.array(covariances), gmm

# Tests

In [None]:
def getFinalGmm(data, means, covariances, weights, true_labels = [], plot=True):
    """Build a GaussianMixture from explicit parameters and assign `data`.

    No fitting is done: means, covariances, weights and the Cholesky factors
    of the precisions are written directly onto a fresh estimator.

    Args:
        data: 2D array of samples to assign.
        means, covariances, weights: arrays of full-covariance mixture
            parameters.
        true_labels: optional labels; plotting only happens when this is
            non-empty.
        plot: when True (and data is 2-D and labels were given) draw the
            components over the data.

    Returns:
        (gmm, assign): the estimator and the predicted component per sample.
    """
    gmm=GMM(n_components=weights.shape[0],covariance_type='full',max_iter=1)
    gmm.means_=means
    gmm.covariances_=covariances
    gmm.weights_=weights
    # precisions_cholesky_ is set to the transposed inverse of each
    # covariance's lower Cholesky factor.
    inverse_factors = np.linalg.inv(np.linalg.cholesky(covariances))
    gmm.precisions_cholesky_= np.array([np.transpose(p) for p in inverse_factors])
    assign = gmm.predict(data)
    # The parenthesis used to be misplaced (len(true_labels>0)), which
    # raised TypeError for the default empty list.
    if plot and len(true_labels)>0 and data.shape[1]==2:
        plot_contours(data,means,covariances,assign, "increments")
    return gmm, assign


In [None]:
def compare(data,dim, n_components, increments_number, random_state, true_labels = [], printB=True, max_iter = 100,tol=1e-03, covariance_type='full'):
    """Compare incremental, full-data and first-increment-only GMMs.

    Three models are evaluated on the whole of `data`:
      * "inc":  the incremental model from incGMM / getFinalGmm,
      * "all":  a GMM fitted on all the data,
      * "part": a GMM fitted on the first increment only.

    Returns:
        A flat list of 17 metrics in this order:
        log-likelihood (all, part, inc);
        Davies-Bouldin (true, all, part, inc);
        Calinski-Harabasz (true, all, part, inc);
        ARI (all, part, inc); pair F1 (all, part, inc).
    """
    weights, means, covariances, gm = incGMM(data,dim, n_components=n_components, increments_number=increments_number, covariance_type=covariance_type,random_state=random_state,max_iter = max_iter,tol=tol, true_lables=true_labels, incPrint=printB)
    gmm_inc, assign_inc = getFinalGmm(data, means, covariances, weights, true_labels = true_labels, plot=printB)

    # The baselines are seeded with the same random_state as the incremental
    # model so that a given seed yields reproducible metrics.
    all_gmm=GMM(n_components=n_components, covariance_type=covariance_type,max_iter = max_iter,tol=tol, random_state=random_state)
    all_gmm.fit(data)
    assign_all = all_gmm.predict(data)

    partial_gmm=GMM(n_components=n_components, covariance_type=covariance_type,max_iter = max_iter,tol=tol, random_state=random_state)
    partial_gmm.fit(data[0:int(len(data)/increments_number)])
    assign_part = partial_gmm.predict(data)

    # Flatten once; also accepts plain lists.
    labels = np.asarray(true_labels).ravel()
    predictions = [assign_all.ravel(), assign_part.ravel(), assign_inc.ravel()]

    all_metrics = [all_gmm.score(data), partial_gmm.score(data), gmm_inc.score(data)]
    all_metrics.extend([davies_bouldin_score(data,labels)] + [davies_bouldin_score(data,p) for p in predictions])
    all_metrics.extend([calinski_harabasz_score(data,labels)] + [calinski_harabasz_score(data,p) for p in predictions])
    all_metrics.extend([ari(labels,p) for p in predictions])
    all_metrics.extend([f1_score(labels,p) for p in predictions])

    return all_metrics

In [None]:
def getIC(data,dim, n_components, increments_number, random_state, true_labels = [], printB=True, max_iter = 100,tol=1e-03, covariance_type='full'):
  """Information criteria of the incremental GMM versus a full-data GMM.

  Returns:
    A 4-tuple in this exact order (note that the order differs between the
    two models): (BIC incremental, AIC incremental, AIC full, BIC full).
  """
  weights, means, covariances, _ = incGMM(data,dim, n_components=n_components, increments_number=increments_number, covariance_type=covariance_type,random_state=random_state,max_iter = max_iter,tol=tol, true_lables=true_labels, incPrint=printB)
  gmm_inc, _ = getFinalGmm(data, means, covariances, weights, true_labels = true_labels, plot=printB)

  # Seed the baseline like the incremental model so results are reproducible.
  all_gmm=GMM(n_components=n_components, covariance_type=covariance_type,max_iter = max_iter,tol=tol, random_state=random_state)
  all_gmm.fit(data)

  return gmm_inc.bic(data), gmm_inc.aic(data), all_gmm.aic(data), all_gmm.bic(data)

In [None]:
def kmeans_reorganize(toute,number_comp,increments_number, random_state):
    """Reorder samples so every increment holds a share of each k-means cluster.

    The data is clustered with KMeans. Each cluster is cut into
    `increments_number` equal slices (samples left over by the integer
    division are dropped), and the output lists, for increment 0, 1, ...,
    the matching slice of every cluster in turn.

    Args:
        toute: 2D array of samples (rows).
        number_comp: number of k-means clusters.
        increments_number: number of increments to build.
        random_state: seed for KMeans.

    Returns:
        2D array containing the reordered samples.
    """
    # The estimator used to be bound to `means` while the code below used
    # `kmeans`, which raised NameError.
    kmeans = KMeans(init="k-means++", n_clusters=number_comp, n_init=4, random_state=random_state)
    kmeans.fit(toute)
    assigned = kmeans.predict(toute)

    samples = np.asarray(toute)
    cluster_ids, cluster_sizes = np.unique(assigned, return_counts=True)
    clusters = [samples[assigned == cluster_id] for cluster_id in cluster_ids]
    sizes_cluster_per_increment = cluster_sizes//increments_number

    pieces = []
    for i in range(increments_number):
        # Iterating over the clusters actually found also covers the
        # single-cluster case, which previously left the result undefined.
        for cluster, step in zip(clusters, sizes_cluster_per_increment):
            pieces.append(cluster[i*step:(i+1)*step])
    return np.concatenate(pieces, axis=0)

## R generated data

In [None]:
def read_info_file(path):
    """Extract dataset parameters from a text log file.

    Every line containing one of the known labels is parsed by converting
    its last whitespace-separated token. If a label appears on several
    lines, the last occurrence wins. Fields that are never found stay 0.

    Args:
        path: path of the log file to read.

    Returns:
        (N, dim, number_comp, sepVal, outliers)

    Raises:
        ValueError: if the last token of a matching line is not a number of
            the expected kind.
    """
    # label searched in the line -> (result field, converter)
    parsers = {
        "sepVal": ("sepVal", float),
        "Number of clusters": ("number_comp", int),
        "Number of dimensions": ("dim", int),
        "Number of data points": ("N", int),
        "Number of outliers": ("outliers", float),
    }
    values = {"N": 0, "dim": 0, "number_comp": 0, "sepVal": 0, "outliers": 0}
    with open(path, 'r') as fp:
        for row in fp:
            # split() without argument ignores trailing spaces, tabs and the
            # newline; split(' ')[-1] returned an empty/newline token when a
            # line ended with whitespace.
            tokens = row.split()
            if not tokens:
                continue
            for word, (field, convert) in parsers.items():
                if word in row:
                    values[field] = convert(tokens[-1])

    return values["N"], values["dim"], values["number_comp"], values["sepVal"], values["outliers"]

In [None]:
# Driver cell: for every dataset folder under mainDir, run `compare` with
# several increment counts and write the averaged metrics to one CSV file.
mainDir="C:/Users/benhamya/PFE/Rdatasets/datasets/balanced/"
datasets=[name for name in os.listdir(mainDir) if os.path.isdir(os.path.join(mainDir, name))]
datasets=np.sort(datasets)
i=0  # number of data folders processed so far
with open(mainDir+'results_dimBig6.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    header=["DatasetNumber", "size", "dim", "Percentage of increment", "cluster size", "number of clusters", "sepVal", "Percentage of outliers", "loglikelihood all", "loglikelihood part", "loglikelihood inc", "DBS true", "DBS all", "DBS part", "DBS inc", "CHS true", "CHS all", "CHS part", "CHS inc", "ARI all", "ARI part", "ARI inc", "F1 all", "F1 part", "F1 inc"]
    writer.writerow(header)
    for dataset in datasets:
        dataset_dir=mainDir+dataset+"/"
        folder = os.listdir(dataset_dir)
        for s in folder:
            data_file_chemin = dataset_dir+s+"/"
            # Skip stray files (e.g. saved figures) sitting next to the data folders.
            if not os.path.isdir(data_file_chemin):
                continue
            files =os.listdir(data_file_chemin)
            mat_files = [f for f in files if f.endswith(".mat")]
            mem_files = [f for f in files if f.endswith(".mem")]
            log_files = [f for f in files if (f.endswith(".log") and not f.endswith("info.log"))]
            # Skip folders that are empty or miss one of the three required files.
            if not (mat_files and mem_files and log_files):
                continue

            data_file =  data_file_chemin+mat_files[0]
            print(data_file)
            labels_file =data_file_chemin+mem_files[0]
            info_file = data_file_chemin+log_files[0]
            N, dim, number_comp, sepVal, outliers = read_info_file(info_file)

            data=pd.read_csv(data_file,sep=" ",header=None)
            data_labels=pd.read_csv(labels_file,header=None)
            data_labels.rename(columns = {0:2}, inplace = True)
            i=i+1
            # `toute` (data + label column, NaN rows dropped) is kept as a
            # module-level variable; NOTE that the `data` and `labels_true`
            # used for the experiments below are NOT filtered for NaNs.
            toute=pd.concat([data,data_labels],axis=1)
            toute=toute.dropna()
            toute=toute.to_numpy()
            labels_true=data_labels.to_numpy().reshape(-1)
            data=data.to_numpy()
            data=min_max(data)
            #***********************************************GMM***********************************************
            number_tries=5
            # A dedicated loop variable is used so the folder counter `i`
            # is no longer overwritten by the increment count.
            for increments_number in [2,3,4,10,20]:
                print(increments_number)
                # The first 8 header columns are descriptive, the rest are metrics.
                metrics = np.zeros(len(header)-8)
                for t in range(number_tries):
                    metrics_i = compare(data, dim=dim, random_state=t, n_components=number_comp, increments_number=increments_number, true_labels = labels_true, printB=False, max_iter = 100,tol=1e-03)
                    metrics = metrics+metrics_i
                metrics = metrics/number_tries
                info = [dataset, N, dim, 100/increments_number, int(N/number_comp), number_comp, sepVal, outliers]
                info.extend(metrics)
                writer.writerow(info)
# The `with` block closes the file; no explicit close() is needed.

C:/Users/benhamya/PFE/Rdatasets/datasets/balanced/dimBig/6/big-20000-0-300-10.mat
2
