In [None]:
import os
import keras
import metrics
import numpy as np
import pandas as pd
import keras.backend as K
from keras.models import load_model
from keras.optimizers import SGD
from time import time
import matplotlib.pyplot as plt
from keras.initializers import VarianceScaling,RandomNormal
from keras.engine.topology import Layer, InputSpec
import seaborn as sns
import pickle
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation, rc
from IPython.display import HTML

In [None]:
from DEC import DEC , plot_animated_history

In [None]:
from keras.datasets import mnist
# Output directory name; later cells pass it as `save_dir` and prefix it to
# the file names of the saved animations and pickles.
folder_name = 'results_with_uniform_glorot' # for saving the results

In [None]:
# Load MNIST and unpack it into a train pair and a test pair.
# Only the x arrays are used by the later cells of this notebook.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [None]:
# Flatten every image into a single feature vector (one row per sample).
# The sample count comes from the array itself and the feature count is
# inferred with -1, rather than hard-coding the 60000/10000 split sizes and
# the 28*28 image size.
# NOTE(review): pixel values keep their original dtype and range here --
# confirm whether DEC expects inputs scaled to [0, 1].
x_train_r = x_train.reshape(len(x_train), -1)
x_test_r = x_test.reshape(len(x_test), -1)
x_train_r.shape

In [None]:
# Weight initialiser handed to DEC. Alternatives kept for reference:
#init = VarianceScaling(scale=1. / 3, mode='fan_in', distribution='uniform')
#init = RandomNormal(mean=0.0, stddev=0.05, seed=None)
init = 'glorot_uniform'

# Layer sizes go from the 784 flattened pixels down to 3; the three columns
# of the encoder output are what gets plotted below.
dec = DEC(dims=[784, 700, 500, 200, 3], n_clusters=10, init=init)

# Encoder output for the training set, taken before any training call in
# this notebook, as a baseline picture of the embedding.
Encoded = dec.encoder.predict(x_train_r)

fig = pyplot.figure(figsize=(10,10))
# Create the 3-D axes through the figure. Instantiating `Axes3D(fig)`
# directly no longer attaches the axes to the figure on current Matplotlib
# releases, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Encoded[:,0], Encoded[:,1], Encoded[:,2], s=0.05, c = 'b', marker='o')

In [None]:
# Create the results directory if needed. `exist_ok=True` makes this a
# single call with no gap between checking and creating.
os.makedirs(folder_name, exist_ok=True)

# Pretraining step. The validation pair uses the test images as both input
# and target; `y=None` means no labels are passed for the training data.
dec.pretrain(x=x_train_r,
             y=None,
             validation_data=(x_test_r, x_test_r),
             #validation_split=0.1,
             optimizer='adam',
             epochs=700,
             batch_size=2048,
             save_dir=folder_name)

In [None]:
# Animate a copy of the history stored on `dec.encoders_predictions_history`
# (presumably filled during pretraining -- confirm in DEC) in 3-D and save
# the result inside the results folder.
plot_animated_history(
    dec.encoders_predictions_history[:],
    saving_name=folder_name+'/pretraining.mp4',
    threeD=True,
)

In [None]:
# Clustering phase: compile with SGD (positional arguments 0.01 and 0.9)
# and the 'kld' loss.
dec.compile(optimizer=SGD(0.01, 0.9), loss='kld')

# Fit on the flattened MNIST matrix the model was built for (784 inputs) and
# pretrained on. The name `unique_data` is not defined anywhere in this
# notebook, so using it here fails with a NameError on a fresh kernel.
y_pred = dec.fit(x=x_train_r, y=None, tol=0.01, maxiter=2e4, batch_size=2048,
                 update_interval=200, save_dir=folder_name)

In [None]:
# Animate a copy of the history stored on `dec.encoders_clustering_history`
# (presumably filled during the clustering fit -- confirm in DEC) in 3-D and
# save the result inside the results folder.
plot_animated_history(
    dec.encoders_clustering_history[:],
    saving_name=folder_name+'/clustering.mp4',
    threeD=True,
)

In [None]:
# Embed the samples and get the per-cluster outputs of the clustering model.
# The flattened MNIST training matrix is used because it is the data the
# model was built for (784 inputs); `unique_data` is not defined anywhere in
# this notebook and raises a NameError on a fresh kernel.
Encoded = dec.encoder.predict(x_train_r)
Clusters_p = dec.model.predict(x_train_r)

# Hard cluster id per sample: the column holding the largest value.
cluster_ids = Clusters_p.argmax(axis=1)

# One palette colour per cluster, expanded to one RGB row per sample.
clrs = sns.color_palette("hls", dec.n_clusters)
point_colors = np.asarray(clrs)[cluster_ids]

fig, ax = plt.subplots(1,figsize=(15,15))
# A single vectorised scatter call draws every sample at once; issuing one
# scatter call per sample creates one artist per point and is extremely slow.
# Only embedding columns 1 (x axis) and 0 (y axis) are shown in this 2-D view.
ax.scatter(Encoded[:, 1], Encoded[:, 0], s=2, c=point_colors);

In [None]:
# NOTE(review): hard-coded absolute path -- consider a configurable data dir.
loc = r'/home/alireza/Clustering/'

# Read the account labels and keep the first 805 rows. The explicit copy
# makes the in-place renaming below operate on an independent frame rather
# than on a slice of a temporary.
# NOTE(review): 805 presumably equals the number of columns of `unique_data`
# -- confirm.
labeled_accounts = pd.read_csv(os.path.join(loc, "labeled_accounts.csv"),
                               sep=';', encoding='cp437')[:805].copy()

# Make the 'TXT50' labels unique so they can serve as an index.
# For a label that occurs n times the last occurrence keeps its name, the
# first gets '_DUPLICATE', and any further ones get '_DUPLICATE_2',
# '_DUPLICATE_3', ... Renaming only the first occurrence would leave labels
# that occur three or more times duplicated.
unique_ones, counts = np.unique(labeled_accounts['TXT50'].values, return_counts=True)
for acc in unique_ones[counts != 1]:
    # Row positions are computed once, before any of these rows is renamed.
    # They double as `.loc` labels because read_csv produced a default
    # 0..n-1 index.
    rows = np.where(labeled_accounts['TXT50'].values == acc)[0]
    for k, row in enumerate(rows[:-1]):
        suffix = '_DUPLICATE' if k == 0 else '_DUPLICATE_%d' % (k + 1)
        labeled_accounts.loc[row, 'TXT50'] = acc + suffix

account_labels = labeled_accounts['TXT50'].values.tolist()
# Fail loudly if a generated name collides with an existing label; a
# non-unique index would make later scalar lookups return a Series.
assert len(set(account_labels)) == len(account_labels), \
    "account labels must be unique after renaming duplicates"

# Per-account frequency over the whole dataset: column sums of `unique_data`,
# indexed by account label.
# NOTE(review): `unique_data` is not defined in the visible notebook cells;
# it is used here as a samples x accounts matrix -- confirm where it is built.
Acc_frq_in_Dset = pd.DataFrame(np.sum(unique_data, axis=0),
                               index=account_labels,
                               columns=['In_dataset_frequency'])
Acc_frq_in_Dset.index.name = 'Accounts_label'

In [None]:
# Assign every sample to the cluster with the largest model output and record
# which accounts are active (value == 1) in that sample.
Clusters_p = dec.model.predict(unique_data)
hard_labels = Clusters_p.argmax(axis=1)

clusters_list = []
for i, cluster_id in enumerate(hard_labels):
    # Positions of the active accounts in sample i, used as row labels of
    # `labeled_accounts`.
    active_rows = np.where(unique_data[i] == 1)[0]
    active_accounts = labeled_accounts.loc[active_rows, 'TXT50'].values
    clusters_list.append([cluster_id, active_accounts, len(active_accounts)])

# One row per sample: cluster id, array of active labels, and their count.
clusters_df = pd.DataFrame(
    clusters_list,
    columns=['Cluster_id', 'Active_acounts_label', 'N_active_accounts'],
)

In [None]:
# Column order of every per-cluster summary frame.
cluster_info_columns = ['Unique_accounts',
                        'In_cluster_frequency',
                        'In_dataset_frequency',
                        'In_cluster_frequency/cluster_size',
                        'In_cluster_frequency/In_dataset_frequency']

# For each cluster, summarise which accounts occur in its samples and how
# often, relative to the cluster size and to the whole dataset.
all_clusters_info = []
for cluster_id in range(dec.n_clusters):
    Cluster = clusters_df.loc[clusters_df['Cluster_id'] == cluster_id]
    Cluster_size = len(Cluster)

    # Flatten the per-sample label arrays into one list for this cluster.
    All_acc_in_clst = [acc
                       for accounts in Cluster['Active_acounts_label'].values
                       for acc in accounts]
    In_cluster_unique_accs, In_clst_frq = np.unique(All_acc_in_clst, return_counts=True)

    # Dataset-wide frequency of each account that appears in this cluster.
    In_Dset_frq = np.asarray([Acc_frq_in_Dset.at[account, 'In_dataset_frequency']
                              for account in In_cluster_unique_accs])

    # Build the frame column by column so each column keeps its own dtype.
    # Stacking the label array together with the numeric arrays in a single
    # ndarray would coerce every value to a string, and the frequency and
    # ratio columns would then be stored and sorted as text.
    Cluster_info_df = pd.DataFrame(
        {
            'Unique_accounts': In_cluster_unique_accs,
            'In_cluster_frequency': In_clst_frq,
            'In_dataset_frequency': In_Dset_frq,
            'In_cluster_frequency/cluster_size':
                np.round(In_clst_frq / Cluster_size, decimals=1),
            'In_cluster_frequency/In_dataset_frequency':
                np.round(In_clst_frq / In_Dset_frq, decimals=1),
        },
        columns=cluster_info_columns,
    )
    all_clusters_info.append([Cluster_info_df, Cluster_size])

# One row per cluster id: the summary frame and the number of samples.
all_Clusters_info_df = pd.DataFrame(all_clusters_info, columns=['Cluster_info_DataFrame','Cluster_size'])
all_Clusters_info_df.index.name = 'Cluster_id'

In [None]:
# Display the table `all_Clusters_info_df` as the cell output.
all_Clusters_info_df

In [None]:
# Show the account summary of cluster 8, largest share of the cluster first
# and ties broken by the share of the account's dataset-wide occurrences.
(
    all_Clusters_info_df
    .at[8, 'Cluster_info_DataFrame']
    .sort_values(
        by=['In_cluster_frequency/cluster_size',
            'In_cluster_frequency/In_dataset_frequency'],
        ascending=False,
    )
)

In [None]:
# Persist the cluster summary and the DEC object in the results folder.
# `with` closes each file as soon as its dump finishes; handles opened
# inline in the call are never closed explicitly.
# NOTE(review): pickle files execute arbitrary code when loaded -- only load
# these from trusted locations.
with open(folder_name+'/Clusters_df.p', "wb") as clusters_file:
    pickle.dump(all_Clusters_info_df, clusters_file, protocol=4)

with open(folder_name+'/dec_model.p', "wb") as model_file:
    pickle.dump(dec, model_file, protocol=4)