In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
from collections import deque
import seaborn as sns
from scipy.stats import entropy
from tqdm import tqdm
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import chart_studio.plotly as py
import plotly.graph_objects as go
from timeit import default_timer as timer
import datetime
import sys, os, shutil, gc
from csv import DictWriter

In [None]:
# ''' Version of the packages in this work.'''
# # print("Pandas version: ",pd.__version__)
# print("Pandas version: 0.25.1")
# # print("Numpy version: ",np.__version__)
# print("Numpy version: 1.16.4")
# # print("Sys python version: ",sys.version)
# print("Sys python version: 3.7.1 | package by conda-forge [MSC v.1900 64 bit (AMD64)]")
# print("IPython: 7.8.0")
# print("IPython genutils: 0.2.0")

###  Header

In [None]:
#toy example dataset
# input_test2.dat and input_test2.dat

#synthetic datasets
# synthetic-1_fimi.dat, synthetic-2_fimi.dat and synthetic-3_fimi.dat

experiment = 'Toy' # Syn: run the synthetic data analysis; Real: run the real data analysis; 
                   # Toy: run the toy example.
toy_number = "2" # sufix of a given toy dataset [1,1-1,2]

find_overlap = False # True: find overlapped co-clusters; False: Find no overlapped co-clusters
VERBOSE = True # True: print results as a verbose mode; False: print the main results
num_of_sim = 1 # number of simulations to perform
k = -1 #max number of co-cluster that could be found. -1: default [driven by cost function]
e_obj = 0.4 # maximum error tolerance for object. -1: default [accept the maximum error]
e_att = .8 # maximum error tolerance for attribute. -1: default [accept the maximum error]

if VERBOSE: print('--->Verbose mode ON.<---')

print("Executing TRACOCLUS method")
if experiment == 'Syn':
    path = "./data/synthetic/fimi/"
    syn_datasets = [path+"synthetic-1_fimi.dat",path+"synthetic-2_fimi.dat",path+"synthetic-3_fimi.dat"]
    path_method = "OutputAnalysis\ococlus"
    check_path(path_method)

    for ds in range(len(syn_datasets)):
        ds_name = "Syn-"+str(ds+1)
        print("\nDataset: "+ds_name)
        res = os.mkdir(path_method+"\\"+ds_name)

        for run in range(num_of_sim):
            print("Run-"+str(run+1))

            df_fimi = pd.read_csv(syn_datasets[ds], header=None, names=["transation"])
            D,co_clusters = OCoClus(df_fimi,k,e_obj,e_att) # Calling OCoClus main method

            print("")
            Rec_error(D,co_clusters)
#             print(co_clusters)
            
            omega_format = build_clustering_output_omega(co_clusters)
            OCoClus_clustering_xm = xmeasures_format(omega_format)# save to XMEASURES format C++ version
            df_gt = pd.DataFrame(OCoClus_clustering_xm)
            path = path_method+"/"+ds_name
            df_gt.to_csv(path.replace("\\","/")+"/run_"+str(run+1)+"_res_ococlus_"+ds_name+"_co.cnl", 
                         header= False,index=False, encoding='utf8')
            del omega_format, df_gt, OCoClus_clustering_xm
            gc.collect()
elif experiment == 'Real':
    k = 10
    print('Real data clustering.')
    path = "./data/real_application/"
    real_datasets = [path+"cal500_fimi.dat",path+"covid19_fimi.dat"]
    path_method = "OutputAnalysis\ococlus"
    check_path(path_method)
    
    for ds in range(len(real_datasets)):
        ds_name = real_datasets[ds].replace("/","_").split("_")[4].capitalize()
        print("\nDataset: "+ds_name)
        res = os.mkdir(path_method+"\\"+ds_name)
        
        df_fimi = pd.read_csv(real_datasets[ds], header=None, names=["transation"])
        D,co_clusters = OCoClus(df_fimi,k,e_obj,e_att) # Calling OCoClus main method
        writeFileOutput(co_clusters,ds_name,method='OCoClus',fileName='OCoClusResult_'+ds_name)
        
    print('DONE!')
elif experiment == 'Toy':
    print('Toy example')
    VERBOSE = False
#     input_data_pd = pd.read_csv("./data/toy_example/toy"+toy_number+"_traj.dat", header=None, names=["transation"])
#     input_data_pd = pd.read_csv("./data/toy_example/toy"+toy_number+"_traj.dat", header=None, names=["transation"])
    input_data_pd = pd.read_csv("./data/real_application/foursquare_NY/fs_ny_week_sequences.dat", header=None, names=["transation"])
#     print(input_data_pd)
    D,co_clusters = TRACOCLUS(input_data_pd,k,e_obj,e_att)
    print(co_clusters)
    #Compute the measures
#     print("")
#     Rec_error(D,co_clusters)
else:
    print('ERROR! Choose a valid option for the experiment analysis.')

# TraCoClus algorithm

This is the main algorithm of TraCoClus.

In [None]:
er = '0-3-1-4-0-1'
er2 = er.split('-')
print(er2)
# print(er2.nunique())
def change_vect_cont(queue):
    queue[0] = 'Q'

In [None]:
my_vect = [1,2,8,4]
change_vect_cont(my_vect)
print(my_vect)

In [None]:
import seaborn as sns
# %matplotlib inline
import matplotlib.pyplot as plt

In [None]:
max_tested_atts = np.log2(10)
vec_log_num = [1,10,20,30,40,50,60,70,80,90,100]#,250,500]#,750,1000]#,1500,2000,3000,5000,7500,10000]
base = [2,10]
df_log_base_diff = pd.DataFrame(columns = ['Input x','Log Result','Log Base'])
popupalte_df_dict = {'Input x':0,'Log Result':0,'Log Base':''}

for base_i in base:
    popupalte_df_dict['Log Base'] = 'Log_'+str(base_i)
    for num in vec_log_num:
        popupalte_df_dict['Input x'] = num
#         if base_i == 2:
#         popupalte_df_dict['Log Result'] = np.log2(num)    
        popupalte_df_dict['Log Result'] = np.log(num)/np.log(base_i)
#         else:
#             popupalte_df_dict['Log Result'] = np.log10(num)
        df_log_base_diff = df_log_base_diff.append(popupalte_df_dict, ignore_index=True)
df_log_base_diff.head()
sns.lineplot(data=df_log_base_diff, x="Input x", y="Log Result", hue="Log Base", style="Log Base",
             markers=True, dashes=False)
# plt.show()
# print(max_tested_atts)

In [None]:
a = {'a':1,'b':2,'c':3,'d':4}
list_values = a.values()
print(np.mean(list(list_values)))

In [36]:
VERBOSE = False
k = -1 #max number of co-cluster that could be found. -1: default [driven by cost function]
e_obj = 0 # maximum error tolerance for object. -1: default [accept the maximum error]
e_att = 0

cc_prune_ref = 'rows' # 1. 'rows'; 2. 'cost'; 3. 'combine'
cc_ref_list = ['rows','cost','combine']
cc_type_process = 'sample' # 1. incremental [evaluate the candidate during the process]; 2. sample [consider the set of candidates]
cc_type_analysis = 'z_score' # # 1. mean; 2. z_score
metric_stat_list = ['mean','z_score']
#Rows and cost are inverse among them.
#Rows grow positively while cost negatively.
cc_z_threshold = .1 # general z_thres; the rows are the reference, i.e., rows_z = -1 and cost_z = 1 (other side)
cc_z_threshold_r = 3 #e.g., -1, we want the right-side, so we keep values equal or bigger than 1 sd bellow the avg
cc_z_threshold_c = -3 #e.g., 1, we want the left-side, so we keep values equal or smaller than 1 sd above the avg
# cc_z_threshold
# z_thres_list = [-1,0,1]
z_thres_list = [-.5,0,.5]

# input_data_pd = pd.read_csv("./data/toy_example/toy2_traj.dat", header=None, names=["transation"])
# input_data_pd = pd.read_csv("./data/real_application/foursquare_NY/fs_ny_week_sequences.dat", header=None, names=["transation"])
# input_data_pd = pd.read_csv("./data/real_application/foursquare_NY/preprocessed/fs_ny_top_10_week_sequences.dat", header=None, names=["transation"])

# input_data_pd = './data/toy_example/toy2_traj.dat'
# input_data_pd = './data/real_application/foursquare_NY/fs_ny_week_sequences.dat'
# input_data_pd = './data/real_application/foursquare_NY/preprocessed/fs_ny_top_10_week_sequences.dat'
# path = './data/real_application/foursquare_NY/preprocessed/'
datasets = ['fs_ny_top_users_10.dat','fs_ny_top_users_81.dat','fs_ny_top_users_193.dat','sjgs.dat','sjgs2.dat']
SCABILITY = False
STORE_CLUS_STATS = False
element_analysis = False # True: get elements with freq >= AVG ; False: consider all number of elements
# scability_els_list = [13,28,43,58,73,88,93,108,123,138]
# scability_els_list_rstart = [138]
# scability_els_list = [58]
# scability_els_list = [88]
scability_els_list = [93]
clustering_perf = Performance()
# D,co_clusters = TRACOCLUS(input_data_pd,cc_prune_ref,k,e_obj,e_att)
# for i in range(1,len(datasets)):

# for i in range(1,3):#datasets
for i in range(4,5):# reiniciar do dataset ...
#     for run_sim in range(5):#run simulations
    for run_sim in range(0,1):# reiniciar da run simulations #....
        print('\nRun simulation: '+str(run_sim+1))
        for num_of_els in scability_els_list:
#         for num_of_els in scability_els_list_rstart:# reiniciar a partir do element ...
            print(datasets[i])
        #     for cc_prune_ref in cc_ref_list:
            D,co_clusters = TRACOCLUS(datasets[i],cc_prune_ref,k,e_obj,e_att)
#     clustering_perf.plot_scability_test('scability_num_of_els_users_d'+str(i))
#     clustering_perf.plot_scability_rows_cost('scability_num_of_els_users_d'+str(i))
#     for j in range(1,len(cc_ref_list)):
#         cc_prune_ref = cc_ref_list[j]
#         if cc_type_analysis == 'mean':
#             D,co_clusters = TRACOCLUS(datasets[i],cc_prune_ref,k,e_obj,e_att)
#         else:
#             for cc_z_threshold in z_thres_list:
#                 D,co_clusters = TRACOCLUS(datasets[i],cc_prune_ref,k,e_obj,e_att)
# TRACOCLUS(input_data_pd,k,e_obj,e_att)
# print('\nNumber of found co-clusters: ',len(co_clusters))
# print('Final co-clusters: ',co_clusters)


Run simulation: 1
sjgs2.dat
['sjgs2', 'dat']
######################################
Number of trajectories: 3189
Number of unique check-ins: 77
########################################
Original number of elements:  77
Element analysis: False. We consider the original number of elements:  77
Tirando a duvida sobre o tamanho da lista de elementos:  77
Searching for candidate: 1408
Total clustering time in minutes:  8.08
Total clustering time:  0h:8m:4s (9/12/2021 - 23:53:16)

### Clustering statistics ###
Number of co-clusters:  24
AVG rows:68.50±19.65[CV:28.68%], AVG cost:-51.96±24.60[CV:-47.34%]
AVG sequece length:2.00±0.00[CV:0.00%]
Number of unique elements grouped: 18


In [96]:
# clustering_perf.df_scability.head()
# clustering_perf.df_scab_rows_cost.head()
# clustering_perf.df_scab_rows_cost.info()
# df = pd.read_csv('coclustering_file_outputs/df_scability.csv', sep=";")
# df = pd.read_csv('coclustering_file_outputs/df_scab_rows_cost.csv', sep=";")
# df = pd.read_csv('coclustering_file_outputs/df_clustering_stats.csv', encoding='utf8' , sep=';')
# df

### Splice-junction gene sequence

k:4 ; z:0

In [57]:
# clustering_perf.test_norm_dist()
# clustering_perf.test_skewness()
# df = pd.read_csv('data/real_application/foursquare_NY/fs_ny_top_users_10.csv', sep=";")
# clustering_perf.analysis_entropy_purity(create_df_map_traj_user(df))
clustering_perf.get_clusters()
clustering_perf.create_alluvial_diagram()
# clustering_perf.show_boxplot()
# clustering_perf.plot_cost()

Cluster #1 - Candidate 0 [Absolute:2152 | Relative:0.67 | Cost: -4301 | Ov_coef: 0.00 | Seq length: 3]
Attributes sequence "G-C-T" and trajectories "'1779', '2606', '587', '240', '1392', '1149', '753', '2516',[...]".

Cluster #2 - Candidate 2 [Absolute:1987 | Relative:0.62 | Cost: -2733 | Ov_coef: 0.09 | Seq length: 3]
Attributes sequence "T-C-A" and trajectories "'1187', '680', '1656', '1392', '2505', '2916', '2752', '1149',[...]".

Cluster #3 - Candidate 3 [Absolute:1720 | Relative:0.54 | Cost: -1780 | Ov_coef: 0.10 | Seq length: 3]
Attributes sequence "A-G-T" and trajectories "'2606', '680', '587', '2916', '2505', '2752', '2879', '1149',[...]".

Cluster #4 - Candidate 4 [Absolute:1906 | Relative:0.60 | Cost: -1070 | Ov_coef: 0.11 | Seq length: 3]
Attributes sequence "A-T-G" and trajectories "'1779', '1187', '680', '240', '1392', '2505', '1656', '2916',[...]".

Trajectory "50" belongs to User: "EI"
Max sequence length: 3
Levels: ['lvl1', 'lvl2', 'lvl3', 'lvl4', 'lvl5']
Columns alluvi

<Figure size 900x600 with 0 Axes>

### Splice-junction gene sequence

k:3 ; z:0.5

In [59]:
clustering_perf.get_clusters()
clustering_perf.create_alluvial_diagram()

Cluster #1 - Candidate 0 [Absolute:2152 | Relative:0.67 | Cost: -4301 | Ov_coef: 0.00 | Seq length: 3]
Attributes sequence "G-C-T" and trajectories "'1779', '2606', '587', '240', '1392', '1149', '753', '2516',[...]".

Cluster #2 - Candidate 2 [Absolute:1987 | Relative:0.62 | Cost: -2733 | Ov_coef: 0.09 | Seq length: 3]
Attributes sequence "T-C-A" and trajectories "'1187', '680', '1656', '1392', '2505', '2916', '2752', '1149',[...]".

Cluster #3 - Candidate 4 [Absolute:1906 | Relative:0.60 | Cost: -1070 | Ov_coef: 0.11 | Seq length: 3]
Attributes sequence "A-T-G" and trajectories "'1779', '1187', '680', '240', '1392', '2505', '1656', '2916',[...]".

Trajectory "50" belongs to User: "EI"
Max sequence length: 3
Levels: ['lvl1', 'lvl2', 'lvl3', 'lvl4', 'lvl5']
Columns alluvial df: ['lvl1', 'lvl2', 'lvl3', 'lvl4', 'lvl5', 'count', 'cluster']


<Figure size 900x600 with 0 Axes>

### Splice-junction gene sequence

k:6 ; z:-3

In [67]:
clustering_perf.get_clusters()
clustering_perf.create_alluvial_diagram()

Cluster #1 - Candidate 0 [Absolute:2152 | Relative:0.67 | Cost: -4301 | Ov_coef: 0.00 | Seq length: 3]
Attributes sequence "G-C-T" and trajectories "'1779', '2606', '587', '240', '1392', '1149', '753', '2516',[...]".

Cluster #2 - Candidate 1 [Absolute:847 | Relative:0.27 | Cost: -1286 | Ov_coef: 0.09 | Seq length: 3]
Attributes sequence "A-C-G" and trajectories "'2474', '2384', '2606', '2916', '1392', '2505', '1000', '753',[...]".

Cluster #3 - Candidate 2 [Absolute:1987 | Relative:0.62 | Cost: -2733 | Ov_coef: 0.09 | Seq length: 3]
Attributes sequence "T-C-A" and trajectories "'1187', '680', '1656', '1392', '2505', '2916', '2752', '1149',[...]".

Cluster #4 - Candidate 3 [Absolute:1720 | Relative:0.54 | Cost: -1780 | Ov_coef: 0.10 | Seq length: 3]
Attributes sequence "A-G-T" and trajectories "'2606', '680', '587', '2916', '2505', '2752', '2879', '1149',[...]".

Cluster #5 - Candidate 4 [Absolute:1906 | Relative:0.60 | Cost: -1070 | Ov_coef: 0.11 | Seq length: 3]
Attributes sequence "

<Figure size 900x600 with 0 Axes>

### Splice-junction gene sequence 2

k:177 ; z:1

In [32]:
clustering_perf.get_clusters()
clustering_perf.create_alluvial_diagram()

Cluster #1 - Candidate 0 [Absolute:119 | Relative:0.04 | Cost: -117 | Ov_coef: 0.00 | Seq length: 2]
Attributes sequence "CTG-CAG" and trajectories "'1751', '978', '986', '1392', '987', '2650', '1387', '1251',[...]".

Cluster #2 - Candidate 1 [Absolute:71 | Relative:0.02 | Cost: -62 | Ov_coef: 0.04 | Seq length: 2]
Attributes sequence "CAG-CTG" and trajectories "'1751', '1580', '2983', '2779', '2695', '1421', '2672', '863',[...]".

Cluster #3 - Candidate 2 [Absolute:93 | Relative:0.03 | Cost: -86 | Ov_coef: 0.02 | Seq length: 2]
Attributes sequence "CCC-CAG" and trajectories "'3002', '978', '2983', '725', '987', '1108', '1388', '1279',[...]".

Cluster #4 - Candidate 3 [Absolute:63 | Relative:0.02 | Cost: -59 | Ov_coef: 0.01 | Seq length: 2]
Attributes sequence "CCT-CAG" and trajectories "'1585', '3126', '895', '153', '2713', '956', '892', '2227',[...]".

Cluster #5 - Candidate 4 [Absolute:133 | Relative:0.04 | Cost: -115 | Ov_coef: 0.04 | Seq length: 2]
Attributes sequence "CAG-GTG" an

<Figure size 900x600 with 0 Axes>

### Splice-junction gene sequence 2

k:68 ; z:2

In [34]:
clustering_perf.get_clusters()
clustering_perf.create_alluvial_diagram()

Cluster #1 - Candidate 0 [Absolute:119 | Relative:0.04 | Cost: -117 | Ov_coef: 0.00 | Seq length: 2]
Attributes sequence "CTG-CAG" and trajectories "'1751', '978', '986', '1392', '987', '2650', '1387', '1251',[...]".

Cluster #2 - Candidate 1 [Absolute:71 | Relative:0.02 | Cost: -62 | Ov_coef: 0.04 | Seq length: 2]
Attributes sequence "CAG-CTG" and trajectories "'1751', '1580', '2983', '2779', '2695', '1421', '2672', '863',[...]".

Cluster #3 - Candidate 2 [Absolute:93 | Relative:0.03 | Cost: -86 | Ov_coef: 0.02 | Seq length: 2]
Attributes sequence "CCC-CAG" and trajectories "'3002', '978', '2983', '725', '987', '1108', '1388', '1279',[...]".

Cluster #4 - Candidate 3 [Absolute:63 | Relative:0.02 | Cost: -59 | Ov_coef: 0.01 | Seq length: 2]
Attributes sequence "CCT-CAG" and trajectories "'1585', '3126', '895', '153', '2713', '956', '892', '2227',[...]".

Cluster #5 - Candidate 4 [Absolute:133 | Relative:0.04 | Cost: -115 | Ov_coef: 0.04 | Seq length: 2]
Attributes sequence "CAG-GTG" an

<Figure size 900x600 with 0 Axes>

### Splice-junction gene sequence 2

k:24 ; z:3

In [37]:
clustering_perf.get_clusters()
clustering_perf.create_alluvial_diagram()

Cluster #1 - Candidate 0 [Absolute:119 | Relative:0.04 | Cost: -117 | Ov_coef: 0.00 | Seq length: 2]
Attributes sequence "CTG-CAG" and trajectories "'1751', '978', '986', '1392', '987', '2650', '1387', '1251',[...]".

Cluster #2 - Candidate 1 [Absolute:71 | Relative:0.02 | Cost: -62 | Ov_coef: 0.04 | Seq length: 2]
Attributes sequence "CAG-CTG" and trajectories "'1751', '1580', '2983', '2779', '2695', '1421', '2672', '863',[...]".

Cluster #3 - Candidate 2 [Absolute:93 | Relative:0.03 | Cost: -86 | Ov_coef: 0.02 | Seq length: 2]
Attributes sequence "CCC-CAG" and trajectories "'3002', '978', '2983', '725', '987', '1108', '1388', '1279',[...]".

Cluster #4 - Candidate 3 [Absolute:63 | Relative:0.02 | Cost: -59 | Ov_coef: 0.01 | Seq length: 2]
Attributes sequence "CCT-CAG" and trajectories "'1585', '3126', '895', '153', '2713', '956', '892', '2227',[...]".

Cluster #5 - Candidate 4 [Absolute:133 | Relative:0.04 | Cost: -115 | Ov_coef: 0.04 | Seq length: 2]
Attributes sequence "CAG-GTG" an

In [469]:
import scipy.stats as stats
y = [1.90642, 2.22488, 2.10288, 1.69742, 1.52229, 3.15435, 2.61826, 1.98492, 1.42738, 1.99568]
shapiro_stat, shapiro_p_valor = stats.shapiro(y)
print('O valor da estatística de shapiro-wilk = '+str(shapiro_stat))
print('O valor do p-value de shapiro-wilk = '+str(shapiro_p_valor))
mean = np.mean(y)
std = np.std(y,ddof=1)
ks_stat, ks_p_value = stats.kstest(y,cdf='norm',args=(mean,std), N=len(y))
print(ks_stat)
print(ks_p_value)
# a = [-4,-3,-2,-1,0,1,2,3,4]
# print(np.mean(a),np.std(a))
# print('Número de desvios padrões (Z-score): ',(1.5-np.mean(a)/np.std(a)))

O valor da estatística de shapiro-wilk = 0.9266944527626038
O valor do p-value de shapiro-wilk = 0.4161774218082428
0.17709753067016487
0.9123891112746063


In [2]:
def populate_queue(poi_freq_dict):
#     print([{k:v} for k,v in poi_freq_dict.items()])
    queue = deque()
    [queue.append([k,v]) for k,v in poi_freq_dict.items()]
    return queue  

In [3]:
def check_sequence(trajectory_dataset_dict_list, candidate_trajectories_sequence_set, test_traj_sequence):
    '''This method receive 3 parameters: 
       1) trajectory dataset as a dict->list | x['key']:[...];
       2) trajectories indeces that contains the tested check-ins as a set;
       3) the given tested sequence of check-ins as a string
    '''
    new_set_trajectories = set()
    position_pois_per_traj_list = {}
#     test_traj_sequence = test_traj_sequence.strip()
    # we will test if the test_traj_sequence exist in the candidate_trajectories that contains the elements.
    for traj_id in candidate_trajectories_sequence_set:

        try:
#             traj_dataset = '-'.join(trajectory_dataset_dict_list[traj_id]).strip()
            test_subsequence, positions_at_traj = is_subsequence(trajectory_dataset_dict_list[traj_id],test_traj_sequence.split('-'))
            if test_subsequence:
#                 print('OK->',end=' ')
#                 print(traj_id,positions_at_traj)
                new_set_trajectories.add(traj_id)
                position_pois_per_traj_list[traj_id] = positions_at_traj
        except Exception as error:
            print(error)
#         print('{}-> {}'.format(traj_id,trajectory_dataset_dict_list[traj_id]))
#     print('Sequence "{}" is present in trajectories: {}'.format(test_traj_sequence,new_set_trajectories))
    return new_set_trajectories, position_pois_per_traj_list

In [4]:
def is_subsequence(sequence, subsequence):
    '''This sub method receive two arrays: 
       first one is the sequence and second one is the tested subsequence.'''
    n = len(sequence)
    m = len(subsequence)
    position_poi_sequence = []
    
    # Two pointers to traverse the arrays
    i = 0; j = 0;
 
    # Traverse both arrays simultaneously
    while (i < n and j < m):
 
        # If element matches
        # increment both pointers
        if (sequence[i] == subsequence[j]):
            position_poi_sequence.append(str(i))
            i += 1
            j += 1
 
            # If subsequence is completely
            # traversed
            if (j == m):
                return True, position_poi_sequence
         
        # If not,
        # increment i and reset j
        else:
            position_poi_sequence = []
            i = i - j + 1
            j = 0
         
    return False,None

In [None]:
test_dataset = {'0':['1','4','6','1','10'],'1':['3','6','7'],'2':['7','9','5'],'3':['4','6','1','10']
                ,'4':['9','5']}
test_candidate = set(['0','1','2','3','4'])
test_sequence = '1-10'
mySet, myPos = check_sequence(test_dataset,test_candidate,test_sequence)
print([rows+test_sequence.split('-')[poi_id]+poi_pos for rows in mySet 
 for poi_id in range(len(test_sequence.split('-'))) for poi_pos in myPos[rows][poi_id]])
form_elements(mySet,test_sequence,myPos)

In [5]:
def form_elements(trajs_index_set, tested_sequence, poi_positions_trajectories_dict_list):
    '''
    This method returns a set of elements.
    Each element is formed by the traj ID, poi ID in the sequence and its respective position at traj ID.
    Ex: set(['013', '0104', '312', '3103'])
        '013': 0-> traj ID, 1-> poi ID, and 3-> position of poi ID at traj ID
    '''
    tested_sequence = tested_sequence.split('-')
    return set([trajID+tested_sequence[poi_id]+poi_pos for trajID in trajs_index_set 
                for poi_id in range(len(tested_sequence)) 
                for poi_pos in poi_positions_trajectories_dict_list[trajID][poi_id]])

In [None]:
clusA = set([1,2,3,4,5])
dict_cc = {'0': {'cc_elements':set([5,6,7,8,9])},'1': {'cc_elements':set([4,5,6,7,8])},
           '2': {'cc_elements':set([5,4,3,2,1])}}
overlap_coefficient(clusA,dict_cc)
# print(overlap_coefficient(clusA,dict_cc))

In [6]:
def overlap_coefficient(clusterA, discovered_cc):
    max_overlap = 0
    
    for key, value in discovered_cc.items():
#         print(key,value)
        elements_intersection = len(clusterA.intersection(discovered_cc[key]['cc_elements']))
#         print(elements_intersection)
        curr_overlap = elements_intersection/np.min([len(clusterA),len(discovered_cc[key]['cc_elements'])])
#         print(curr_overlap)
        
        if curr_overlap > max_overlap:
            max_overlap = curr_overlap
            
    return max_overlap

In [None]:
t = {'c1':10,'c2':40,'c3':20,'c4':10,'c5':5,'c6':30,'c7':15,'c8':30,'c9':40,'c10':38}
t_values = list(t.values())
print(t_values,type(t_values))
t_mean = np.mean(t_values)
print(t_mean)
# looking = True
print('Before: ',t)
# while looking:
tmp = {}
for key, value in t.items():
    if value > t_mean:
        tmp.update({key:value})
#         looking = True
#         break
#     looking = False
t = tmp
print('After: ',t)

In [21]:
def TRACOCLUS(input_data, avg_cc_analysis, k=-1, e_obj=-1, e_att=-1):
#     input_D = pd.read_csv(input_data, header=None, names=["transation"])
    
    input_D = ''
    split = input_data.split('.')
    print(split)
    if (split[0] != 'sjgs') and (split[0] != 'sjgs2') and (split[0] != 'splice_data'):
        if split[-1] == 'dat':
            path = './data/real_application/foursquare_NY/preprocessed/'
            input_D = pd.read_csv(path+input_data, header=None, names=["transation"])
        else:
            path = './data/real_application/foursquare_NY/'
            input_D = pd.read_csv(path+input_data, header=None, names=["transation"])
    else:
        if split[-1] == 'dat':
            path = './data/real_application/gene_sequences/SJGS/preprocessed/'
            input_D = pd.read_csv(path+input_data, header=None, names=["transation"])
        else:
            path = './data/real_application/gene_sequences/SJGS/'
            input_D = pd.read_csv(path+input_data, header=None, names=["transation"])
    
    ### variable declaration
    if k == -1:
        k=sys.maxsize
    if e_obj == -1:
        e_obj = 1
    if e_att == -1:
        e_att = 1
    
    cost_model = sys.float_info.max # initial cost function of the model
    num_of_coclusters = 0
    D = []
    final_coclusters = [] # store the attribute and objects clusters. final_coclusters[[C1_att,C1_obj],[Ck_att,Ck_obj]]
    pattern_model = [set(),set()]# Union between the found co-clusters [list of obj,list of att]
    cost_per_cocluster = []# stores the cost to build the cocluster
    history_cost_model = []
    ###
    
#     D,N,data_dict,data_res_dict,map_id_to_attribute = get_data(input_D)
    
    # Gamma: store the found co-clusters
    overlap_coef_threshold = 0.5
    INITIAL_COST = 100.0
    final_coclusters = {}
    final_clustered_elements = set()
    final_coclustering_cost = 0
    coclustering_sizes_remove = [] # stores the cluster to be removed from final_coclustering_size
#     avg_cc_analysis = "combine" # 1. 'index_rows_set'; 2. 'cost_function'; 3. combine
#     avg_cc_analysis = cc_analysis # 1. 'index_rows_set'; 2. 'cost_function'
    final_coclustering_size = {} # stores the clusters and its num of rows
    candidates_ref_values = {}
    final_coclustering_avg_row_size = 0
    total_of_iterations = 0
#     clustering_perf = Performance(sns,plt)
#     clustering_perf = Performance()

    start_1 = timer()
    # Initialize main data structures
    map_id_to_attribute_dict, S_poi_freq_dict, poi_at_trajs_dict_set, trajs_data_dict_list = get_data(input_D)
    time_p1 = timer()-start_1
    clustering_perf.set_variables(len(trajs_data_dict_list))
#     S_poi_freq_dict = sort_attributes(S_poi_freq_dict)
    
    ### select att-values at most elements by log2 of the length of the set
#     i = num_elements_to_test('log2',len(S_poi_freq_dict))
#     print('Limit log2: ',i)
#     for key, value in S_poi_freq_dict.items():
#         if i < 0:
#             break
#         else:
#             max_list_with_log2[key]=value
#             i-=1
    
    stop_scability = False
    if SCABILITY:
        stop_scability = False
        if num_of_els <= len(S_poi_freq_dict):
            print('Number of elements to test in the scability analysis: '+str(num_of_els))
            S_poi_freq_dict = sort_attributes(S_poi_freq_dict)
            max_list_of_elems = {}
            i = 0
            for key, value in S_poi_freq_dict.items():
                if i < num_of_els:
                    max_list_of_elems[key]=value
                    i += 1
                else:
                    break
            S_poi_freq_dict = max_list_of_elems.copy()
            print('Number of the most frequent elements: ',len(S_poi_freq_dict))
        else:
            print('Current dataset is done for scability analisys.')
            stop_scability = True
    else:
        
        print('Original number of elements: ',len(S_poi_freq_dict))
        
        ## select att-values by its frequence that are higher than the average
        #if true it selects just the elements with frequency equal or bigger than the AVG; otherwise use all elements
        if element_analysis:
            max_list_of_elems = {}
            average_freq = np.mean(list(S_poi_freq_dict.values()))
            for key, value in S_poi_freq_dict.items():
                if value > average_freq:
                    max_list_of_elems[key]=value

            S_poi_freq_dict = max_list_of_elems.copy()
            print('Element analysis: True. Number of the most frequent elements: ',len(S_poi_freq_dict))
        else:
            print('Element analysis: False. We consider the original number of elements: ',len(S_poi_freq_dict))
    
    
    print('Tirando a duvida sobre o tamanho da lista de elementos: ',len(S_poi_freq_dict))
    
    start = timer()
    for iter_k in range(k):
        if stop_scability:
            break
#     for iter_k in tqdm(range(len(S_poi_freq_dict)*len(S_poi_freq_dict)), colour='blue', desc='Searching for candidates'):
        print('Searching for candidate: '+str(iter_k+1),end="\r")
#         print('')
        
#         print('S: ',S_poi_freq_dict)
        S_poi_freq_dict = sort_attributes(S_poi_freq_dict)
#         S_poi_freq_dict = sort_attributes(max_list_with_log2)
#         print('Current main list S: ',S_poi_freq_dict)
        S_uppercase_queue_list = populate_queue(S_poi_freq_dict)
        
        ### Initialize the current co-cluster 'cocluster_*' (CC) and candidate co-cluster 'cc_candidate' (CC*)
        cocluster_sequence_str = ''
        cocluster_attributes_list = ''
        cocluster_index_rows_set = set()
        cocluster_elements_set = set()
#         cocluster_cost_function = sys.maxsize
        cocluster_cost_function = INITIAL_COST
        cocluster_max_overlapped_coef = 1
        cc_candidate = {}
#         num_of_attributes = len(s_poi_freq_queue_list)

        clustering_perf.append_result(total_of_iterations,iter_k,final_coclustering_cost)
    
#         num_att_to_test_S = len(S_uppercase_queue_list)
#         while(num_att_to_test_S > 0):
#             num_att_to_test_S -= 1
#         while S_uppercase_queue_list: # loop it while queue is not empty
        limit = num_elements_to_test('length',len(S_uppercase_queue_list))
#         limit = num_elements_to_test('log2',len(S_uppercase_queue_list))
        for iter_elements_freq in range(0,limit):
#         for iter_elements_freq in tqdm(range(limit), colour='blue', desc='Testing element reference'):
            
#             if cocluster_sequence_str == '':
        
            S_poi_node_queue = S_uppercase_queue_list.popleft()
#             head_sequence_str = S_poi_node_queue[0]
#             trajectories_head_sequence_set = poi_at_trajs_dict_set[S_poi_node_queue[0]]
#             tail_sequence_str = S_poi_node_queue[0]
#             trajectories_tail_sequence_set = poi_at_trajs_dict_set[S_poi_node_queue[0]]
            S_uppercase_queue_list.append(S_poi_node_queue)
            s_lowercase_queue_list = S_uppercase_queue_list.copy()
#             sequence_cc = S_poi_node_queue[0]
            sequence_cc = {'cs_sequence_cc': S_poi_node_queue[0],
                           'cs_traj_ids_set_cc': poi_at_trajs_dict_set[S_poi_node_queue[0]],
                           'cs_elements_cc': set(),
                           'clustered_elements': final_clustered_elements}

            num_attributes_to_test_s = len(s_lowercase_queue_list)
            while(num_attributes_to_test_s > 0): # if it completes one loop the process stops

                s_poi_node_queue = s_lowercase_queue_list.popleft()
#                     poi_node_queue = s_poi_freq_queue_list[0]
                #s_lowercase_queue_list.append(s_poi_node_queue)# original: comentado para inserir apenas no update
                cc_candidate = candidate_cocluster(trajs_data_dict_list, poi_at_trajs_dict_set,
                                                   sequence_cc, s_poi_node_queue)

                if ((cc_candidate != None) and (cc_candidate['cost_function'] <= cocluster_cost_function) and 
                    (candidate_deviation(avg_cc_analysis,cc_candidate,final_coclustering_size,
                                         ('pass' if cc_type_process != 'incremental' else cc_type_process))) and 
                    (overlap_coefficient(cc_candidate['elements_set'],final_coclusters) <= overlap_coef_threshold)):
                    
                    over_coef_cc_candidate=overlap_coefficient(cc_candidate['elements_set'],final_coclusters)
#                     print('Current co-cluster CC was improved!')

                    ### update CC
                    cocluster_sequence_str = cc_candidate['sequence_str']
                    cocluster_attributes_list = cc_candidate['sequence_str'].split('-')
                    cocluster_index_rows_set = cc_candidate['index_rows_set'].copy()
                    cocluster_elements_set = cc_candidate['elements_set'].copy()
                    cocluster_cost_function = cc_candidate['cost_function']
                    cocluster_max_overlapped_coef = over_coef_cc_candidate

                    ### update sequence_cc
                    sequence_cc['cs_sequence_cc'] = cocluster_sequence_str
                    sequence_cc['cs_traj_ids_set_cc'] = cocluster_index_rows_set
                    sequence_cc['cs_elements_cc'] = cocluster_elements_set

#                     update_queue_s(cocluster_sequence_str, sequence_cc['cs_sequence_cc'],
#                                    s_lowercase_queue_list, s_poi_node_queue)
                    update_queue_s(cocluster_sequence_str, s_lowercase_queue_list, s_poi_node_queue)

                    num_attributes_to_test_s = len(s_lowercase_queue_list)# reassign the counter to restart


#                     trajectories_head_sequence_set = cocluster_index_rows_set
#                     head_sequence_str = cocluster_sequence_str
#                     trajectories_tail_sequence_set = cocluster_index_rows_set
#                     tail_sequence_str = cocluster_sequence_str


                    total_of_iterations += 1
        
    #                         clustering_perf.append_result(total_of_iterations,iter_k,cocluster_cost_function)
#                     else:# inserting back the element without update
#                         s_lowercase_queue_list.append(s_poi_node_queue)
#                         num_attributes_to_test_s -= 1
#                         total_of_iterations += 1
                else:# inserting back the element without update
                    s_lowercase_queue_list.append(s_poi_node_queue)
#                     print('Current co-cluster CC was NOT improved!')
#                     trajectories_head_sequence_set = tmp_traj_set
#                     head_sequence_str = tmp_head_sequence_str
#                     trajectories_tail_sequence_set = tmp_traj_set
#                     tail_sequence_str = tmp_tail_sequence_str
                    num_attributes_to_test_s -= 1
                    total_of_iterations += 1
#                         clustering_perf.append_result(total_of_iterations,iter_k,cocluster_cost_function)

#                 print('Queue s* AFTER to update: ',s_lowercase_queue_list)
#                 print('')

                ### Performance purpose ###
                ### Descontinuado ###
#                 if cocluster_cost_function != INITIAL_COST:
#                     clustering_perf.append_result(total_of_iterations,iter_k,
#                                                   (final_coclustering_cost+cocluster_cost_function))
#                 else:
#                     clustering_perf.append_result(total_of_iterations,iter_k,final_coclustering_cost)

            ## END while POIs_to_test (POIs_queue) ##
            #########################################

            ### check if CC was identified. If don't, it tries the next element p
            if cocluster_sequence_str == '':
                sequence_cc['cs_sequence_cc'] = ''
                sequence_cc['cs_sequence_cc'] = set()
                
            else: # co-cluster identified Step to store the found cocluster K
#                 final_coclusters.update({str(iter_k):{'cc_objs':cocluster_index_rows_set,
#                                                       'cc_atts':cocluster_sequence_str,
#                                                       'cc_elements':cocluster_elements_set,
#                                                       'cc_cost':cocluster_cost_function}})
#                 final_clustered_elements = final_clustered_elements.union(cocluster_elements_set)
#                 final_coclustering_cost += cocluster_cost_function
#                 print('Main list S BEFORE to update: ',S_poi_freq_dict)
#                 update_uppercase_S(cocluster_attributes_list, cocluster_index_rows_set, S_poi_freq_dict)
#                 print('Main list S AFTER to update: ',S_poi_freq_dict)
#                 partial = timer()
#                 print('Cluster "{}" finished at time "{}".'.format((iter_k+1),(partial-start))
                break
            
        ### END while S
        #
        
        ## into loop of iteration_k
        partial = timer()
        if VERBOSE:
            print('Cluster "{}" finished at time "{}".'.format(iter_k+1,partial-start))
        
        ### there is not any good co-cluster to identify anymore. Stop searching
        if (cocluster_cost_function >= 0) or (cocluster_max_overlapped_coef > overlap_coef_threshold):
            if VERBOSE:
                print('There is not any good co-cluster to identify anymore.')
                
            if cc_type_process == 'sample':
                candidate_ref = avg_cc_analysis
                set_of_candidates = final_coclusters.copy()
#                 candidates_ref_values = candidates_ref_values.copy()
                
                clustering_perf.store_dist(final_coclustering_size)
                candidate_deviation(avg_cc_analysis,final_coclusters,final_coclustering_size,'sample')
                end = timer()
                time_p2 = end-start
                total_time_alg = time_p1+time_p2
                clustering_perf.store_data_scability_test(candidates_ref_values, num_of_els,
                                                          format_time_minutes(total_time_alg), input_data, run_sim)
                
                print('\nTotal clustering time in minutes: ',format_time_minutes(total_time_alg))
                print('Total clustering time: ',format_time_output(end-start), end=" ")
                now = datetime.datetime.now()
                print("("+str(now.day)+"/"+str(now.month)+"/"+str(now.year)+" - "+str(now.hour)+":"+str(now.minute)+":"+str(now.second)+")")
                print('')
                
                if run_sim < 1 and STORE_CLUS_STATS:
                    clustering_perf.compute_measures_at_once(set_of_candidates, candidates_ref_values,
                                                             map_id_to_attribute_dict,
                                                             trajs_data_dict_list, input_data,
                                                             clustering_perf.store_dist(candidates_ref_values))
#                 end = timer()
#                 print('\nTotal clustering time after measures_at_once method: ',format_time_output(end-start))
            
            break
        else:
            if VERBOSE:
                    print('Co-cluster sequence "{}" present in "{}" trajectories.'.format(cocluster_sequence_str,
                                                                                          len(cocluster_index_rows_set)))
            final_coclusters.update({str(iter_k):{'cc_objs':cocluster_index_rows_set,
                                                      'cc_atts':cocluster_sequence_str,
                                                      'cc_elements':cocluster_elements_set,
                                                      'cc_cost':cocluster_cost_function,
                                                      'cc_over_coef':cocluster_max_overlapped_coef}})
            final_clustered_elements = final_clustered_elements.union(cocluster_elements_set)
            final_coclustering_cost += cocluster_cost_function
            update_uppercase_S(cocluster_attributes_list, cocluster_index_rows_set, S_poi_freq_dict)
            
            
            ### PERFORMANCE PURPOSE CODE ###
            ### storing the candidates reference values to evaluate the candidate later
            if avg_cc_analysis == "rows":
#                 print('Rows')
                final_coclustering_size.update({str(iter_k):len(cocluster_index_rows_set)})
                candidates_ref_values.update({str(iter_k):{'rows':len(cocluster_index_rows_set),
                                                           'cost':cocluster_cost_function}})
            elif avg_cc_analysis == "cost":
#                 print('Cost')
                final_coclustering_size.update({str(iter_k):cocluster_cost_function})
                candidates_ref_values.update({str(iter_k):{'rows':len(cocluster_index_rows_set),
                                                           'cost':cocluster_cost_function}})
            else:
                final_coclustering_size.update({str(iter_k):{'rows':len(cocluster_index_rows_set),
                                                             'cost':cocluster_cost_function}})
                candidates_ref_values.update({str(iter_k):{'rows':len(cocluster_index_rows_set),
                                                           'cost':cocluster_cost_function}})
            
#             def store_data_scability_rows_cost(self,cc_id,candidate,time_elapse,num_of_traj_points,
#                                                num_of_els,run_sim,dataset):
            clustering_perf.store_data_scability_rows_cost(int(iter_k+1),
                                                           len(cocluster_index_rows_set),
                                                           int(cocluster_cost_function),
                                                           len(cocluster_sequence_str.split('-')),
                                                           format_time_minutes(timer()-start_1),
                                                           int(len(cocluster_elements_set)),
                                                           int(num_of_els),
                                                           int(run_sim),
                                                           input_data)
        
    ## out of loop iteraton k
#     end = timer()
#     print('\nTotal clustering time: ',format_time_output(end-start))
#     if cc_type_analysis == 'mean':
#         print('Process: {}; Metric: {}; Co-cluster ref: {}'.format(cc_type_process,cc_type_analysis,avg_cc_analysis))
#     else:
#         print('Process: {}; Metric: {}; Co-cluster ref: {}; Z-score: {}'.format(cc_type_process,cc_type_analysis,avg_cc_analysis,cc_z_threshold))
    clustering_perf.summary_clusters(final_coclusters, map_id_to_attribute_dict, trajs_data_dict_list)
#     clustering_perf.calculate_entropy_purity(input_data)
    del map_id_to_attribute_dict,trajs_data_dict_list,candidates_ref_values,cocluster_index_rows_set,cocluster_elements_set
    del final_clustered_elements, S_poi_freq_dict, poi_at_trajs_dict_set
    return D,final_coclusters

In [108]:
f = [34,5,6,7,8,9,2]
print('Mean:',np.mean(f),' Var:',np.std(f), ' Round:',np.round(np.std(f)))

Mean: 10.142857142857142  Var: 9.963197585235825  Round: 10.0


In [8]:
# def avg_cluster_size(ref_analysis, test_value, set_of_clusters):
# def avg_cluster_size(ref_analysis,set_of_clusters):
def candidate_deviation(ref,value_ref,set_of_clusters,cc_type_process='incremental'):
    '''
    Method to return the avg number of the reference in the set of co-clusters.
    If the set is bigger than 1 it calculates the avg, otherwise it is 0.
    Parameters:
        ref_analysis: 1. index_rows_set -> considers the rows; 2. cost_function -> considers the cost.
        test_value: The value to test.
        set_of_clusters: The current set of co-clusters containing its values for the ref_analysis    
    '''
   
    if len(set_of_clusters) >= 2:
        try:# single ref
            mean = np.mean(list(set_of_clusters.values()))
            std = np.std(list(set_of_clusters.values()))
        except:# double ref
            sum_rows = []
            sum_cost = []
            for key,value in set_of_clusters.items():
                sum_rows.append(set_of_clusters[key]['rows'])
                sum_cost.append(set_of_clusters[key]['cost'])
            mean_rows = np.mean(sum_rows)
            mean_cost = np.mean(sum_cost)
            std_rows = np.std(sum_rows)
            std_cost = np.std(sum_cost)
        
        if cc_type_process == 'incremental':
            if ref == "rows":
                ### normal mean
                if cc_type_analysis == 'mean':
                    return len(value_ref['index_rows_set']) >= np.floor(mean)
                else:
                ### z-score: we consider values greater than z_thres once it is a positive distribution
                    try:
                        z = (len(value_ref['index_rows_set'])-mean)/std
                    except:
                        z = (value_ref-mean)/std
                    print('Z-score(rows): ',z)
                    return z >= cc_z_threshold_r
            elif ref == "cost":
                ### normal mean
                if cc_type_analysis == 'mean':
                    return value_ref['cost_function'] <= np.ceil(mean)
                else:
                ### z-score: we consider values smaller than z_thres once it is a negative distribution
                    try:
                        z = (value_ref['cost_function']-mean)/std
                    except:
                        z = (value_ref-mean)/std
                    print('Z-score(cost): ',z)
                    return z <= cc_z_threshold_c
            else:#combine
                ### normal mean
                if cc_type_analysis == 'mean':
#                     print('Mean(rows):',mean_rows,' Mean(cost):',mean_cost)
                    return ((len(value_ref['index_rows_set']) >= np.floor(mean_rows)) or 
                            (value_ref['cost_function'] <= np.ceil(mean_cost)))
                else:
                ### z-score
                    z_rows = (len(value_ref['index_rows_set'])-mean_rows)/std_rows
                    z_cost = (value_ref['cost_function']-mean_cost)/std_cost
#                     print('Z-score(rows): ',z_rows,' Z-score(cost): ',z_cost)
                    return ((z_rows >= cc_z_threshold_r) or (z_cost <= cc_z_threshold_c))
                
        elif cc_type_process == 'sample':
            candidates_to_remove = []
            try:# single ref: rows OR cost
                mean
                if cc_type_analysis == 'mean':
                    print('Mean:',mean)
                    for key,value in set_of_clusters.items():
#                         print('Candidate-'+key+' Mean:',mean,' Value ref:',value,end='')
                        if ref == 'rows' and value < mean:
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        elif ref == 'cost' and value > mean:
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        else:
#                             print(' -> Keep')
                            pass
                else:#z-score
                    for key,value in set_of_clusters.items():
                        z = (value-mean)/std
#                         print('Candidate-'+key+' Z-score:',z,end='')
                        if ref == 'rows' and z < cc_z_threshold_r:
                            candidates_to_remove.append(key)
#                             print(' -> Remove')
                        elif ref == 'cost' and z > -cc_z_threshold_c:
                            candidates_to_remove.append(key)
#                             print(' -> Remove')
                        else:
#                             print(' -> Keep')
                            pass

            except:#double ref combine: rows AND cost
                if cc_type_analysis == 'mean':
                    for key,value in set_of_clusters.items():
#                         print('Candidate-'+key+' Mean(rows):',mean_rows,' Mean(cost):',mean_cost,end='')
                        if (set_of_clusters[key]['rows'] < mean_rows) and (set_of_clusters[key]['cost'] > mean_cost):
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        else:
#                             print(' -> Keep')
                            pass

                else:#z-score
                    for key,value in set_of_clusters.items():
                        z_rows = (set_of_clusters[key]['rows']-mean_rows)/std_rows
                        z_cost = (set_of_clusters[key]['cost']-mean_cost)/std_cost
#                         print('Candidate-'+key+' Z-score(row):',z_rows,' Z-score(cost):',z_cost,end='')
                        if (z_rows < cc_z_threshold_r) and (z_cost > cc_z_threshold_c):
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        else:
#                             print(' -> Keep')
                            pass
                        
#             print("Remove candidates: ",candidates_to_remove)
#             print("Number of candidates to remove: ",len(candidates_to_remove))
            for candidate in candidates_to_remove:#at this point, value_ref is the set of candidates
                del value_ref[candidate]

        else: # just pass step. Stores the candidate co-clusters and analyse them with sample analysis if desirable
            return True               
                
    else:# pass step to reach a minimum number of elements to perform computation
        return True

In [216]:
q = {'0':{'r':4,'c':7},'1':{'r':5,'c':9}}
for key, value in q.items():
    print(key,value)

0 {'r': 4, 'c': 7}
1 {'r': 5, 'c': 9}


In [None]:
q = 'None'
print(q)
if q == None:
    print("None")
else:
    print('Diff none')

In [9]:
def candidate_cocluster(trajs_data_dict_list, poi_at_trajs_dict_set, sequence_cc, s_poi_node_queue):
    INITIAL_COST = 100.0
    ### Current sequence
    ### The method tries to form two sequence, if the sequence is valid the method picks the best one
    head_sequence_str = sequence_cc['cs_sequence_cc']
    trajectories_head_sequence_set = sequence_cc['cs_traj_ids_set_cc']

    tail_sequence_str = sequence_cc['cs_sequence_cc']
    trajectories_tail_sequence_set = sequence_cc['cs_traj_ids_set_cc']

    ### Try to expand the candidate sequence one element at a time if it forms a frequent sequence
    ### Step to test ELEMENT at the HEAD ###
    tmp_head_sequence_str = head_sequence_str
    head_sequence_str = s_poi_node_queue[0]+'-'+head_sequence_str
#     if VERBOSE:
#         print('-> Head sequence: ',head_sequence_str)
    tmp_traj_set = trajectories_head_sequence_set
    trajectories_head_sequence_set = trajectories_head_sequence_set.intersection(poi_at_trajs_dict_set[s_poi_node_queue[0]])
    trajectories_head_sequence_set, position_poi_per_traj_head = check_sequence(trajs_data_dict_list,
                                                                                trajectories_head_sequence_set,
                                                                                head_sequence_str)

    if len(trajectories_head_sequence_set) > 0:
#         if VERBOSE:
#             print('Number of rows with this sequence: {}'.format(len(trajectories_head_sequence_set)))
        elements_head_sequence = form_elements(trajectories_head_sequence_set,
                                               head_sequence_str,
                                               position_poi_per_traj_head)    
        overlapped_elements = elements_head_sequence.intersection(sequence_cc['clustered_elements'])
        cost_head_sequence = cost_function(len(trajectories_head_sequence_set),
                                           len(head_sequence_str.split('-')),
                                           len(overlapped_elements))
#         overlap_coef_head = overlap_coefficient(elements_head_sequence,final_coclusters)
#         print('Head cost: {} and overlap_coef: {}.'.format(cost_head_sequence,
#                                                            overlap_coef_head))
    else:
#         if VERBOSE:
#             print('Tested head sequence "{}" does NOT exist!'.format(head_sequence_str))
        trajectories_head_sequence_set = tmp_traj_set
        head_sequence_str = tmp_head_sequence_str
        cost_head_sequence = INITIAL_COST #
        overlap_coef_head = 1
    #### END test HEAD sequence ####

    #### Step test ELEMENT at the TAIL ####
    tmp_tail_sequence_str = tail_sequence_str
    tail_sequence_str = tail_sequence_str+'-'+s_poi_node_queue[0]
#     if VERBOSE:
#         print('-> Tail sequence: ',tail_sequence_str)
    tmp_traj_set = trajectories_tail_sequence_set
    trajectories_tail_sequence_set = trajectories_tail_sequence_set.intersection(poi_at_trajs_dict_set[s_poi_node_queue[0]])
    trajectories_tail_sequence_set, position_poi_per_traj_tail = check_sequence(trajs_data_dict_list,
                                                                                trajectories_tail_sequence_set,
                                                                                tail_sequence_str)

    if (len(trajectories_tail_sequence_set) > 0):
#         if VERBOSE:
#             print('Number of rows with this sequence: {}'.format(len(trajectories_tail_sequence_set)))
        elements_tail_sequence = form_elements(trajectories_tail_sequence_set,
                                               tail_sequence_str,
                                               position_poi_per_traj_tail)
        overlapped_elements = elements_tail_sequence.intersection(sequence_cc['clustered_elements'])
        cost_tail_sequence = cost_function(len(trajectories_tail_sequence_set),
                                           len(tail_sequence_str.split('-')),
                                           len(overlapped_elements))
#         overlap_coef_tail = overlap_coefficient(elements_tail_sequence,final_coclusters)
#         print('Tail cost: {} and overlap_coef: {}.'.format(cost_tail_sequence,
#                                                            overlap_coef_tail))
    else:
#         if VERBOSE:
#             print('Tested tail sequence "{}" does NOT exist!'.format(tail_sequence_str))
        trajectories_tail_sequence_set = tmp_traj_set
        tail_sequence_str = tmp_tail_sequence_str
        cost_tail_sequence = INITIAL_COST
        overlap_coef_tail = 1
    #### END test TAIL sequence ####
    
#     print('Current co-cluster cost: ',cocluster_cost_function)
#     print('Queue s* BEFORE to upadate: ',s_lowercase_queue_list)

    ### Step to test the best sequence if exist a sequence
    if (cost_head_sequence < cost_tail_sequence) and (cost_head_sequence < 0):
#         print('Co-cluster improved with HEAD sequence.')

        # update the nodes of queue s.
#         update_queue_s(cocluster_sequence_str, head_sequence_str,
#                        s_lowercase_queue_list, s_poi_node_queue)
#         update_queue_s(candidate_sequence['cs_sequence_cc'], head_sequence_str,
#                        s_lowercase_queue_list, s_poi_node_queue)

#         cocluster_sequence_str = head_sequence_str
#         cocluster_attributes_list = head_sequence_str.split('-')
#         cocluster_index_rows_set = trajectories_head_sequence_set.copy()
#         cocluster_elements_set = elements_head_sequence.copy()
#         cocluster_cost_function = cost_head_sequence
#         cocluster_max_overlapped_coef = overlap_coef_head
        
        cc_candidate = {'sequence_str': head_sequence_str,
                        'attributes_list': head_sequence_str.split('-'),
                        'index_rows_set': trajectories_head_sequence_set.copy(),
                        'elements_set': elements_head_sequence.copy(),
                        'cost_function': cost_head_sequence}        
        
        return cc_candidate

    elif (cost_tail_sequence < cost_head_sequence) and (cost_tail_sequence < 0):
#         if VERBOSE:
#             print('Co-cluster improved with TAIL sequence.')

        # update the nodes of queue s.
#         update_queue_s(cocluster_sequence_str,tail_sequence_str,
#                        s_lowercase_queue_list,s_poi_node_queue)
#         update_queue_s(candidate_sequence['cs_sequence_cc'], tail_sequence_str,
#                        s_lowercase_queue_list, s_poi_node_queue)

#         cocluster_sequence_str = tail_sequence_str
#         cocluster_attributes_list = tail_sequence_str.split('-')
#         cocluster_index_rows_set = trajectories_tail_sequence_set.copy()
#         cocluster_elements_set = elements_tail_sequence.copy()
#         cocluster_cost_function = cost_tail_sequence
#         cocluster_max_overlapped_coef = overlap_coef_tail
        
        cc_candidate = {'sequence_str': tail_sequence_str,
                        'attributes_list': tail_sequence_str.split('-'),
                        'index_rows_set': trajectories_tail_sequence_set.copy(),
                        'elements_set': elements_tail_sequence.copy(),
                        'cost_function': cost_tail_sequence}        
        
        return cc_candidate
    
    else:# it does not found any sequence formed by the elements
#         cc_candidate = {'sequence_str': None}
        return None

In [16]:
create_alluvial_diagram()

Trajectory "50" belongs to User: "293"


In [15]:
def create_alluvial_diagram():
    df_traj_user = pd.read_csv('./data/real_application/foursquare_NY/fs_ny_top_users_10.csv', sep=';')
#     df_traj_user.drop(columns=['tid','lat_lon','time','day','type','root_type','rating','weather'],inplace=True)
    df_traj_user = df_traj_user[['new_tid','label']]
    #     print(df_traj_user.head())
    
    traj_id = 50
    user_label = df_traj_user[df_traj_user['new_tid'] == traj_id]['label'].unique()[0]
    print('Trajectory "{}" belongs to User: "{}"'.format(traj_id,
                                                         user_label))

In [162]:
r = ['l1','l2','l3']
t = dict.fromkeys(r)
t['l1'] = 0
t

{'l1': 0, 'l2': None, 'l3': None}

In [35]:
class Performance():
#     perf_df_clustering_output_measures = pd.DataFrame(columns = ['Iteration_i','Candidate_iteration_k',
#                                                             'Candidate_cost'])
    
#     df_quality_clustering = pd.DataFrame(columns= ['Dataset','Clustering_approach','Cocluster_reference',
#                                                    'Cocluster_statistic','Num_of_candidates','Num_of_clusters',
#                                                    'Overall_entropy','Purity'])
#     df_scability = pd.DataFrame(columns = ['num_of_elements','time_minutes','dataset','run_simulation'])

    def __init__(self):
        self.df_scability = pd.DataFrame(columns = ['num_of_elements','time_minutes','run_simulation',
                                                    'num_of_candidates','dataset'])
        
        self.df_scab_rows_cost = pd.DataFrame(columns = ['candidate_id','candidate_num_rows','candidate_cost',
                                                         'time_discovered_minutes','num_of_traj_points','num_of_elements',
                                                         'run_simulation','dataset'])
        self.df_scab_rows_cost.candidate_id = self.df_scab_rows_cost.candidate_id.astype(float)
        self.df_scab_rows_cost.candidate_num_rows = self.df_scab_rows_cost.candidate_num_rows.astype(float)
        self.df_scab_rows_cost.candidate_cost = self.df_scab_rows_cost.candidate_cost.astype(float)
    
    def plot_scability_test(self,file):
        self.dpi = 600
        self.fig = plt.figure(figsize=(3, 2),dpi=self.dpi)
        self.ax = sns.lineplot(x = "num_of_elements", y = "time_minutes", hue='dataset', err_style='bars', data = self.df_scability)
        
        self.ax.legend(loc='upper left', fontsize=4)
        sns.despine(offset=0, trim=True, left=True)
        self.ax.yaxis.set_major_locator(ticker.MultipleLocator(10))
        self.ax.set_yticklabels(self.ax.get_ymajorticklabels(), fontsize = 6)
        self.ax.yaxis.set_major_formatter(ticker.ScalarFormatter())
        plt.xticks(horizontalalignment='center',fontsize=6)
#         ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 6)
#         ax.xaxis.set_major_locator(ticker.MultipleLocator(70))
#         ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
        plt.ylabel('AVG time (minutes)',fontsize=7)
        plt.xlabel('Number of elemetns',fontsize=7)
        self.axes = plt.gca()
        self.axes.yaxis.grid(color='black',linewidth=.1)
        plt.tight_layout()
#         fig.savefig('C:/Users/yurin/Downloads/'+file+'.png',transparent=True,bbox_inches = 'tight',pad_inches=0,dpi=dpi)
        plt.show()
#         print(self.df_scability.head())

    def plot_scability_rows_cost(self,file):
        self.dpi = 600
        self.fig = plt.figure(figsize=(3, 2),dpi=self.dpi)
        self.ax = sns.lineplot(x = "candidate_id",y = "candidate_cost",hue = "dataset",err_style='bars', data = self.df_scab_rows_cost)
        
#         self.ax.legend(loc='upper right', fontsize=4)
        self.ax.legend(fontsize=4)
        sns.despine(offset=0, trim=True, left=True)
        self.ax.yaxis.set_major_locator(ticker.MultipleLocator(20))
        self.ax.set_yticklabels(self.ax.get_ymajorticklabels(), fontsize = 6)
        self.ax.yaxis.set_major_formatter(ticker.ScalarFormatter())
        plt.xticks(horizontalalignment='center',fontsize=6)
#         ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 6)
#         ax.xaxis.set_major_locator(ticker.MultipleLocator(70))
#         ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
        plt.ylabel('Cost value per candidate',fontsize=7)
        plt.xlabel('Number of candidates',fontsize=7)
#         self.axes = plt.gca()
#         self.axes.yaxis.grid(color='black',linewidth=.1)
#         plt.tight_layout()
#         fig.savefig('C:/Users/yurin/Downloads/'+file+'.png',transparent=True,bbox_inches = 'tight',pad_inches=0,dpi=dpi)
        plt.tight_layout()
        plt.show()
        
#         self.dpi = 600
#         self.fig2 = plt.figure(figsize=(3, 2),dpi=self.dpi)
#         self.ax2 = sns.lineplot(x = "candidate_id", y = "candidate_num_rows", hue = "dataset",data = self.df_scab_rows_cost)
#         self.ax2.legend(loc='upper right', fontsize=4)
#         sns.despine(offset=0, trim=True, left=True)
#         self.ax2.yaxis.set_major_locator(ticker.MultipleLocator(20))
#         self.ax2.set_yticklabels(self.ax2.get_ymajorticklabels(), fontsize = 6)
#         self.ax2.yaxis.set_major_formatter(ticker.ScalarFormatter())
#         plt.xticks(horizontalalignment='center',fontsize=6)
#         plt.ylabel('Num of trajs. per candidate',fontsize=7)
#         plt.xlabel('Number of candidates',fontsize=7)
        
#         plt.tight_layout()
#         plt.show()
    
    def store_data_scability_rows_cost(self,cc_id,num_of_rows,cost,seq_len,time_elapse,num_of_traj_points,num_of_els,run_sim,dataset):
        if dataset == 'fs_ny_top_users_193.dat':
            self.__sdst_dataset = 'All users'
        elif dataset == 'fs_ny_top_users_81.dat':
            self.__sdst_dataset = 'Top 81 users'
        elif dataset == 'fs_ny_top_users_10.dat':
            self.__sdst_dataset = 'Top 10 users'
        elif (dataset == 'sjgs.dat') or (dataset == 'sjgs2.dat'):
            self.__sdst_dataset = dataset.split('.')[0].upper()
        else:
            self.__sdst_dataset = 'Undefined dataset.'
        
#         self.df_scab_rows_cost = self.df_scab_rows_cost.append({'candidate_id':cc_id,
#                                                                 'candidate_num_rows':int(num_of_rows),
#                                                                 'candidate_cost':int(cost),
#                                                                 'time_discovered':time_elapse,
#                                                                 'num_of_traj_points':int(num_of_traj_points),
#                                                                 'num_of_elements':num_of_els,
#                                                                 'run_simulation':run_sim,
#                                                                 'dataset':self.__sdst_dataset},
#                                                                ignore_index=True)
        #list of column names
        self.frc_field_names = ['candidate_id','candidate_num_rows','candidate_cost','candidate_seq_len',
                                'time_discovered_minutes','num_of_traj_points','num_of_elements','run_simulation',
                                'dataset']
        
        # Dictionary
        self.frc_data = {'candidate_id':cc_id,'candidate_num_rows':int(num_of_rows),'candidate_cost':int(cost),
                         'candidate_seq_len':int(seq_len),'time_discovered_minutes':time_elapse,
                         'num_of_traj_points':int(num_of_traj_points),'num_of_elements':num_of_els,
                         'run_simulation':run_sim,'dataset':self.__sdst_dataset}
        
        with open('./coclustering_file_outputs/df_scab_rows_cost.csv', 'a+', newline='') as self.frc_object:
      
            # Pass the file object and a list 
            # of column names to DictWriter()
            # You will get a object of DictWriter
            self.rc_dictwriter_object = DictWriter(self.frc_object, delimiter=";", fieldnames=self.frc_field_names)

            #Pass the dictionary as an argument to the Writerow()
            self.rc_dictwriter_object.writerow(self.frc_data)

            #Close the file object
            self.frc_object.close()
    
    def store_data_scability_test(self,candidates_ref_values,num_of_els,time_elapse,dataset,run_sim):
#         datasets = ['fs_ny_top_users_193.dat','fs_ny_top_users_81.dat','fs_ny_top_users_10.dat']
        if dataset == 'fs_ny_top_users_193.dat':
            self.__sdst_dataset = 'All users'
        elif dataset == 'fs_ny_top_users_81.dat':
            self.__sdst_dataset = 'Top 81 users'
        elif dataset == 'fs_ny_top_users_10.dat':
            self.__sdst_dataset = 'Top 10 users'
        elif (dataset == 'sjgs.dat') or (dataset == 'sjgs2.dat'):
            self.__sdst_dataset = dataset.split('.')[0].upper()
        else:
            self.__sdst_dataset = 'Undefined dataset.'
        
#         print('Salvando scability.',end=' ')
        
#         for cc, value in candidates_ref_values.items():
#             self.__sdst_cc_num_of_rows = candidates_ref_values[cc]['rows']
#             self.__sdst_cc_cost = candidates_ref_values[cc]['cost']
            
#         self.df_scability = self.df_scability.append({'num_of_elements':str(num_of_els),
#                                                       'time_minutes':time_elapse,
#                                                       'run_simulation':run_sim,
#                                                       'num_of_candidates':len(candidates_ref_values),
#                                                       'dataset':self.__sdst_dataset},
#                                                      ignore_index=True)
        
        #list of column names
        self.sdst_field_names = ['num_of_elements','time_minutes','run_simulation','num_of_candidates','dataset']
        
        # Dictionary
        self.sdst_data = {'num_of_elements':str(num_of_els),'time_minutes':time_elapse,'run_simulation':run_sim,
                     'num_of_candidates':len(candidates_ref_values),'dataset':self.__sdst_dataset}
        
        with open('./coclustering_file_outputs/df_scability.csv', 'a+', newline='') as self.fsdst_object:
      
            # Pass the file object and a list 
            # of column names to DictWriter()
            # You will get a object of DictWriter
            self.sdst_dictwriter_object = DictWriter(self.fsdst_object, delimiter=";", fieldnames=self.sdst_field_names)

            #Pass the dictionary as an argument to the Writerow()
            self.sdst_dictwriter_object.writerow(self.sdst_data)

            #Close the file object
            self.fsdst_object.close()

    
    def compute_measures_at_once(self,set_of_candidates,candidates_ref_values,map_tid_to_el,trajs_data_dict_list,file_dataset,store_dist=None):
        '''
        Method to compute the measures at once for given dataset.
        It is aimed to avoid unecessary recomputation for the candidates.
        '''
        print('Compute_measures_at_once method class Performance.')
#         short_path_metrics(value_ref,set_of_clusters,cc_type_process='incremental'):
        spm_metric_list = ['mean','z_score']
        smp_ref_list = ['rows','cost','combine']
        smp_z_thres_list = [-1,0,1]

# short_path_metrics(ref,cc_type_analysis,value_ref,set_of_clusters,cc_type_process='incremental',cc_z_threshold=None):
        for cc_type_analysis in spm_metric_list:
            for ref in smp_ref_list:
                if cc_type_analysis == 'mean':
                    print('Process: {}; Metric: {}; Ref: {}'.format(cc_type_process,cc_type_analysis,ref))
#                     candidates_to_remove = bad_candidates(set_of_clusters,cc_type_analysis,ref)
#                     tmp_set_of_candidates = value_ref.copy()
#                     for candidate in candidates_to_remove:#at this point, value_ref is the set of candidates
# #                             del value_ref[candidate]
#                         del tmp_set_of_candidates[candidate]
#                         return tmp_set_of_candidates
                    final_coclusters = short_path_metrics(ref,cc_type_analysis,set_of_candidates,candidates_ref_values,'sample')
                    self.summary_clusters(final_coclusters, map_tid_to_el, trajs_data_dict_list)
                    self.calculate_entropy_purity(file_dataset)
                    self.__store_clustering_statistics(file_dataset,candidates_ref_values,cc_type_analysis,ref)
                else:
                    for cc_z_threshold in smp_z_thres_list:
                        print('Process: {}; Metric: {}; Ref: {}; z_thres: {}'.format(cc_type_process,cc_type_analysis,ref,cc_z_threshold))
#                         candidates_to_remove = bad_candidates(set_of_clusters,cc_type_analysis,ref,cc_z_threshold)
#                         tmp_set_of_candidates = value_ref.copy()
#                         for candidate in candidates_to_remove:#at this point, value_ref is the set of candidates
# #                                 del value_ref[candidate]
#                             del tmp_set_of_candidates[candidate]
#                             return tmp_set_of_candidates
                        final_coclusters = short_path_metrics(ref,cc_type_analysis,set_of_candidates,candidates_ref_values,'sample',cc_z_threshold)
                        self.summary_clusters(final_coclusters, map_tid_to_el,trajs_data_dict_list)
                        self.calculate_entropy_purity(file_dataset)
                        self.__store_clustering_statistics(file_dataset,candidates_ref_values,cc_type_analysis,ref,cc_z_threshold)

#         final_coclusters = short_path_metrics(set_of_candidates,candidates_ref_values,'sample')
#         self.summary_clusters(final_coclusters, map_tid_to_el,trajs_data_dict_list)
#         self.calculate_entropy_purity(file_dataset)
        print('END of measures_at_once method')
    
    def __store_clustering_statistics(self,dataset,candidates_ref_values,cc_type_analysis,ref,cc_z_threshold=''):
        
        if dataset == 'fs_ny_top_users_193.dat':
            self.__scstats_dataset = 'All users'
        elif dataset == 'fs_ny_top_users_81.dat':
            self.__scstats_dataset = 'Top 81 users'
        elif dataset == 'fs_ny_top_users_10.dat':
            self.__scstats_dataset = 'Top 10 users'
        elif (dataset == 'sjgs.dat') or (dataset == 'sjgs2.dat'):
            self.__scstats_dataset = datset.split('.')[0].upper()
        else:
            self.__scstats_dataset = 'Undefined dataset.'

        #list of column names
        self.scs_field_names = ['dataset','metric','cc_reference','num_of_candidates','num_of_clusters',
                                'avg_std_cv_rows','avg_std_cv_cost','num_of_groupped_elements','avg_std_cv_seq_len',
                                'avg_std_cv_relative_rows_compression','avg_std_cv_num_of_users','overall_entropy']
        
        if cc_z_threshold == '':
            self.__scstats_metric = cc_type_analysis
        else:
            self.__scstats_metric = cc_type_analysis+'['+str(cc_z_threshold)+']'
            
        self.__scstats_num_of_cc = len(candidates_ref_values)
        self.__scstats_avg_rows = np.round(np.mean(self.num_of_trajs_per_cluster),3)
        self.__scstats_std_rows = np.round(np.std(self.num_of_trajs_per_cluster),3)
        self.__scstats_cv_rows = np.round((self.__scstats_std_rows/self.__scstats_avg_rows)*100,3)
        self.__scstats_str_rows = str(self.__scstats_avg_rows)+'\u00B1'+str(self.__scstats_std_rows)+'['+str(self.__scstats_cv_rows)+']'
        self.__scstats_avg_cost = np.round(np.mean(self.cost_value_per_cluster),3)
        self.__scstats_std_cost = np.round(np.std(self.cost_value_per_cluster),3)
        self.__scstats_cv_cost = np.round((self.__scstats_std_cost/self.__scstats_avg_cost)*100,3)
        self.__scstats_str_cost = str(self.__scstats_avg_cost)+'\u00B1'+str(self.__scstats_std_cost)+'['+str(self.__scstats_cv_cost)+']'
        self.__scstats_avg_seq_len = np.round(np.mean(self.seq_len_per_cluster),3)
        self.__scstats_std_seq_len = np.round(np.std(self.seq_len_per_cluster),3)
        self.__scstats_cv_seq_len = np.round((self.__scstats_std_seq_len/self.__scstats_avg_seq_len)*100,3)
        self.__scstats_str_seq_len = str(self.__scstats_avg_seq_len)+'\u00B1'+str(self.__scstats_std_seq_len)+'['+str(self.__scstats_cv_seq_len)+']'
        self.__scstats_avg_relative_compress = np.round(np.mean(self.relative_clusters_value),3)
        self.__scstats_std_relative_compress = np.round(np.std(self.relative_clusters_value),3)
        self.__scstats_cv_relative_compress = np.round((self.__scstats_std_relative_compress/self.__scstats_avg_relative_compress)*100,3)
        self.__scstats_str_relative_compress = str(self.__scstats_avg_relative_compress)+'\u00B1'+str(self.__scstats_std_relative_compress)+'['+str(self.__scstats_cv_relative_compress)+']'
        self.__scstats_avg_num_of_users = np.round(np.mean(self.num_of_users_per_cluster),3)
        self.__scstats_std_num_of_users = np.round(np.std(self.num_of_users_per_cluster),3)
        self.__scstats_cv_num_of_users = np.round((self.__scstats_std_num_of_users/self.__scstats_avg_num_of_users)*100,3)
        self.__scstats_str_num_of_users = str(self.__scstats_avg_num_of_users)+'\u00B1'+str(self.__scstats_std_num_of_users)+'['+str(self.__scstats_cv_num_of_users)+']'
        
        
        
        # Dictionary
        self.scs_data = {'dataset':self.__scstats_dataset,'metric':self.__scstats_metric,'cc_reference':ref,
                     'num_of_candidates':self.__scstats_num_of_cc,'num_of_clusters':len(self.perf_cc_clusters),
                     'avg_std_cv_rows':self.__scstats_str_rows,
                     'avg_std_cv_cost':self.__scstats_str_cost,
                     'num_of_groupped_elements':len(self.unique_elements_grouped),
                     'avg_std_cv_seq_len':self.__scstats_str_seq_len,
                     'avg_std_cv_relative_rows_compression':self.__scstats_str_relative_compress,
                     'avg_std_cv_num_of_users':self.__scstats_str_num_of_users,
                     'overall_entropy':self.overall_entropy}
        
        with open('./coclustering_file_outputs/df_clustering_stats.csv','a+',newline='',encoding='utf8') as self.fscs_object:
      
            # Pass the file object and a list 
            # of column names to DictWriter()
            # You will get a object of DictWriter
            self.scs_dictwriter_object = DictWriter(self.fscs_object, delimiter=";", fieldnames=self.scs_field_names)

            #Pass the dictionary as an argument to the Writerow()
            self.scs_dictwriter_object.writerow(self.scs_data)

            #Close the file object
            self.fscs_object.close()
    
    def set_variables(self,num_objs):
        self.perf_df_clustering_output_measures = pd.DataFrame(columns = ['Iteration_i','Candidate_iteration_k',
                                                                          'Candidate_cost'])
        self.total_num_of_objs_df = num_objs
    
    ### descontinuado
    def append_result(self,it_i,cc_it_k,cc_cost):
        self.perf_df_clustering_output_measures = self.perf_df_clustering_output_measures.append({'Iteration_i':int(it_i),
                                                                                        'Candidate_iteration_k':'Candidate_'+str(cc_it_k),
                                                                                        'Candidate_cost':float(cc_cost)},
                                                                                        ignore_index=True)
    def plot_cost(self):
        '''
        Method to show the cost function value along the iterations.
        '''
#         print(self.df_clustering_output_measures.head())
#         self.df_clustering_output_measures['Cocluster_cost'] = self.df_clustering_output_measures['Cocluster_cost'] / self.df_clustering_output_measures['Cocluster_cost'].abs().max()
#         print(self.df_clustering_output_measures.head())
        a4_dims = (11.7, 8.27)
        fig, ax = plt.subplots(figsize=a4_dims)
        sns.lineplot(data=self.perf_df_clustering_output_measures, x="Iteration_i", y="Candidate_cost"
                     , hue="Candidate_iteration_k")#, style="Cluster_iteration_k", markers=True, dashes=False)
        
        if self.perf_df_clustering_output_measures['Candidate_iteration_k'].nunique() > 15:
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
#         self.plt.show()

    def summary_clusters(self, cc_dict, map_id_to_att,trajs_data_dict_list):
        '''
        Method to map back the attributes to its original value and put it avaible as final result visualization.
        '''
        self.perf_cc_clusters = {}
        self.num_of_trajs_per_cluster = []
        self.cost_value_per_cluster = []
        self.seq_len_per_cluster = []
        self.unique_elements_grouped = []
        
        for cluster_k,value in cc_dict.items():
#             print('Cluster ',cluster_k)
            remap_seq_output = []
            for att_id in cc_dict[cluster_k]['cc_atts'].split('-'):
                remap_seq_output.append(map_id_to_att[att_id])
            
            for el in remap_seq_output:
                if el not in self.unique_elements_grouped:
                    self.unique_elements_grouped.append(el)
            
            self.seq_len_per_cluster.append(len(remap_seq_output))
            sequence = '-'.join(remap_seq_output).strip()
            self.num_of_trajs_per_cluster.append(len(cc_dict[cluster_k]['cc_objs']))
            self.cost_value_per_cluster.append(cc_dict[cluster_k]['cc_cost'])
            
            self.perf_cc_clusters.update({cluster_k:{'cc_atts':sequence,'cc_objs':cc_dict[cluster_k]['cc_objs'],
                                                     'cc_cost':cc_dict[cluster_k]['cc_cost'],
                                                     'cc_over_coef':cc_dict[cluster_k]['cc_over_coef']}})
#             self.set_of_objects = self.set_of_objects.union(set(cc_dict[cluster_k]['cc_objs']))
        
#         self.perf_cc_clusters.update({'num_of_objects': len(self.set_of_objects)})
        print('### Clustering statistics ###')
        print('Number of co-clusters: ',len(self.perf_cc_clusters))
        self.__avg_rows = np.round(np.mean(self.num_of_trajs_per_cluster),3)
        self.__std_rows = np.round(np.std(self.num_of_trajs_per_cluster),3)
        self.__cv_rows = np.round((self.__std_rows/self.__avg_rows)*100,3)
        self.__avg_cost = np.round(np.mean(self.cost_value_per_cluster),3)
        self.__std_cost = np.round(np.std(self.cost_value_per_cluster),3)
        self.__cv_cost = np.round((self.__std_cost/self.__avg_cost)*100,3)
        print('AVG rows:{:.2f}\u00B1{:.2f}[CV:{:.2f}%], AVG cost:{:.2f}\u00B1{:.2f}[CV:{:.2f}%]'.format(self.__avg_rows,
                                                                                                        self.__std_rows,
                                                                                                        self.__cv_rows,
                                                                                                        self.__avg_cost,
                                                                                                        self.__std_cost,
                                                                                                        self.__cv_cost))
        self.__avg_seq_len = np.round(np.mean(self.seq_len_per_cluster),3)
        self.__std_seq_len = np.round(np.std(self.seq_len_per_cluster),3)
        self.__cv_seq_len = np.round((self.__std_seq_len/self.__avg_seq_len)*100,3)
        print('AVG sequece length:{:.2f}\u00B1{:.2f}[CV:{:.2f}%]'.format(self.__avg_seq_len,
                                                                         self.__std_seq_len,self.__cv_seq_len))
        print('Number of unique elements grouped: '+str(len(self.unique_elements_grouped)))
        
        
        if VERBOSE:
            for key, value in self.perf_cc_clusters.items():
                print('Co-cluster-{}, Sequence: {}, Num of trajs: {}, Cost: {}'.format(key,
                                                                           self.perf_cc_clusters[key]['cc_atts'],
                                                                           len(self.perf_cc_clusters[key]['cc_objs']),
                                                                           self.perf_cc_clusters[key]['cc_cost']))
#         self.perf_cc_clusters.update({'num_of_objects': len(trajs_data_dict_list)})
    
    def get_clusters(self):
        '''
        Method to show the found co-clusters as follows:
        1. It shows the current co-cluster K with the absolute number of objects into it and the relative number regarding
        the total number of objects in the dataset;
        2. It shows the co-cluster sequence of elements and the objects containing it.
        '''
        self.__it_k = 0
        for cluster_k, value in self.perf_cc_clusters.items():
            if cluster_k != 'num_of_objects':
                self.__it_k += 1
                self.relative = len(self.perf_cc_clusters[cluster_k]['cc_objs'])/self.total_num_of_objs_df
                print('Cluster #'+str(self.__it_k)+' - Candidate {0} [Absolute:{1} | Relative:{2:2.2f} | Cost: {3:} | Ov_coef: {4:1.2f} | Seq length: {5}]'.format(
                                                                                               cluster_k,
                                                                                               len(self.perf_cc_clusters[cluster_k]['cc_objs']),
                                                                                               self.relative,
                                                                                               self.perf_cc_clusters[cluster_k]['cc_cost'],
                                                                                               self.perf_cc_clusters[cluster_k]['cc_over_coef'],
                                                                                               len(self.perf_cc_clusters[cluster_k]['cc_atts'].split('-'))))
                
                if len(self.perf_cc_clusters[cluster_k]['cc_objs']) < 10:
                    print('Attributes sequence "{}" and trajectories "{}".'.format(self.perf_cc_clusters[cluster_k]['cc_atts'],
                                                                                   str(self.perf_cc_clusters[cluster_k]['cc_objs']).strip('{}')))
                else:
                    print('Attributes sequence "{}" and trajectories "{},[...]".'.format(self.perf_cc_clusters[cluster_k]['cc_atts'],
                                                                                str(list(self.perf_cc_clusters[cluster_k]['cc_objs'])[0:8]).strip('[]')))
                print('')
    
    def create_alluvial_diagram(self):
        # Function to create the CSV file that contains the data to generate the Sankey diagram
        
        if self.__sdst_dataset == 'All users':
            df_traj_user = pd.read_csv('./data/real_application/foursquare_NY/fs_ny_top_users_193.csv', sep=';')
            self.__file_alluvial_csv = './coclustering_file_outputs/df_build_alluvial_top193.csv'
        elif self.__sdst_dataset == 'Top 10 users':
            df_traj_user = pd.read_csv('./data/real_application/foursquare_NY/fs_ny_top_users_10.csv', sep=';')
            self.__file_alluvial_csv = './coclustering_file_outputs/df_build_alluvial_top10.csv'
        elif self.__sdst_dataset == 'Top 81 users':
            df_traj_user = pd.read_csv('../data/real_application/foursquare_NY/fs_ny_top_users_81.csv', sep=';')
            self.__file_alluvial_csv = './coclustering_file_outputs/df_build_alluvial_top81.csv'
        elif (self.__sdst_dataset == 'SJGS') or (self.__sdst_dataset == 'SJGS2'):
            df_traj_user = pd.read_csv('./data/real_application/gene_sequences/SJGS/splice_data.csv', sep=';')
            self.__file_alluvial_csv = './coclustering_file_outputs/df_build_alluvial_'+str(self.__sdst_dataset.lower())+'.csv'
    
    #     df_traj_user.drop(columns=['tid','lat_lon','time','day','type','root_type','rating','weather'],inplace=True)
        df_traj_user = df_traj_user[['new_tid','label']]
        #     print(df_traj_user.head())

        traj_id = 50
        user_label = df_traj_user[df_traj_user['new_tid'] == traj_id]['label'].unique()[0]
        print('Trajectory "{}" belongs to User: "{}"'.format(traj_id,user_label))
        
        self.__max_seq_len = 0
        for cluster_k, value in self.perf_cc_clusters.items():
            self.__tmp = len(self.perf_cc_clusters[cluster_k]['cc_atts'].split('-'))
            if self.__tmp > self.__max_seq_len:
                self.__max_seq_len = self.__tmp
        
        print('Max sequence length: '+str(self.__max_seq_len))
        
        self.__levels = ['lvl'+str(i) for i in range(1,self.__max_seq_len+3)]
        print('Levels: '+str(self.__levels))
        self.__columns_df_alluvial = self.__levels
        self.__columns_df_alluvial.append('count')
        self.__columns_df_alluvial.append('cluster')
        print('Columns alluvial df: '+str(self.__columns_df_alluvial))
        
        self.__df_alluvial = pd.DataFrame(columns = self.__columns_df_alluvial)    
        self.__df_alluvial.to_csv(self.__file_alluvial_csv,index=False,sep=';')
        
        self.__alluvial_field_names = self.__columns_df_alluvial
        self.__iter_k = 0
        for cluster_k, value in self.perf_cc_clusters.items():       
            self.__iter_k += 1
            self.__tmp_seq = self.perf_cc_clusters[cluster_k]['cc_atts'].split('-')
            self.__alluvial_data = dict.fromkeys(self.__columns_df_alluvial)
            for traj_id in self.perf_cc_clusters[cluster_k]['cc_objs']:
                user_label = df_traj_user[df_traj_user['new_tid'] == int(traj_id)]['label'].unique()[0]
                self.__alluvial_data['lvl1'] = 'U-'+str(user_label)
                self.__alluvial_data['lvl2'] = traj_id
                
                for element_level in range(0,len(self.__tmp_seq)):
                    self.__alluvial_data['lvl'+str(element_level+3)] = self.__tmp_seq[element_level]
                self.__alluvial_data['count'] = 1
                self.__alluvial_data['cluster'] = 'Cluster-'+str(self.__iter_k)
                
                with open(self.__file_alluvial_csv, 'a+', newline='') as self.__alluvial_object:
      
                    # Pass the file object and a list 
                    # of column names to DictWriter()
                    # You will get a object of DictWriter
                    self.__alluvial_dictwriter_object = DictWriter(self.__alluvial_object, delimiter=";",
                                                                   fieldnames=self.__alluvial_field_names)

                    #Pass the dictionary as an argument to the Writerow()
                    self.__alluvial_dictwriter_object.writerow(self.__alluvial_data)

                    #Close the file object
                    self.__alluvial_object.close()
        
        ### plot alluvial ###
        '''It is ploted out of this process. Check the supplement material.'''
#         self.__df_alluvial = pd.read_csv(self.__file_alluvial_csv,sep=';')
#         self.__fig = self.__genSankey(self.__df_alluvial,cat_cols=self.__levels,value_cols='count')
#         plotly.offline.plot(self.__fig, validate=False)
            
    def __genSankey(self,df,cat_cols=[],value_cols='',title='Sankey Diagram'):
        # old generate sankey. it works well with no repeated variable values
        self.__dpi=300
        self.__board = plt.figure(figsize=(3, 2),dpi=self.__dpi)
        # maximum of 6 value cols -> 6 colors
#         colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
        # maximum of 20 value cols -> 20 colors
        colorPalette = ['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', '#d62728', '#ff9896',
                        '#9467bd', '#c5b0d5', '#8c564b', '#c49c94', '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7',
                        '#bcbd22', '#dbdb8d','#17becf', '#9edae5']
        labelList = []
        colorNumList = []
        for catCol in cat_cols:
            labelListTemp =  list(set(df[catCol].values))
            colorNumList.append(len(labelListTemp))
            labelList = labelList + labelListTemp

        # remove duplicates from labelList
        labelList = list(dict.fromkeys(labelList))

        # define colors based on number of levels
        colorList = []
        for idx, colorNum in enumerate(colorNumList):
            colorList = colorList + [colorPalette[idx]]*colorNum

        # transform df into a source-target pair
        for i in range(len(cat_cols)-1):
            if i==0:
                sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
                sourceTargetDf.columns = ['source','target','count']
            else:
                tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
                tempDf.columns = ['source','target','count']
                sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
            sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()

        # add index for source-target pair
        sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
        sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))

        # creating the sankey diagram
        data = dict(
            type='sankey',
            node = dict(
              pad = 10,
              thickness = 15,
              line = dict(
                color = "black",
                width = 0.5
              ),
              label = labelList,
              color = colorList
            ),
            link = dict(
              source = sourceTargetDf['sourceID'],
              target = sourceTargetDf['targetID'],
              value = sourceTargetDf['count']
            )
          )

        layout =  dict(
            title = title,
#             height = 372,
#             width = 550,
            font = dict(
              size = 10
            )
        )

    #     fig = go.Figure(data = [go.Sankey(data,layout)])
#         fig = go.Figure(data = [go.Sankey(data)])
        fig = dict(data=[data], layout=layout)
        return fig
    
    def store_dist(self,set_of_candidates):
        self.set_of_candidates = set_of_candidates
    
    def __get_entropy_purity(self):
        print('Overall entropy H: '+str(self.overall_entropy))
        print('Purity: '+str(self.purity))
        self.__gep_avg_relative = np.round(np.mean(self.relative_clusters_value),3)
        self.__gep_std_relative = np.round(np.std(self.relative_clusters_value),3)
        self.__gep_cv_relative = np.round((self.__gep_std_relative/self.__gep_avg_relative)*100,3)
        print('AVG relative co-clusters: {:.2f}\u00B1{:.2f}[CV:{:.2f}%]'.format(self.__gep_avg_relative,
                                                                                self.__gep_std_relative,
                                                                                self.__gep_cv_relative))
        self.__gep_avg_num_users = np.round(np.mean(self.num_of_users_per_cluster),3)
        self.__gep_std_num_users = np.round(np.std(self.num_of_users_per_cluster),3)
        self.__gep_cv_num_users = np.round((self.__gep_std_num_users/self.__gep_avg_num_users)*100,3)
        print('AVG num. of users: {:.2f}\u00B1{:.2f}[CV:{:.2f}%]'.format(self.__gep_avg_num_users,
                                                                         self.__gep_std_num_users,
                                                                         self.__gep_cv_num_users))
        print('')
    
    def calculate_entropy_purity(self, file_dataset):
        
        self.split = file_dataset.split('.')
        if self.split[0] != 'sjgs' or self.split[0] != 'sjgs2' or split[0] != 'splice_data':
            if self.split[-1] == 'dat':
                self.path = './data/real_application/foursquare_NY/concat_dimensions/'
                self.df_trajs_users = create_df_map_traj_user(pd.read_csv(self.path+self.split[0]+'.csv', sep=";"))
            else:
                self.path = './data/real_application/foursquare_NY/concat_dimensions/'
                self.df_trajs_users = create_df_map_traj_user(pd.read_csv(self.path+file_dataset, sep=";"))
        else:
            if self.split[-1] == 'dat':
                self.path = './data/real_application/gene_sequences/SJGS/'
                self.df_trajs_users = create_df_map_traj_user(pd.read_csv(self.path+self.split[0]+'.csv', sep=";"))
            else:
                self.path = './data/real_application/gene_sequences/SJGS/'
                self.df_trajs_users = create_df_map_traj_user(pd.read_csv(self.path+file_dataset, sep=";"))
        
        self.relative_clusters_value = []
        self.entropy_per_cluster = []
        self.num_of_objs_per_cluster = []
        self.total_num_of_objs = len(self.df_trajs_users)
        self.overall_entropy = 0
        self.max_prob_per_cluster = []
        self.purity = 0
        self.num_of_users_per_cluster = []
        
        for cluster_k, value in self.perf_cc_clusters.items():
            self.users = {}
            if cluster_k != 'num_of_objects':
                self.num_of_objs_k = len(self.perf_cc_clusters[cluster_k]['cc_objs'])
                self.relative_clusters_value.append((self.num_of_objs_k/self.total_num_of_objs)*100)
#                 print('Cluster-'+str(cluster_k)+' | # of trajs: '+str(self.num_of_objs_k))
                self.trajs = list(map(int,self.perf_cc_clusters[cluster_k]['cc_objs']))
                self.df_cluster = self.df_trajs_users[self.df_trajs_users['Tid'].isin(self.trajs)]
##                 self.users = self.df_cluster['User'].value_counts().to_dict()
                self.users = self.df_cluster['User'].value_counts()
                self.num_of_users_per_cluster.append(len(self.users))
                self.h_k = entropy(self.users,base=2)
##                 self.users = self.users.to_dict()
                #print(self.users,end=' | ')
                #print("Entropy h_k: "+str(self.h_k))
                self.entropy_per_cluster.append(self.h_k)
                self.num_of_objs_per_cluster.append(self.num_of_objs_k)
##                 self.total_num_of_objs += self.num_of_objs_k
##                self.max_prob_per_cluster.append(np.array(list(self.users.values())).max())
                self.max_prob_per_cluster.append(list(self.users)[0])
        
        self.overall_entropy = np.sum((np.array(self.entropy_per_cluster)*
                                       (np.array(self.num_of_objs_per_cluster)/self.total_num_of_objs)))
        self.purity = np.array(self.max_prob_per_cluster).sum()/self.total_num_of_objs
        self.__get_entropy_purity()
    
    def show_boxplot(self):
        '''
        Method to show the distribution values of the candidates.
        '''
        #         array = np.random.uniform(size=20)
        self.array = list(self.set_of_candidates.values())
        self.ref = ''
        if np.mean(self.array) < 0:
            self.ref = 'Cost ref'
        else:
            self.ref = 'Rows ref'
        ax = sns.boxplot(data=self.array)
        ax = sns.swarmplot(data=self.array, color=".25")
        plt.xticks([0],[self.ref])
#         plt.xlabel("Reference")
        plt.ylabel("Values")
        plt.title(self.ref+" distribution")
    #     plt.show(ax)
    
    def test_norm_dist(self):
        import scipy.stats as stats
        try:
            self.mean = np.mean(list(self.set_of_candidates.values()))
            self.std = np.std(list(self.set_of_candidates.values()),ddof=1)
            self.dist_values = list(self.set_of_candidates.values())
#             self.shapiro_stat, self.shapiro_p_value = stats.shapiro(self.dist_values)
#     #         print('O valor da estatística de shapiro-wilk = '+str(self.shapiro_stat))
#     #         print('O valor do p-value de shapiro-wilk = '+str(self.shapiro_p_value))
#             if self.shapiro_p_value >=0.5:
#                 print('Com 95% de confiança, os dados são similares a uma distribuição normal segundo o teste de Shapiro-Wilk.')
#             else:
#                 print('Com 95% de confiança, os dados NÃO são similares a uma distribuição normal segundo o teste de Shapiro-Wilk.')
#             print('')
            self.__shapiro_wilk_test()
            self.__kolomogorov_smirnov_test()
            self.__anderson_darling_test()
            print('')
        except Exception as inst:
            print('Please, test the variable individually.')
            print('Error:',inst)
#             self.dist_values_rows = []
#             self.dist_values_cost = []

#             for key,value in self.set_of_candidates.items():
#                 self.dist_values_rows.append(self.set_of_candidates[key]['rows'])
#                 self.dist_values_cost.append(self.set_of_candidates[key]['cost'])
            
#             self.shapiro_stat_rows, self.shapiro_p_value_rows = stats.shapiro(self.dist_values_rows)
#             self.shapiro_stat_cost, self.shapiro_p_value_cost = stats.shapiro(self.dist_values_cost)
    
    def __shapiro_wilk_test(self):
        self.shapiro_stat, self.shapiro_p_value = stats.shapiro(self.dist_values)
#         print('O valor da estatística de shapiro-wilk = '+str(self.shapiro_stat))
#         print('O valor do p-value de shapiro-wilk = '+str(self.shapiro_p_value))
        if self.shapiro_p_value >=0.5:
            print('Segundo o teste de Shapiro-Wilk, com 95% de confiança, os dados são similares a uma distribuição normal.')
        else:
            print('Segundo o teste de Shapiro-Wilk, com 95% de confiança, os dados NÃO são similares a uma distribuição normal.')
    
    def __kolomogorov_smirnov_test(self):
        self.ks_stat, self.ks_p_value = stats.kstest(self.dist_values,cdf='norm', args=(self.mean,self.std), N=len(self.dist_values))
        self.ks_critico = self.__kolmogorov_smirnov_critico(len(self.dist_values))
        if self.ks_critico >= self.ks_stat:
            print('Segundo o teste Kolomogorov-Smirnov, com 95% de confiança, os dados são similares a uma distribuição normal.')
        else:
            print('Segundo o teste Kolomogorov-Smirnov, com 95% de confiança, os dados NÃO são similares a uma distribuição normal.')
    # Checking the critical value of the Kolmogorov-Smirnov test
    def __kolmogorov_smirnov_critico(self,n):
        # table of critical values for the kolmogorov-smirnov test - 95% confidence
        # Source: https://www.soest.hawaii.edu/GG/FACULTY/ITO/GG413/K_S_Table_one_Sample.pdf
        # Source: http://www.real-statistics.com/statistics-tables/kolmogorov-smirnov-table/
        # alpha = 0.05 (95% confidential level)

        if n <= 40:
            # valores entre 1 e 40
            self.kolmogorov_critico = [0.97500, 0.84189, 0.70760, 0.62394, 0.56328, 0.51926, 0.48342, 0.45427, 0.43001, 0.40925, 
                          0.39122, 0.37543, 0.36143, 0.34890, 0.33760, 0.32733, 0.31796, 0.30936, 0.30143, 0.29408, 
                          0.28724, 0.28087, 0.27490, 0.26931, 0.26404, 0.25907, 0.25438, 0.24993, 0.24571, 0.24170, 
                          0.23788, 0.23424, 0.23076, 0.22743, 0.22425, 0.22119, 0.21826, 0.21544, 0.21273, 0.21012]
            self.ks_critico = self.kolmogorov_critico[n - 1]
        elif n > 40:
            # valores acima de 40:
            self.kolmogorov_critico = 1.36/(np.sqrt(n))
            self.ks_critico = self.kolmogorov_critico
        else:
            pass            

        return self.ks_critico
    
    def __anderson_darling_test(self):
        self.ad_stat, self.ad_critico, self.ad_teorico = stats.anderson(self.dist_values,'norm')
        if self.ad_stat < self.ad_critico[2]:
            print('Segundo o teste de Anderson-Darling, com 95% de confiança, os dados são similares a uma distribuição normal.')
        else:
            print('Segundo o teste de Anderson-Darling, com 95% de confiança, os dados NÃO são similares a uma distribuição normal.')

    def test_skewness(self):
        self.dist_values = list(self.set_of_candidates.values())
        self.mean = np.mean(self.dist_values)
        self.median = np.median(self.dist_values)
        vals,counts = np.unique(self.dist_values, return_counts=True)
        index = np.argmax(counts)
        self.mode = vals[index]
        
        
        if (self.mean == self.median) and (self.mean == self.mode):
            print('Distribuição normal | Mean:{} = Median: {} = Mode: {}.'.format(self.mean,self.median,self.mode))
        #positive values
        if (self.mean < self.median) and (self.median < self.mode):
            print('Assimetria à Esquerda (negativa) | Mean:{} < Median: {} < Mode: {}.'.format(self.mean,self.median,self.mode))
        if (self.mode < self.median) and (self.median < self.mean):
            print('Assimetria à Direita (positiva) | Mode:{} < Median: {} < Mean: {}.'.format(self.mode,self.median,self.mean))
        #negativa values
        if self.mean < 0:
            print('mean:',self.mean,' median:',self.median,' mode:',self.mode)
            if (self.mean > self.median) and (self.median > self.mode):
                print('Assimetria à Esquerda (negativa) | Mean:{} < Median: {} < Mode: {}.'.format(self.mean,self.median,self.mode))
            if (self.mode > self.median) and (self.median > self.mean):
                print('Assimetria à Direita (positiva) | Mode:{} < Median: {} < Mean: {}.'.format(self.mode,self.median,self.mean))
        print('')



#### Modificação do candidate deviation método. Esta modificação é para fazer os cálculos das medidas sem ter que
#### recalcular os candidatos no mesmo dataset. Os candidatos não mudam no modo de descoberta automática.
# def short_path_metrics(ref,value_ref,set_of_clusters,cc_type_process='incremental'):
def short_path_metrics(ref,cc_type_analysis,value_ref,set_of_clusters,cc_type_process='incremental',cc_z_threshold=None):
    '''
    Method to return the avg number of the reference in the set of co-clusters.
    If the set is bigger than 1 it calculates the avg, otherwise it is 0.
    Parameters:
        ref_analysis: 1. index_rows_set -> considers the rows; 2. cost_function -> considers the cost.
        test_value: The value to test.
        set_of_clusters: The current set of co-clusters containing its values for the ref_analysis    
    '''
   
    if len(set_of_clusters) >= 2:
        try:# single ref
            mean = np.mean(list(set_of_clusters.values()))
            std = np.std(list(set_of_clusters.values()))
        except:# double ref
            sum_rows = []
            sum_cost = []
            for key,value in set_of_clusters.items():
                sum_rows.append(set_of_clusters[key]['rows'])
                sum_cost.append(set_of_clusters[key]['cost'])
            mean_rows = np.mean(sum_rows)
            mean_cost = np.mean(sum_cost)
            std_rows = np.std(sum_rows)
            std_cost = np.std(sum_cost)
        
        if cc_type_process == 'incremental':
            if ref == "rows":
                ### normal mean
                if cc_type_analysis == 'mean':
                    return len(value_ref['index_rows_set']) >= np.floor(mean)
                else:
                ### z-score: we consider values greater than -1 once it is a positive distribution
                    try:
                        z = (len(value_ref['index_rows_set'])-mean)/std
                    except:
                        z = (value_ref-mean)/std
                    print('Z-score(rows): ',z)
                    return z >= cc_z_threshold
            elif ref == "cost":
                ### normal mean
                if cc_type_analysis == 'mean':
                    return value_ref['cost_function'] <= np.ceil(mean)
                else:
                ### z-score: we consider values smaller than 1 once it is a negative distribution
                    try:
                        z = (value_ref['cost_function']-mean)/std
                    except:
                        z = (value_ref-mean)/std
                    print('Z-score(cost): ',z)
                    return z <= -cc_z_threshold
            else:#combine
                ### normal mean
                if cc_type_analysis == 'mean':
#                     print('Mean(rows):',mean_rows,' Mean(cost):',mean_cost)
                    return ((len(value_ref['index_rows_set']) >= np.floor(mean_rows)) or 
                            (value_ref['cost_function'] <= np.ceil(mean_cost)))
                else:
                ### z-score
                    z_rows = (len(value_ref['index_rows_set'])-mean_rows)/std_rows
                    z_cost = (value_ref['cost_function']-mean_cost)/std_cost
#                     print('Z-score(rows): ',z_rows,' Z-score(cost): ',z_cost)
                    return ((z_rows >= -cc_z_threshold) or (z_cost <= cc_z_threshold))
                
        elif cc_type_process == 'sample':
            candidates_to_remove = []
            
            if ref != 'combine':
#             try:# single ref: rows OR cost
#                 mean
                if cc_type_analysis == 'mean':
                    for key,value in set_of_clusters.items():
#                         print('Candidate-'+key+' Mean:',mean,' Value ref:',value,end='')
                        if ref == 'rows' and set_of_clusters[key]['rows'] < mean_rows:
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        elif ref == 'cost' and set_of_clusters[key]['cost'] > mean_cost:
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        else:
#                             print(' -> Keep')
                            pass
                else:#z-score
                    for key,value in set_of_clusters.items():
#                         z = (value-mean)/std
#                         print('Candidate-'+key+' Z-score:',z,end='')
                        if ref == 'rows':
                            z = (set_of_clusters[key]['rows']-mean_rows)/std_rows
                            if z < cc_z_threshold:
                                candidates_to_remove.append(key)
#                             print(' -> Remove')
                        elif ref == 'cost':
                            z = (set_of_clusters[key]['cost']-mean_cost)/std_cost
                            if z > -cc_z_threshold:
                                candidates_to_remove.append(key)
#                             print(' -> Remove')
                        else:
#                             print(' -> Keep')
                            pass

            else:
#             except:#double ref combine: rows AND cost
                
                if cc_type_analysis == 'mean':
                    for key,value in set_of_clusters.items():
#                         print('Candidate-'+key+' Mean(rows):',mean_rows,' Mean(cost):',mean_cost,end='')
                        if (set_of_clusters[key]['rows'] < mean_rows) and (set_of_clusters[key]['cost'] > mean_cost):
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        else:
#                             print(' -> Keep')
                            pass

                else:#z-score
                    for key,value in set_of_clusters.items():
                        z_rows = (set_of_clusters[key]['rows']-mean_rows)/std_rows
                        z_cost = (set_of_clusters[key]['cost']-mean_cost)/std_cost
#                         print('Candidate-'+key+' Z-score(row):',z_rows,' Z-score(cost):',z_cost,end='')
                        if (z_rows < cc_z_threshold) and (z_cost > -cc_z_threshold):
#                             print(' -> Remove')
                            candidates_to_remove.append(key)
                        else:
#                             print(' -> Keep')
                            pass
            
#             return candidates_to_remove  
#             print("Remove candidates: ",candidates_to_remove)
#             print("Number of candidates to remove: ",len(candidates_to_remove))
            tmp_set_of_candidates = value_ref.copy()
            for candidate in candidates_to_remove:#at this point, value_ref is the set of candidates
#                 del value_ref[candidate]
                del tmp_set_of_candidates[candidate]
            return tmp_set_of_candidates

        else: # just pass step. Storing the candidate co-clusters to analyze them with sample analysis if desirable
            return True               
                
    else:# pass step to reach a minimum number of elements to perform computation
        return True


In [48]:

# show_boxplot()

In [998]:
def show_boxplot():
    array = np.random.uniform(size=20)
    ax = sns.boxplot(data = array)
    ax = sns.swarmplot(data=array, color=".25")
    plt.xticks([0],['SDF'])
    plt.xlabel("Reference")
#     plt.show(ax)

In [None]:
f = {}
f['1'] = {}
f['1'].update({'cc_objs':[]})
f['1'].update({'cc_atts':[]})
f['1'].update({'cc_elements':[]})
print(f)
print(f['1']['cc_objs'])
f.update({'2':{}})
print(f)


In [11]:
# def update_queue_s(cocluster_sequence_str, tested_sequence_str, s_poi_freq_queue_list, poi_node_queue):
def update_queue_s(cocluster_sequence_str, s_poi_freq_queue_list, poi_node_queue):
    '''
    Method to update the nodes in queue s. It decrements the value of a given node in s.
    The input are:
        1. The current string sequence of a cocluster;
        2. The tested string sequence to improve a cocluster;
        3. The queue s;
        4. A single node of queue s.
    '''
    # update list s when the first sequence is identified
#     if cocluster_sequence_str == '':
    tmp_split = cocluster_sequence_str.split('-')
    if len(tmp_split) == 2:
#         tmp_split = tested_sequence_str.split('-')
        s_poi_freq_queue_list.append(poi_node_queue)
        for attribute in tmp_split:
            for node_s in s_poi_freq_queue_list:
                if attribute == node_s[0]:
                    node_s[1] -= 1
                    if node_s[1] <= 0: # all occurences were used, then remove the element from the queue
                        print('Element with 0 removed.')
                        s_poi_freq_queue_list.remove(node_s)
                    break
    else: # update a single node in case a sequence is already discovered
        poi_node_queue[1] -= 1
        if poi_node_queue[1] <= 0: # all occurences were used, then remove the element from the queue
            s_poi_freq_queue_list.remove(node_s)
        else:
            s_poi_freq_queue_list.append(poi_node_queue)

In [12]:
def update_uppercase_S(cc_atts, cc_objs, S_dict):
    '''
    Method to update the dictonary S. It decrements the frequency of the given attributes in S.
    S is updated regarding the frequency of an attribute times the number of objects that it appears in a
    co-cluster.
    E.g., Given a co-clsuter with sequence Home-Work-Home with 5 trajectories. Then, in S, Home is 
    decremented with value 10 (2*5) and Work with value 5 (1*5).
    
    The input are:
        1. Co-cluster attributes;
        2. Co-cluster objects;
        3. The dictionary S of attributes and its frequency.
    '''
    tmp_dict = {}
    for attribute in cc_atts: # groups repeation
#         S_poi_freq_dict[attribute] -= 1
        try:
            tmp_dict[attribute] += 1
        except:
            tmp_dict.update({attribute:1})
    for attribute, value in tmp_dict.items():
        S_dict[attribute] -= (tmp_dict[attribute]*len(cc_objs))
        if S_dict[attribute] <= 0:
            S_dict.pop(attribute)

In [None]:
a1 = ['a','b','c','d']
a2 = [1,2,3,4]
[str(i)+str(j) for i in a1 for j in a2]

In [None]:
a = np.array([1,1,1,0,0,0,0])
b = np.array([1,1,3,0,0,0,0])
c = np.outer(a,b)
print(c)
d = (c*0)+1
print(d)
print(c+d)
e = d*4
print(e)
f = d-e
print(f)
print(sum(sum(f)))

In [None]:
def myFunc(e):
    return e[:][1]

er = [['f',2],['h',5],['t',1]]
print(er)
er.sort()
print(er)
er.sort(reverse=True, key=myFunc)
print(er)
er[0][1] -= 1
print(er)

In [None]:
t= [1,2,3,4,5,6,7,8]
print(t)
push_to_end = 3
complete_cicle = False
reload = True
print(t.pop(2))
print(t)
t= [1,2,3,4,5,6,7,8]
print(t)
# while(reload and complete_cicle != True):
#     for i in range(len(t)):
#         if t[i] == push_to_end:
#             tmp = t.pop(i)
#             t.append(tmp)
#             complete_cicle == True
#         else:
#             print(t[i])
#         if push_to_end == t[i] and complete_cicle == True:
#             reload = False
#             break
            


# Support functions for the algorithms

In [13]:
def get_data(input_data):
    '''
    This method will assign the variables used by the algorithm.
    
    INPUT
        input_data: A panda dataframe of the input data file.
    
    OUTPUT
        D: A binary matrix from the input data.
        N: A noise binary matrix with the same size of D.
        data_dict: A dictionary to store D as a vertical representation.
        data_res_dict: A copy of data_dict used to sort the attributes of D and find unconvered elements.
        
    '''
    
    data_pd = input_data #txt file with sequence of check-ins (POI)
    frequence_per_poi_dict = {} # store the frequence of a POI as "POI": num_of_occurrences
    poi_at_trajs_dict_set = {}  # store a set with each index line (tid trajectory) that contains a given POI.
                            # "POI": set(0,1,4,...); It is the S variable
#     global data_res_dict
    uncover_poi_dict = {} # It is the s* variable
#     global D # input data as a binary matrix
#     global N # noise matrix with the same size of D
    num_of_objects = 0
    num_of_attributes = 0
    map_id_to_attribute = {} # map the 
    map_attribute_to_id = {} # map the
    trajectory_dict = {} # it stores the trajectories with its check-ins. "TID": [POI1,POI2,...]
#     max_val_att = 0 
    att_id = 0 # assign an ID to each attribute
    
    # read each line
    for index, row in data_pd.iterrows():
        num_of_objects+=1
        object_data = row[0].split(" ")
#         trajectory_dict[str(index)] = {}
#         trajectory_dict[str(index)] = object_data
        trajectory_dict[str(index)] = []
        
#         for attribute in object_data: # we look at each item of the given transaction
        for att_j in range(len(object_data)): # we look at each item of the given transaction
            attribute = object_data[att_j]
            
            if attribute != "":
#                 if int(attribute) > max_val_att:
#                     max_val_att = int(attribute)
#                 if attribute not in map_unique_attributes_dataset:
#                 if attribute not in map_attribute_to_id.keys():
    
                if attribute not in map_attribute_to_id: # mapping
#                     unique_attributes_dataset.append(attribute)
                    map_attribute_to_id[attribute] = str(att_id)
                    map_id_to_attribute[str(att_id)] = attribute
                    att_id += 1
                
                # substitute the check-in by its ID
                trajectory_dict[str(index)].append(map_attribute_to_id[attribute])
                
                # store the indeces containing a given POI
                if map_attribute_to_id[attribute] in poi_at_trajs_dict_set:
#                     data_dict[map_attribute_to_id[attribute]].append(index)
                    poi_at_trajs_dict_set[map_attribute_to_id[attribute]].add(str(index))
                else:
#                     data_dict[map_attribute_to_id[attribute]] = [index]
                    poi_at_trajs_dict_set[map_attribute_to_id[attribute]] = set([str(index)])
                
                # store the frequence for each POI
                if map_attribute_to_id[attribute] in frequence_per_poi_dict:
                    current_value = frequence_per_poi_dict[map_attribute_to_id[attribute]]
                    frequence_per_poi_dict[map_attribute_to_id[attribute]] = current_value + 1
                else:
                    frequence_per_poi_dict[map_attribute_to_id[attribute]] = 1
            
                    
    uncover_poi_dict = poi_at_trajs_dict_set.copy()
#     num_of_attributes = len(data_dict)
#     num_of_attributes = max_val_att+1
#     num_of_attributes = len(map_attribute_to_id)
    print("######################################")
    print("Number of trajectories: "+str(index+1))
    print("Number of unique check-ins: "+str(len(map_attribute_to_id)))
    print("########################################")
    if VERBOSE:
        print("Map_attribute_to_id:"+str(map_attribute_to_id))
        print("")
        print("Map_id_to_attribute:"+str(map_id_to_attribute))
        print("")
        print("Frequence_per_poi:"+str(frequence_per_poi_dict))
        print("")
        print("Trajectories: "+str(trajectory_dict))
        print("")
        print("POI occurring at trajectories: "+str(poi_at_trajs_dict_set))
        print("Get data is DONE!")
        
    
#     D = np.zeros((num_of_objects,num_of_attributes),dtype=int)
#     for key, values in poi_at_trajs_dict.items():
#         print("key:"+str(key)+" Values:"+str(values))
#         for line in values:
# #             D[line][int(key)] = 1
# #             D[line][map_unique_attributes_dataset[key]] = 1
# #             print(line,key)
# #             print(type(line),type(key))
#             D[line][int(key)] = 1
#     N = np.zeros((num_of_objects,num_of_attributes),dtype=int)
    
#     return D, N, poi_at_trajs_dict, data_res_dict, map_id_to_attribute
    return map_id_to_attribute, frequence_per_poi_dict, poi_at_trajs_dict_set, trajectory_dict

In [943]:
df = pd.read_csv('data/real_application/foursquare_NY/fs_ny_top_users_10.csv', sep=";")
df_tmp = create_df_map_traj_user(df)
print(df_tmp[df_tmp['Tid'].isin([1,3,6])])
e = df_tmp[df_tmp['Tid'].isin([1,3,6])]['User'].value_counts()
print(e.to_dict())
print(df_tmp[df_tmp['Tid']==1]['User'].values[0])

   Tid  Traj_length  User
1    1           81   185
3    3           77   185
6    6           71   185
{185: 3}
185


In [27]:
# def create_df_map_traj_user(df):
def create_df_map_traj_user(df=pd.DataFrame):
    '''
    Method to support the calculation of the quality result.
    It returns a dataframe with the users and their trajectories with its respective length.
    '''
    try:
        #     df = pd.read_csv('data/real_application/foursquare_NY/fs_ny_top_users_10.csv', sep=";")
        df_map_traj_user = pd.DataFrame(columns=['Tid','Traj_length','User'])
        tids = []
        user = ''
        traj_length = 0

        sequence = []
        past_tid = None
        curr_tid = None
        num_of_seqs = 0
        map_element_id = 0
        unique_elements = {}
        map_id_to_element = {}

        for i in range(len(df)):
            curr_tid = df.loc[i,"new_tid"]
            if curr_tid not in tids:
                tids.append(int(curr_tid))
                user = int(df.loc[i,"label"])
                traj_length = len(df[df['new_tid'] == curr_tid])
                # append rows to an empty DataFrame
                df_map_traj_user = df_map_traj_user.append({'Tid' : curr_tid, 'Traj_length' : traj_length, 'User' : user},ignore_index = True)
        df_map_traj_user['Tid'] = df_map_traj_user['Tid'].astype(int, errors='ignore')
        df_map_traj_user['Traj_length'] = df_map_traj_user['Traj_length'].astype(int, errors='ignore')
        df_map_traj_user['User'] = df_map_traj_user['User'].astype(int, errors='ignore')

    #     print(df_map_traj_user.shape)
    #     print(df_map_traj_user.head())
    #     print('Todo DataFrame (traj_length):',' mean=',df_map_traj_user['Traj_length'].mean(),
    #           ' std=',df_map_traj_user['Traj_length'].std())
    #     u_185 = df_map_traj_user[df_map_traj_user['User']==185]
    #     print(u_185)
    #     print('User 185:',' mean=',u_185['Traj_length'].mean(),' std=',u_185['Traj_length'].std())
    #     df_map_traj_user.groupby(['label']).nunique()['new_tid'].mean()
        r = (df_map_traj_user.groupby(['User'])['Traj_length']
             .agg([np.count_nonzero,np.mean,np.std])
             .rename(columns={'count_nonzero':'Count_trajs',
                              'mean':'AVG_traj_length',
                              'std':'STD_traj_length'}))
    #     print('Número médio de trajs por usuário = {:.2f} com DP = {:.2f}'.format(r['Count_trajs'].mean(),
    #                                                                        r['Count_trajs'].std()))
    #     print('Tamanho médio das trajs por usuário = {:.2f} com DP médio = {:.2f}'.format(r['AVG_traj_length'].mean(),
    #                                                                        r['STD_traj_length'].mean()))
        return df_map_traj_user
    except:
        raise('Please, check the input data format.')
        return None
    

In [None]:
from collections import deque
my_fila = deque([{'hotel':4},{'casa':7},{'trabalho':9},{'padaria':2}])
my_fila2 = deque()
my_fila2.append({'hotel':4})
my_fila2.append({'padaria':2})
print('Fila 1: ',type(my_fila))
print('Fila 1: ',my_fila)
print('Fila 2: ',my_fila2)
print(my_fila)
my_fila.append({'festa':1})
print(my_fila)
my_fila.appendleft({'aeroporto':1})
print(my_fila)
print(my_fila[1])
my_fila.insert(1,{'padaria':3})
print(my_fila)
print(len(my_fila))
print(my_fila.pop())
print(type(my_fila[2]))
r = my_fila[4]
print(r)
print(list(r.keys())[0])
print(my_fila.index(my_fila[4],2,len(my_fila)))
print('Fila 1: ',my_fila)
print('Fila 2: ',my_fila2)
my_fila.pop()#delete from the right end
my_fila.popleft()#delete from the left end
print('Fila 1: ',my_fila)
f = my_fila.popleft()
print('Fila 1: ',my_fila)
print('Element poped: ',f)
my_fila.append(f)
print(my_fila)
# my_fila.popleft()
# print(my_fila)
# my_fila.popleft()
# print(my_fila)
# my_fila.popleft()
# print(my_fila)
# my_fila.popleft()
# print(my_fila)
# print(my_fila)
# while my_fila:
#     fx = my_fila.popleft()
#     print(fx)
#     print(my_fila)
# for p in my_fila:
#     print(p)
print(num_elements_to_test('log2',len(my_fila)))
print(num_elements_to_test('log10',len(my_fila)))
print(num_elements_to_test('length',len(my_fila)))
print(my_fila)
my_fila.remove({'casa':7})
print(my_fila)

In [28]:
def num_elements_to_test(option,number):
    
    if option == 'log2':
        return int(round(np.log2(number)))
    elif option == 'log10':
        return int(round(np.log10(number)))
    elif option == 'length':# Attention! The number for length must to be at most length of structure. e.g. array, dic.
        return int(round(number))
    else:
        return print('Choose a valid option!')

### Too noisy (line,col)?

In [30]:
def not_too_noisy(count_presence, C, e_obj, e_att, att_data_dict, E, dimension):
    num_of_atts = len(C[0])
    num_of_objs = len(C[1])
    if dimension == "obj":
        # obj must be present in at least (1-e_obj).||C_a||
        return count_presence >= ((1-e_obj) * num_of_atts) # return true if the obj is not too noisy
    else:
        # col must be present in at least (1-e_tt).||C_o||
        return count_presence >= ((1-e_att) * num_of_objs) # return true if the att is not too noisy

### cost function

In [16]:
def cost_function(numOfObj, numOfAtt, cov=0, noise=0):
    if VERBOSE:
        print('Num. objs: {0:2d}, Num. att: {1:2d}, Num. covered: {2:2d}, Num. noise: {3:2d}'.format(numOfObj,numOfAtt,cov,noise))
#     return ((numOfObj+numOfAtt) - (numOfObj*numOfAtt)) + cov + (2*noise)
    return ((numOfObj+numOfAtt) - (numOfObj*numOfAtt)) + cov + noise

### Sort attributes in dataset

In [None]:
test_dict_freq = {'10':10,'45':45,'65':9,'87':2,'0':100}
sorted_attributes = sort_attributes(test_dict_freq)

In [17]:
def sort_attributes(data_res):
    
    try:
        ##usar este for caso o value seja uma lista
        freq_res_dict = {}
        for key,value in data_res.items():
            freq_res_dict[key] = len(value)

        # Create a list of tuples sorted by index 1 i.e. value field     
        listofTuples = sorted(freq_res_dict.items() , reverse=True, key=lambda x: x[1])# usar se value for lista
        # Iterate over the sorted sequence
        # for elem in listofTuples :
        #     print(elem[0] , " ::" , elem[1] )
    #     print(listofTuples)
        sorted_attributes = [elem[0] for elem in listofTuples]
    except:
        ## este é usado caso value seja um número
        sorted_attributes = {k: v for k, v in sorted(data_res.items(), reverse=True, key=lambda item: item[1])}
    
#     if VERBOSE:
#         print("Sorted att: ",sorted_attributes)
    return sorted_attributes

### Update residual dataset

In [None]:
def update_residual_dataset(res_data, attributes_cocluster, objects_cocluster):
    for key, value in res_data.items():
        if key in attributes_cocluster:
            diff_objs = set(res_data[key]).difference(set(objects_cocluster))
            res_data[key] = list(diff_objs)
    return res_data

In [4]:
format_time_output(3600)

'1h:0m:0s'

In [5]:
def format_time_output(time_in_sec):
    ''' 
    This function converts the seconds for the format Hours:Minutes:Seconds.
    '''
    hours = np.floor((time_in_sec/3600))
    mins = np.floor((time_in_sec - (hours*3600))/60)
    secs = np.floor(time_in_sec%60)
#     print(str(int(hours))+'h:'+str(int(mins))+'m:'+str(int(secs))+'s')
    
    return str(int(hours))+'h:'+str(int(mins))+'m:'+str(int(secs))+'s'

In [46]:
format_time_minutes(123240.456)

2054.01

In [19]:
# this function converts the seconds for the format in minutes
def format_time_minutes(time_in_sec):
    return np.round(time_in_sec/60,2)
#     return time_in_sec/60

### Save results - check path

In [None]:
def check_path(path_method):
    current_dir = os.getcwd()
    print(current_dir)
    res = os.path.exists(path_method)
    # clean the folder to save new data
    if res:
        #check if it is empty
        dir_empty = os.listdir(path_method)
        if len(dir_empty) != 0:
    #         shutil.rmtree("OutputAnalysis/kmeans/")
            rm = !rm -r --preserve-root './OutputAnalysis/ococlus/'*
            if not rm:
                print("OCoClus' folder was cleaned.")
    #             os.chdir(path_method)
            else:
                print("sad")
                print(rm)
        else:
    #         print("Empty!")
            pass
    #         os.chdir(path_method)
    else: # nothing exist so create it
        # trying to insert to flase directory 
        try: 
    #         os.chdir(fd) 
            os.mkdir(path_method)
            print("The path was created: "+path_method)

        # Caching the exception     
        except: 
            print("Something wrong with specified directory. Exception- ", sys.exc_info())

### Save clustering result into a txt file

In [None]:
# def writeFileOutput(cols, rows, dataset, method='OCoClus', fileName='OCoClusResult'):
def writeFileOutput(co_clusters, dataset, method='OCoClus', fileName='OCoClusResult'):
    text = ""
#    for c in range(len(data.rows_)):
#        res = [i for i, val in enumerate(data.columns_[c]) if val]
#        for j in res:
#            text += str(j)+" "

#        res = [i for i, val in enumerate(data.rows_[c]) if val]
#        text += "["
#        for j in res:
#            text += str(j)+" "
#        text += "]\n"
    
    num_of_clusters = len(co_clusters)
    
#     for c in range(len(cols)):
    for c in range(num_of_clusters):
#         for i in cols[c]:
        for i in co_clusters[c][0]: # get the attributes in cluster c
            text += str(i)+" "
        
        text += "("+str(len(co_clusters[c][1]))+") [" # get the number of objects in clusters c
        for j in range(len(co_clusters[c][1])): # save in the file each obj
            if j+1 != len(co_clusters[c][1]):
                text += str(co_clusters[c][1][j])+" "
            else:
                text += str(co_clusters[c][1][j])
        text += "]\n"
    
    #print(text)
    if method == 'Dhillon':
        f = open('./datasets/outputs/'+fileName+'.txt', 'w+')#saving at dataset folder
        f.write(text)
        f.close()
        print("Output file saved in: "+"./datasets/outputs/"+fileName+".txt")
    elif method == 'Kluger':
        f = open('./datasets/outputs/'+fileName+'.txt', 'w+')#saving at dataset folder
        f.write(text)
        f.close()
        print("Output file saved in: "+"./datasets/outputs/"+fileName+".txt")
    elif method == 'OCoClus':
        f = open('./OutputAnalysis/ococlus/'+dataset+'/'+fileName+'.txt', 'w+')#saving at dataset folder
        f.write(text)
        f.close()
        print("Output file saved in: "+"./OutputAnalysis/ococlus/"+dataset+"/"+fileName+".txt")
    else:
        print("The output file was not generated. Method option not recognized.")

# Evaluation measure

### Reconstruction error

In [None]:
def Rec_error(data,clusters):
    '''
    This evaluation measure is computed during the algorithm life time.
    '''
    reconstructed_ococlus = np.zeros(data.shape,dtype=int)
    for nc in range(len(clusters)):
        for i in clusters[nc][1]: # object cluster
            for j in clusters[nc][0]: # attribute cluster
                reconstructed_ococlus[int(i)][int(j)] = 1
    print("Reconstruction error: ",np.sum(np.bitwise_xor(data,reconstructed_ococlus)))

### Omega format

In [None]:
def build_clustering_output_omega(co_clusters):
# def build_clustering_output_omega(rowClusters,columnClusters):
    '''
    Build the clustering output format to use in the omega index evaluation from Remy Cazabet version.
    It is optional and we just present this version as a complementary information. If you are interested,
    check it out on his team work group at https://github.com/isaranto/omega_index.
    '''
    
    num_of_clusters = len(co_clusters)    
    clustering = {}
    
    for nc in range(num_of_clusters):
        rowCluster = co_clusters[nc][1]
        columnCluster = co_clusters[nc][0]
        clustering["c"+str(nc)] = []
        
        for i in rowCluster:
            for j in columnCluster:
                clustering["c"+str(nc)].append(("01"+str(i)+"02"+str(j)))
        
    return clustering

### eXascale Infolab 
We used the xmeasure and OvpNMI project that pushished evaluation measures for overlapping task. We can check it on https://github.com/eXascaleInfolab/xmeasures or https://exascale.info/. Look their project on github to know how to use it.

In [None]:
def xmeasures_format(dict_gt):
    '''
    This function build the xmeasure format to use it on their evaluation measure.
    '''
    newData = []
    for i in range(len(dict_gt)):
#         print(dict_gt['c'+str(i)])
        stringLine = dict_gt['c'+str(i)][0]
        for j in range(1,len(dict_gt['c'+str(i)])):
#             stringLine = stringLine+" "+dict_gt['c'+str(i)][j]
            stringLine += " "+dict_gt['c'+str(i)][j]
        newData.append(stringLine)
    
    return newData