# Install packages

In [1]:
import os

In [2]:
current_path = os.getcwd()
if current_path.split('/')[-1] == 'content':
  !pip install trajminer -q

  from google.colab import drive

  drive.mount('/content/drive')
  print(f"current path:{current_path}. Moving to the project path.")
  os.chdir("drive/MyDrive/Colab Notebooks/mat_tree/")
  print(f"Current path: {os.getcwd()}")
else:
  print("Working locally.")
  print(f"current path:{current_path}.")

Working locally.
current path:g:\Meu Drive\Colab Notebooks\mat_tree\build_similarity_matrix.


# Import libs

In [3]:
import pandas as pd
import numpy as np
import trajminer as tm
from trajminer.similarity import MSM, MUITAS
from trajminer.utils.distance import discrete, euclidean
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# from scipy.spatial.distance import euclidean

# Code

In [4]:
dtypes = {'UF':'string', 'voto':'category', 'orientacaoGoverno':'category', 'anoProposicao':'int32',
          'anoVotacao':'int32', 'tipoProposicao':'category', 'governo':'category', 'parlamentar':'string',
        #   'data':'datetime64[ns]',
           'idVotacao':'string', 'tid':'int64', 'label':'category', 'horaVotacao':'int32', 'diaDaSemanaVotacao':'int32', 'diaDaSemanaVotacaoNome':'category', 'diaDoAno':'int32', 'diaDoMesVotacao':'int32', 'mesVotacao': 'int32', 'delayVotacaoAnos':'int32', 'alinhamento':'category'}
dataset_path = "../datasets/basometro/basometro.csv"
basometro_df = pd.read_csv(dataset_path, encoding='latin-1', dtype=dtypes, parse_dates=['data'])

In [5]:
basometro_df.head(2)

Unnamed: 0,UF,voto,orientacaoGoverno,anoProposicao,anoVotacao,tipoProposicao,governo,parlamentar,data,idVotacao,tid,label,horaVotacao,diaDaSemanaVotacao,diaDaSemanaVotacaoNome,diaDoAno,diaDoMesVotacao,mesVotacao,delayVotacaoAnos,alinhamento
0,PR,Sim,Sim,2002,2003,MSC,Lula 1,Abelardo Lupion,2003-02-25 18:54:00,25-2-2003.18.54.736,5318,DEM,18,1,Tuesday,56,25,2,1,votou com o governo
1,PR,Sim,Sim,2002,2003,MPV,Lula 1,Abelardo Lupion,2003-03-18 19:30:00,18-3-2003.19.30.742,5318,DEM,19,1,Tuesday,77,18,3,1,votou com o governo


In [6]:
print(basometro_df['data'].min(), basometro_df['data'].max())

2003-01-04 19:04:00 2022-12-04 18:56:00


In [7]:
basometro_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 20 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   UF                      1048575 non-null  string        
 1   voto                    1048575 non-null  category      
 2   orientacaoGoverno       1048575 non-null  category      
 3   anoProposicao           1048575 non-null  int32         
 4   anoVotacao              1048575 non-null  int32         
 5   tipoProposicao          1048575 non-null  category      
 6   governo                 1048575 non-null  category      
 7   parlamentar             1048575 non-null  string        
 8   data                    1048575 non-null  datetime64[ns]
 9   idVotacao               1048575 non-null  string        
 10  tid                     1048575 non-null  int64         
 11  label                   1048575 non-null  category      
 12  horaVotacao   

In [8]:
basometro_df.shape

(1048575, 20)

In [9]:
print(f"Number of CPUs: {cpu_count()}")
print(f"Unique TIDs: {basometro_df.tid.nunique()}")
print(f"Unique deputies: {basometro_df.parlamentar.nunique()}")
print(f"First tid: {basometro_df.tid.min()}. Last tid: {basometro_df.tid.max()}")

Number of CPUs: 8
Unique TIDs: 17037
Unique deputies: 1797
First tid: 1. Last tid: 20255


In [10]:
print(basometro_df[basometro_df.tid == 5318].shape)
print(basometro_df[basometro_df.tid == 5319].shape)

(21, 20)
(31, 20)


In [11]:
n_points = 2
aspects = ['UF','governo','orientacaoGoverno','tipoProposicao','label','diaDaSemanaVotacaoNome','alinhamento',
           'diaDoAno','mesVotacao','anoVotacao','delayVotacaoAnos']

display(basometro_df[basometro_df.tid == 5318][aspects].head(n_points))
t1 = basometro_df[basometro_df.tid == 5318][aspects].head(n_points).values

display(basometro_df[basometro_df.tid == 5319][aspects].head(n_points))
t2 = basometro_df[basometro_df.tid == 5319][aspects].head(n_points).values

Unnamed: 0,UF,governo,orientacaoGoverno,tipoProposicao,label,diaDaSemanaVotacaoNome,alinhamento,diaDoAno,mesVotacao,anoVotacao,delayVotacaoAnos
0,PR,Lula 1,Sim,MSC,DEM,Tuesday,votou com o governo,56,2,2003,1
1,PR,Lula 1,Sim,MPV,DEM,Tuesday,votou com o governo,77,3,2003,1


Unnamed: 0,UF,governo,orientacaoGoverno,tipoProposicao,label,diaDaSemanaVotacaoNome,alinhamento,diaDoAno,mesVotacao,anoVotacao,delayVotacaoAnos
410435,PB,Lula 1,Sim,MSC,DEM,Tuesday,votou com o governo,56,2,2003,1
410436,PB,Lula 1,Liberado,MPV,DEM,Wednesday,nao se aplica,57,2,2003,1


# MSM

In [12]:
# test
dist_funcs = [discrete, discrete, discrete, discrete, discrete, discrete, discrete,
              euclidean, euclidean, euclidean, euclidean]
thres = [.5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5]
weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
msm = MSM(dist_funcs, thres, weights)
print(msm.similarity(t1,t2))
print(msm.similarity(t2,t1))

0.7272727272727273
0.7272727272727273


## Building matrix

### Sequential

In [13]:
# aspects = ['UF','governo','orientacaoGoverno','tipoProposicao','label','diaDaSemanaVotacaoNome','alinhamento',
#            'diaDoAno','mesVotacao','anoVotacao','delayVotacaoAnos']
# unique_tids = basometro_df.tid.sort_values().unique().tolist()
# num_tids = len(unique_tids)
# # idx_combination = [(i,j) for i in range(num_tids) for j in range(i+1,num_tids)]
# msm_matrix = np.ones((num_tids,num_tids), dtype=np.float32)

# ### msm setup
# dist_funcs = [discrete, discrete, discrete, discrete, discrete, discrete, discrete,
#               euclidean, euclidean, euclidean, euclidean]
# thres = [.5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5]
# weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# ### calc similarity
# # for idx in tqdm(idx_combination, desc="Computing similaties..."):
# tot_comb = int((num_tids**2)/2)
# counter_ = 0
# for i in range(num_tids):
#     for j in range(i+1, num_tids):
#       counter_ += 1
#       percent_ = np.round((counter_/tot_comb)*100,2)
#       print(f"\rProcessing... {percent_}% | {counter_}/{tot_comb}", end="")
#       # i = idx[0]
#       # j = idx[1]
#       if i != j:
#         t1 = basometro_df[basometro_df.tid == unique_tids[i]][aspects].values
#         t2 = basometro_df[basometro_df.tid == unique_tids[j]][aspects].values
#         msm = MSM(dist_funcs, thres, weights)
#         msm_score = msm.similarity(t1,t2)
#         msm_matrix[i][j] = msm_score
#         msm_matrix[j][i] = msm_score

# df_msm_sim = pd.DataFrame(msm_matrix, columns=unique_tids, index=unique_tids)
# del msm_matrix, unique_tids, t1, t2, msm#, idx_combination

# print(f"Num of unique tids: {num_tids}")
# print(f"Shape: {df_msm_sim.shape}")
# display(df_msm_sim.head(2))
# display(df_msm_sim.tail(2))

# ### save file csv
# dataset_path_output = "datasets/basometro/msm.csv"
# df_msm_sim.to_csv(dataset_path_output, header=True,index=True)
# del df_msm_sim

### Parallel

In [14]:
# aspects = ['UF','governo','orientacaoGoverno','tipoProposicao','label','diaDaSemanaVotacaoNome','alinhamento',
#            'diaDoAno','mesVotacao','anoVotacao','delayVotacaoAnos']
# unique_tids = basometro_df.tid.sort_values().unique().tolist()[:10]
# num_tids = len(unique_tids)
# msm_matrix = np.ones((num_tids,num_tids), dtype=np.int32)

# ### msm setup
# dist_funcs = [discrete, discrete, discrete, discrete, discrete, discrete, discrete,
#               euclidean, euclidean, euclidean, euclidean]
# thres = [.5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5]
# weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# ## for idx_i in tqdm(range(num_tids-1), desc="Processing MSM matriz..."):
# def cal_sim_msm(idx_i):
#     i = idx_i
#     j = idx_i+1
#     t1 = basometro_df[basometro_df.tid == unique_tids[i]][aspects].values
#     t2 = basometro_df[basometro_df.tid == unique_tids[j]][aspects].values
#     msm = MSM(dist_funcs, thres, weights)
#     # msm_score = msm.similarity(t1,t2)
#     # msm_matrix[i][j] = msm_score
#     # msm_matrix[j][i] = msm_score
#     return [i,j,msm.similarity(t1,t2)]

# if __name__ == "__main__":
#     with Pool(7) as pool:
#         result = pool.map(cal_sim_msm, range(num_tids-1))
#     print(f"Process DONE.")

In [15]:
%%writefile parallel_similarity_basometro.py
import pandas as pd
import numpy as np
import trajminer as tm
from trajminer.similarity import MSM, MUITAS
from trajminer.utils.distance import discrete, euclidean

dtypes = {'UF':'string', 'voto':'category', 'orientacaoGoverno':'category', 'anoProposicao':'int32',
          'anoVotacao':'int32', 'tipoProposicao':'category', 'governo':'category', 'parlamentar':'string',
        #   'data':'datetime64[ns]',
           'idVotacao':'string', 'tid':'int64', 'label':'category', 'horaVotacao':'int32', 'diaDaSemanaVotacao':'int32', 'diaDaSemanaVotacaoNome':'category', 'diaDoAno':'int32', 'diaDoMesVotacao':'int32', 'mesVotacao': 'int32', 'delayVotacaoAnos':'int32', 'alinhamento':'category'}
dataset_path = "../datasets/basometro/basometro.csv"
basometro_df = pd.read_csv(dataset_path, encoding='latin-1', dtype=dtypes, parse_dates=['data'])

aspects = ['UF','governo','orientacaoGoverno','tipoProposicao','label','diaDaSemanaVotacaoNome','alinhamento',
           'diaDoAno','mesVotacao','anoVotacao','delayVotacaoAnos']
unique_tids = basometro_df.tid.sort_values().unique().tolist()
num_tids = len(unique_tids)

### msm setup
dist_funcs = [discrete, discrete, discrete, discrete, discrete, discrete, discrete,
              euclidean, euclidean, euclidean, euclidean]
thres = [.5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5]
weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

def process_msm(idx):
    i = idx[0]
    j = idx[1]
    t1 = basometro_df[basometro_df.tid == unique_tids[i]][aspects].values
    t2 = basometro_df[basometro_df.tid == unique_tids[j]][aspects].values
    msm = MSM(dist_funcs, thres, weights)
    return [unique_tids[i],unique_tids[j],msm.similarity(t1,t2)]

Overwriting parallel_similarity_basometro.py


In [16]:
from parallel_similarity_basometro import process_msm

aspects = ['UF','governo','orientacaoGoverno','tipoProposicao','label','diaDaSemanaVotacaoNome','alinhamento',
           'diaDoAno','mesVotacao','anoVotacao','delayVotacaoAnos']
unique_tids = basometro_df.tid.sort_values().unique().tolist()
header_cols = [str(x).strip() for x in unique_tids]
num_tids = len(unique_tids)
# idx_combination = [(i,j) for i in range(num_tids) for j in range(i+1,num_tids)]
# num_comb = len(idx_combination)
# num_comb = int(((num_tids**2)/2))
num_comb = num_tids**2
counter_ = 0
msm_matrix = np.ones((1,num_tids), dtype=np.float32)
# max_pool = cpu_count()
max_pool = 6
print(f"max_pool: {max_pool}")

if __name__ == "__main__":
    with Pool(max_pool) as p:
        ### block to restart from the last tid
        try:
          tmp = pd.read_csv("../datasets/basometro/msm.csv")
          num_saved_tids = tmp.shape[0]
          counter_ = num_saved_tids*num_tids
          last_pos = num_saved_tids
          last_tid = tmp.index[-1]
          print(f"Number of stored tids:{num_saved_tids}, idx_pos: {last_pos}, last_tid: {last_tid}")
        except:
          last_pos = 0
          print(f"First row run.")

        for i in range(last_pos,num_tids):
          i_j_combination = [(i,j) for j in range(num_tids)]

          counter_ += len(i_j_combination)
          percent_ = np.round((counter_/num_comb)*100,2)
          print(f"\rProcessing: {percent_}% | ({counter_}/{num_comb})", end="")

          pool_outputs = list(
              tqdm(
                  p.imap(process_msm,
                      i_j_combination
                      ),
                  total=len(i_j_combination),
                  desc="\r"
              )
          )

          new_row = [pool_outputs[0][0]]
          for idx, score in enumerate(pool_outputs):
            # print(idx,score)
            new_row.append(score[2])
            # msm_matrix[0][idx] = score[2]

          if i == 0:
            # pd.DataFrame(msm_matrix, columns=['index']+header_cols, dtype='float32') \
            pd.DataFrame(np.array(new_row).reshape((1,num_tids+1)), columns=['index']+header_cols, dtype='float16') \
              .to_csv("../datasets/basometro/msm.csv", header=True, index=False, index_label=['index'])
          else:
            df_read = pd.read_csv("../datasets/basometro/msm.csv")
            # df_tmp = pd.DataFrame(msm_matrix, columns=['index']+header_cols, dtype='float32')
            df_tmp = pd.DataFrame(np.array(new_row).reshape((1,num_tids+1)), columns=['index']+header_cols, dtype='float16')
            df_concat = pd.concat([df_read, df_tmp])#, ignore_index=True)
            df_concat.to_csv("../datasets/basometro/msm.csv", header=True, index=False, index_label=['index'])
            del df_read, df_concat
          print("\r",end='\r', flush=True)

        # pool_outputs = list(
        #     tqdm(
        #         p.imap(process_msm,
        #             # idx_combination
        #             [(i,j) for i in range(num_tids) for j in range(i+1,num_tids)]
        #             ),
        #         total=num_comb,
        #         desc="Processing"
        #     )
        # )

    # del idx_combination
    # print(pool_outputs[:5])

max_pool: 6
Number of stored tids:1106, idx_pos: 1106, last_tid: 1105
Processing: 6.5% | (18859959/290259369)

: 100%|██████████| 17037/17037 [27:59<00:00, 10.15it/s]  


Processing: 6.5% | (18876996/290259369)

: 100%|██████████| 17037/17037 [16:11<00:00, 17.53it/s]


Processing: 6.51% | (18894033/290259369)

: 100%|██████████| 17037/17037 [23:47<00:00, 11.94it/s]  


Processing: 6.52% | (18911070/290259369)

: 100%|██████████| 17037/17037 [15:11<00:00, 18.70it/s]


Processing: 6.52% | (18928107/290259369)

: 100%|██████████| 17037/17037 [09:26<00:00, 30.08it/s] 


Processing: 6.53% | (18945144/290259369)

: 100%|██████████| 17037/17037 [19:50<00:00, 14.31it/s]  


Processing: 6.53% | (18962181/290259369)

: 100%|██████████| 17037/17037 [02:56<00:00, 96.73it/s] 


Processing: 6.54% | (18979218/290259369)

: 100%|██████████| 17037/17037 [18:52<00:00, 15.04it/s]  


Processing: 6.54% | (18996255/290259369)

: 100%|██████████| 17037/17037 [13:32<00:00, 20.98it/s]


Processing: 6.55% | (19013292/290259369)

: 100%|██████████| 17037/17037 [11:19<00:00, 25.08it/s] 


Processing: 6.56% | (19030329/290259369)

: 100%|██████████| 17037/17037 [19:36<00:00, 14.48it/s]  


Processing: 6.56% | (19047366/290259369)

: 100%|██████████| 17037/17037 [25:45<00:00, 11.03it/s]  


Processing: 6.57% | (19064403/290259369)

: 100%|██████████| 17037/17037 [21:02<00:00, 13.50it/s]  


Processing: 6.57% | (19081440/290259369)

: 100%|██████████| 17037/17037 [21:17<00:00, 13.34it/s]  


Processing: 6.58% | (19098477/290259369)

: 100%|██████████| 17037/17037 [26:12<00:00, 10.83it/s]  


Processing: 6.59% | (19115514/290259369)

: 100%|██████████| 17037/17037 [13:31<00:00, 20.99it/s]


Processing: 6.59% | (19132551/290259369)

: 100%|██████████| 17037/17037 [16:20<00:00, 17.38it/s]


Processing: 6.6% | (19149588/290259369)

: 100%|██████████| 17037/17037 [25:50<00:00, 10.99it/s]  


Processing: 6.6% | (19166625/290259369)

: 100%|██████████| 17037/17037 [25:24<00:00, 11.17it/s]  


Processing: 6.61% | (19183662/290259369)

: 100%|██████████| 17037/17037 [12:41<00:00, 22.37it/s]


Processing: 6.62% | (19200699/290259369)

: 100%|██████████| 17037/17037 [18:50<00:00, 15.07it/s]  


Processing: 6.62% | (19217736/290259369)

: 100%|██████████| 17037/17037 [24:10<00:00, 11.75it/s]  


Processing: 6.63% | (19234773/290259369)

: 100%|██████████| 17037/17037 [26:57<00:00, 10.54it/s]  


Processing: 6.63% | (19251810/290259369)

: 100%|██████████| 17037/17037 [15:39<00:00, 18.12it/s]


Processing: 6.64% | (19268847/290259369)

: 100%|██████████| 17037/17037 [15:32<00:00, 18.27it/s]


Processing: 6.64% | (19285884/290259369)

: 100%|██████████| 17037/17037 [15:12<00:00, 18.68it/s]


Processing: 6.65% | (19302921/290259369)

: 100%|██████████| 17037/17037 [22:33<00:00, 12.59it/s]  


Processing: 6.66% | (19319958/290259369)

: 100%|██████████| 17037/17037 [27:50<00:00, 10.20it/s]  


Processing: 6.66% | (19336995/290259369)

: 100%|██████████| 17037/17037 [21:24<00:00, 13.27it/s]  


Processing: 6.67% | (19354032/290259369)

: 100%|██████████| 17037/17037 [18:38<00:00, 15.24it/s]  


Processing: 6.67% | (19371069/290259369)

: 100%|██████████| 17037/17037 [11:30<00:00, 24.66it/s] 


Processing: 6.68% | (19388106/290259369)

: 100%|██████████| 17037/17037 [22:58<00:00, 12.36it/s]  


Processing: 6.69% | (19405143/290259369)

:  50%|█████     | 8579/17037 [01:39<01:38, 85.82it/s] 


KeyboardInterrupt: 

In [None]:
# for msm_score in pool_outputs:
#     msm_matrix[msm_score[0]][msm_score[1]] = msm_score[2]
#     msm_matrix[msm_score[1]][msm_score[0]] = msm_score[2]

# df_msm_sim = pd.DataFrame(msm_matrix, columns=unique_tids, index=unique_tids)
# del msm_matrix,unique_tids

# print(f"Num of unique tids: {num_tids}")
# print(f"Shape: {df_msm_sim.shape}")
# display(df_msm_sim.head(2))
# display(df_msm_sim.tail(2))

# ### save file csv
# dataset_path_output = "datasets/basometro/msm.csv"
# df_msm_sim.to_csv(dataset_path_output, header=True,index=True)
# del df_msm_sim

# MUITAS