### Initialization

In [None]:
# The PoPPy package (with the directories it needs) is already in the `PoPPy` folder; if you prefer, you can clone the original repo instead:

#!git clone https://github.com/HongtengXu/PoPPy.git

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.notebook import tqdm

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader


# Import PoPPy from inside its own folder, then return to the notebook root.
# The try/finally restores the working directory even when an import fails,
# so a failed run does not leave the kernel stranded inside 'PoPPy' (which
# would make the next os.chdir('PoPPy') fail on re-run).
_notebook_root = os.getcwd()
os.chdir('PoPPy')
try:
    import dev.util as util
    from model.MixHawkesProcess import MixHawkesProcessModel
    from preprocess.DataIO import load_sequences_csv
    from preprocess.DataOperation import data_info, EventSampler, enumerate_all_events
finally:
    os.chdir(_notebook_root)

### Loading Data

Load the dataset: read every per-user CSV file into a single dataframe (saved as `all_users.csv`) and load the table of true cluster labels from `clusters.csv`.

In [None]:
path_to_data = 'data/'
dataset_name = 'IPTV'
path_to_dataset = path_to_data+dataset_name

# Sorted so that the row order of `df` is reproducible between runs;
# os.listdir returns entries in arbitrary order.
all_files_in_datafolder = sorted(os.listdir(path_to_dataset))

# Per-file frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies all previous rows on every file.
user_frames = []

for file in all_files_in_datafolder:

    # skipping not relevant files
    if file in ('all_users.csv', 'info.json') or '(' in file:
        continue

    if file == 'clusters.csv':
        # getting dataset with true labels; the first column is renamed to
        # 'file_name', the second column keeps its original name
        true_clust = pd.read_csv(f'{path_to_dataset}/clusters.csv')
        true_clust.columns = ['file_name', true_clust.columns[1]]
        continue

    # every remaining file is read as '<integer>.csv'; the integer is stored
    # in the 'file_name' column of each of its rows
    df_loc = pd.read_csv(f'{path_to_dataset}/{file}')
    df_loc['file_name'] = int(file.replace('.csv', ''))
    # drop the first column of the file
    df_loc = df_loc.iloc[:, 1:]

    user_frames.append(df_loc)

# pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(user_frames) if user_frames else pd.DataFrame([])

df.to_csv(f'{path_to_dataset}/all_users.csv', index=None)

In [None]:
# Only the last expression of a cell is rendered automatically, so both
# previews are shown explicitly (`display` is provided by the IPython kernel).
display(df.head(), true_clust.head())

### DataLoader

Using dataloader from PoPPy model

In [None]:
# hyper-parameters
seed = 1
epochs = 1
batch_size = 128
memory_size = 3
# number of distinct labels in the true clustering
num_cluster = true_clust['cluster_id'].nunique()

# CUDA is requested, but only used when a device is actually available
use_cuda = True
if not torch.cuda.is_available():
    use_cuda = False

# seed the CPU generator, and the CUDA generator when CUDA is in use
torch.manual_seed(seed)

# extra DataLoader arguments, only set when running on CUDA
kwargs = {}
if use_cuda:
    torch.cuda.manual_seed(seed)
    kwargs = {'num_workers': 1, 'pin_memory': True}

In [None]:
# Load the event sequences from the merged CSV and wrap them in DataLoaders.

# Names of the columns in your dataset that hold the sequence id, the time
# and the event; adapt the values to your own dataset.
domain_names = dict(seq_id='id', time='time', event='event')

# path to the dataset with all ids in one file
all_users_csv = f'{path_to_dataset}/all_users.csv'
database = load_sequences_csv(all_users_csv, domain_names=domain_names)
data_info(database)

# sample batches from database: both loaders read the same database, each
# through its own EventSampler, with identical batching options
loader_options = dict(batch_size=batch_size, shuffle=True, **kwargs)
trainloader = DataLoader(EventSampler(database=database, memorysize=memory_size),
                         **loader_options)
validloader = DataLoader(EventSampler(database=database, memorysize=memory_size),
                         **loader_options)

In [None]:
# initialize model
num_type = len(database['type2idx'])

# component configurations handed to the mixture model
mu_dict = dict(model_name='NaiveExogenousIntensity',
               parameter_set=dict(activation='identity'))
alpha_dict = dict(model_name='NaiveEndogenousImpact',
                  parameter_set=dict(activation='identity'))

# 2x1 float32 kernel parameter tensor: entry [0, 0] is 0, entry [1, 0] is 0.5
kernel_para = torch.tensor([[0.0], [0.5]], dtype=torch.float32)
kernel_dict = dict(model_name='GateKernel', parameter_set=kernel_para)

model_arguments = dict(num_type=num_type,
                       num_cluster=num_cluster,
                       num_sequence=len(database['seq2idx']),
                       mu_dict=[mu_dict],
                       alpha_dict=[alpha_dict],
                       kernel_dict=[kernel_dict],
                       activation='identity',
                       use_cuda=use_cuda)
mixhawkes_model = MixHawkesProcessModel(**model_arguments)

### Training

In [None]:
# initialize optimizer
optimizer = optim.Adam(mixhawkes_model.lambda_model.parameters(), lr=0.01)
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.8)

# train model
mixhawkes_model.fit(trainloader, optimizer, epochs, scheduler=scheduler,
                    sparsity=100, nonnegative=0, use_cuda=use_cuda, validation_set=validloader)

# save model
# The two snapshots go to different files. Writing both to 'DMHP_1.pt' let the
# mode='parameter' snapshot overwrite the mode='entire' one, although
# 'DMHP_1.pt' is later loaded with mode='entire'.
model_dir = 'PoPPy/{}'.format(util.OUTPUT_DIR)
mixhawkes_model.save_model('{}/DMHP_1.pt'.format(model_dir), mode='entire')
mixhawkes_model.save_model('{}/DMHP_1_param.pt'.format(model_dir), mode='parameter')

### Predicting clusters

In [None]:
# load model (works only if CUDA is available: odd things from PoPPy repo)
mixhawkes_model.load_model(f'PoPPy/{util.OUTPUT_DIR}/DMHP_1.pt', mode='entire')

# r - responsibility matrix with probabilities; the predicted cluster of a
# row is the column with the largest value
r = mixhawkes_model.responsibility
clusters_prediction = np.argmax(r.detach().cpu().numpy(), axis=1)

# One bin per cluster id, so bars line up with integer labels.
# NOTE(review): assumes r has num_cluster columns - confirm against PoPPy.
fig, ax = plt.subplots()
ax.hist(clusters_prediction, bins=np.arange(num_cluster + 1) - 0.5)
ax.set(title='Predicted cluster sizes', xlabel='cluster id', ylabel='number of sequences')
plt.show()

# map every sequence id (as float) to its predicted cluster and attach the
# prediction to each event row through the 'id' column
clusters_predicted_users = {float(database['idx2seq'][i]): cl
                            for i, cl in enumerate(clusters_prediction)}
df['clust_pred'] = df['id'].apply(lambda seq_id: clusters_predicted_users[seq_id])

# The file name carries the number of clusters actually used instead of a
# hard-coded 13; the output folder is created when it does not exist yet.
os.makedirs(util.OUTPUT_DIR, exist_ok=True)
df.to_csv(f'{util.OUTPUT_DIR}/cluster_prediction_cluster{num_cluster}.csv')


### Purity calculation

In [None]:
from tablesplots.get_metrics import purity

# One predicted label per user file. Cluster labels are categorical, so the
# most frequent label of the file is used; a mean would produce a meaningless
# fractional label whenever the rows of one file disagree.
df_gr = (df.groupby('file_name')['clust_pred']
           .agg(lambda labels: labels.mode().iloc[0])
           .reset_index())

# Drop a 'clust_pred' column left over from a previous run of this cell, so
# re-running does not create suffixed duplicate columns in the merge.
true_clust = pd.merge(true_clust.drop(columns=['clust_pred'], errors='ignore'),
                      df_gr, on='file_name')
# explicit display: only the last expression of a cell is rendered
# automatically (`display` is provided by the IPython kernel)
display(true_clust.head())

cl_pred = torch.Tensor(true_clust['clust_pred'].values)
cl_true = torch.Tensor(true_clust['cluster_id'].values)
purity(cl_true, cl_pred)