In [1]:
import os,sys,inspect
# Put the parent of the notebook's working directory on sys.path so that the
# project packages imported in the next cell can be resolved (presumably the
# repository root -- confirm against the project layout).
#
# The kernel's working directory is used as the anchor instead of
# inspect.getfile(inspect.currentframe()): cell code has no real source file,
# and depending on the ipykernel version the reported file name can live in a
# temporary directory, which would put an unrelated directory on sys.path.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guarded so that re-running the cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

In [2]:
import pandas as pd
import numpy as np
from copy import copy
import operator
from copy import copy
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error
from external.libpgm.sampleaggregator import SampleAggregator
from preprocess.discretization import get_nodes_type
from external.libpgm.hybayesiannetwork import HyBayesianNetwork
from typing import Tuple
from bayesian.train_bn import structure_learning, parameter_learning
from bayesian.save_bn import save_structure, save_params, read_structure, read_params

from bayesian.calculate_accuracy import calculate_acc
from bayesian.train_bn import structure_learning, parameter_learning
from preprocess.discretization import get_nodes_type, discretization, code_categories

from bayesian.save_bn import save_structure, save_params, read_structure, read_params

from joblib import Parallel, delayed
import cpuinfo


In [3]:
def parall_accuracy(bn: HyBayesianNetwork, data: pd.DataFrame, columns: list, method: str, parall_count: int = 1, normed: bool = True) -> Tuple[dict, dict, list, list, list]:
    """Evaluate per-column prediction quality of ``bn`` row by row, in parallel.

    Every row of ``data`` is passed as a one-row DataFrame to ``calculate_acc``
    through joblib; the per-row results are then merged and scored.

    Parameters
    ----------
    bn : HyBayesianNetwork
        Network forwarded unchanged to ``calculate_acc``.
    data : pd.DataFrame
        Rows to evaluate. Node types are taken from ``get_nodes_type(data)``.
    columns : list
        Column names to score; the order defines the order of the returned lists.
    method : str
        Forwarded unchanged to ``calculate_acc``.
    parall_count : int, default 1
        Passed to joblib as ``n_jobs``.
    normed : bool, default True
        If True, the RMSE of each 'cont' column is divided by the range
        (max - min) of its real values.

    Returns
    -------
    accuracy_dict : dict
        ``{column: accuracy}`` rounded to 2 decimals, for 'disc' columns.
    rmse_dict : dict
        ``{column: rmse}`` rounded to 2 decimals, for 'cont' columns.
    real_param, pred_param : list of list
        ``[column][row]`` matrices of real and predicted values.
    indexes : list
        Row positions (0-based, in ``data`` order), one entry per item that
        ``calculate_acc`` reported in its own ``indexes`` for that row.

    Raises
    ------
    Exception
        From the inner wrapper if it receives anything but a one-row frame.
    """
    def wrapper(bn: HyBayesianNetwork, data: pd.DataFrame, columns: list, method: str, normed: bool = True):
        # calculate_acc returns one value list per column; with a single row
        # each of those lists has one element, hence the el[0] unpacking.
        if len(data) != 1:
            raise Exception('Wrapper for one row from pandas.DataFrame')
        _, _, real_param, pred_param, indexes = calculate_acc(bn, data, columns, method, normed)
        return {'real_param': [el[0] for el in real_param], 'pred_param': [el[0] for el in pred_param], 'indexes': indexes}

    n_rows = data.shape[0]
    n_cols = len(columns)
    node_type = get_nodes_type(data)

    # Slice by position, not by label: results are consumed positionally below,
    # and a label lookup would return several rows for duplicated index labels.
    processed_list = Parallel(n_jobs=parall_count)(
        delayed(wrapper)(bn, data.iloc[[pos]], columns, method, normed) for pos in range(n_rows)
    )

    # Transpose the per-row results into [column][row] matrices.
    real_param = [[row_res['real_param'][n] for row_res in processed_list] for n in range(n_cols)]
    pred_param = [[row_res['pred_param'][n] for row_res in processed_list] for n in range(n_cols)]

    # Record each row position once per reported item. This is done per row,
    # not per column, so entries are not multiplied by len(columns).
    indexes = []
    for pos, row_res in enumerate(processed_list):
        indexes.extend([pos] * len(row_res['indexes']))

    accuracy_dict = dict()
    rmse_dict = dict()
    for n, key in enumerate(columns):
        if node_type[key] == 'disc':
            accuracy_dict[key] = round(accuracy_score(real_param[n], pred_param[n]), 2)
        elif node_type[key] == 'cont':
            # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
            # `squared` keyword is deprecated/removed in recent scikit-learn.
            rmse = np.sqrt(mean_squared_error(real_param[n], pred_param[n]))
            if normed:
                # NOTE: a constant column has zero range, so the normalized
                # value is not meaningful in that case.
                rmse = rmse / (np.max(real_param[n]) - np.min(real_param[n]))
            rmse_dict[key] = round(rmse, 2)

    return accuracy_dict, rmse_dict, real_param, pred_param, indexes

In [15]:
# Load the socio dataset, keep every column, drop rows with missing values and
# renumber the remaining rows from zero.
geo = pd.read_csv('../data/socio.csv')
# Alternative explicit column list, kept from an earlier experiment:
#columns = ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', 'Hydrocarbon type', 'Gross','Netpay','Porosity','Permeability', 'Depth']
columns = geo.columns.tolist()
geo = geo[columns].dropna().reset_index(drop=True)
# Uncomment to work on the first 100 rows only:
#geo = geo[:100]

In [16]:
# Convert every value of these columns to its string form, one column at a time.
for _col in ('sex', 'relation', 'is_parent', 'has_pets', 'age'):
    geo[_col] = geo[_col].apply(str)

In [17]:
# Map every column of `geo` to its node type as reported by get_nodes_type;
# the cells below test these values against 'disc'.
geo_types = get_nodes_type(geo)
# Bare expression so the notebook displays the mapping.
geo_types

{'sex': 'disc',
 'relation': 'disc',
 'is_parent': 'disc',
 'has_pets': 'disc',
 'age': 'disc',
 'mean_tr': 'cont',
 'median_tr': 'cont',
 'tr_per_month': 'cont'}

In [18]:
# Split the column names by node type: 'disc' columns go to colums_for_code,
# every other column goes to columns_for_disc. Column order is preserved.
colums_for_code, columns_for_disc = [], []
for c in columns:
    (colums_for_code if geo_types[c] == 'disc' else columns_for_disc).append(c)

In [19]:
# Encode the 'disc' columns of `geo` with the 'label' scheme; code_categories
# returns the transformed frame together with the fitted coder.
geo_coded, label_coder = code_categories(geo, 'label', colums_for_code)
# Discretize the remaining (non-'disc') columns of the encoded frame using the
# 'equal_frequency' strategy; this frame is what structure learning consumes.
geo_discrete, coder = discretization(geo_coded, 'equal_frequency', columns_for_disc)
# Same discretization applied to the un-encoded frame. NOTE(review): neither
# geo_only_discrete nor discrete_coder is referenced by the later visible cells.
geo_only_discrete, discrete_coder = discretization(geo, 'equal_frequency', columns_for_disc)

In [21]:
# Base name for the saved artifacts; parameters are stored under f'{path}_param'.
path = 'parall_experiment'
# Structure is learned from the fully discretized frame with the 'HC' and 'K2'
# options (presumably hill climbing with a K2 score -- confirm in bayesian.train_bn).
bn_geo = structure_learning(geo_discrete, 'HC', geo_types, 'K2', cont_disc = False)
# Parameters are learned from `geo` (the string-cast frame), not from geo_discrete.
params = parameter_learning(geo, geo_types, bn_geo, 'simple')
# Round-trip through storage: save both artifacts, read them back, and build the
# network from the reloaded copies (this rebinds `params` to the reloaded value).
save_structure(bn_geo, path)
save_params(params, f'{path}_param')
skel = read_structure(path)
params = read_params(f'{path}_param')
bn = HyBayesianNetwork(skel, params)


In [22]:
# Use one worker fewer than the CPU count reported by cpuinfo, but never
# fewer than one.
cpu_count = max(cpuinfo.get_cpu_info()['count'] - 1, 1)
# Score every column of `geo` with the 'simple' method and normalized RMSE.
(accuracy_dict, rmse_dict, real_param, pred_param, indexes) = parall_accuracy(
    bn, geo, columns, 'simple',
    parall_count=cpu_count,
    normed=True,
)

In [23]:
# Display the per-column accuracy returned by parall_accuracy for 'disc' columns.
accuracy_dict

{'sex': 0.64,
 'relation': 0.23,
 'is_parent': 0.72,
 'has_pets': 0.95,
 'age': 0.06}

In [24]:
# Display the per-column RMSE returned by parall_accuracy for 'cont' columns
# (range-normalized, since the call above passed normed = True).
rmse_dict

{'mean_tr': 0.12, 'median_tr': 0.05, 'tr_per_month': 0.12}

In [200]:
# Display the row positions collected in `indexes` by parall_accuracy.
# NOTE(review): this prints the whole list; the execution count of this cell
# (200) is out of sequence with the cells above, so the recorded output may
# come from a different run.
indexes

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
