In [1]:
import os,sys,inspect
# Make the repository root (the parent of this notebook's folder) importable so
# the project packages imported in the next cell can be resolved.
#
# The working directory is used instead of inspecting the current frame: inside
# a notebook cell the frame's "file" is a kernel-generated pseudo-filename, which
# newer ipykernel versions place in a temporary directory, so deriving the
# notebook folder from it is unreliable.
# Assumes the kernel's working directory is the notebook's folder -- the same
# assumption the relative '../data/...' path used when loading the CSV makes.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

In [2]:
import pandas as pd
import numpy as np
from bayesian.train_bn import structure_learning, parameter_learning, parameter_learning_mix, n_component
from preprocess.discretization import get_nodes_type, discretization, inverse_discretization, code_categories, get_nodes_sign
from bayesian.save_bn import save_structure, save_params, read_structure, read_params
from bayesian.sampling import generate_synthetics
from external.libpgm.hybayesiannetwork import HyBayesianNetwork
from visualization.visualization import draw_BN
from bayesian.calculate_accuracy import calculate_acc
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from copy import copy
from external.libpgm.sampleaggregator import SampleAggregator
import operator
from sklearn.metrics import accuracy_score, mean_squared_error
from scipy import stats
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
import math
from pomegranate import DiscreteDistribution
import random
from sklearn.model_selection import train_test_split

In [3]:
# Columns used in the experiment; every other CSV column is discarded.
columns = ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', 'Hydrocarbon type', 'Gross','Netpay','Porosity','Permeability', 'Depth']

# Load the data, keep only the selected columns, drop rows with any missing
# value and renumber the rows 0..n-1 (later cells address rows by position).
# Written as one chained expression rather than in-place mutation so the cell
# is idempotent and does not call dropna(inplace=True) on a column-subset frame,
# which triggers pandas' chained-assignment warning.
geo = (
    pd.read_csv('../data/hackathon_processed.csv')[columns]
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# Classify every column of `geo` by node type; the displayed result maps each
# column name to either 'disc' or 'cont'.
geo_types = get_nodes_type(geo)
geo_types

{'Tectonic regime': 'disc',
 'Period': 'disc',
 'Lithology': 'disc',
 'Structural setting': 'disc',
 'Hydrocarbon type': 'disc',
 'Gross': 'cont',
 'Netpay': 'cont',
 'Porosity': 'cont',
 'Permeability': 'cont',
 'Depth': 'cont'}

In [5]:
# Sign label per column; the displayed result lists only the 'cont' columns,
# all labelled 'pos'. It is later passed to generate_synthetics.
# NOTE(review): presumably 'pos' marks columns whose values are all positive --
# confirm in preprocess.discretization.get_nodes_sign.
geo_signs = get_nodes_sign(geo)
geo_signs

{'Gross': 'pos',
 'Netpay': 'pos',
 'Porosity': 'pos',
 'Permeability': 'pos',
 'Depth': 'pos'}

In [6]:
# Split the selected columns by node type: 'disc' columns are collected for
# category encoding, all remaining columns for discretization.
colums_for_code, columns_for_disc = [], []
for c in columns:
    (colums_for_code if geo_types[c] == 'disc' else columns_for_disc).append(c)

In [7]:
# Encode the 'disc' columns of `geo` using the 'label' method; the call returns
# the transformed frame together with a coder object.
# NOTE(review): presumably label_coder can map codes back to the original
# categories -- confirm in preprocess.discretization.code_categories.
geo_coded, label_coder = code_categories(geo, 'label', colums_for_code)
# Discretize the 'cont' columns of the encoded frame with the 'equal_frequency'
# method. geo_discrete is the frame the leave-one-out cell feeds to structure_learning.
geo_discrete, coder = discretization(geo_coded, 'equal_frequency', columns_for_disc)
# Same discretization applied to the un-encoded frame (categorical columns keep
# their original values). Not referenced again in the visible part of the notebook.
geo_only_discrete, discrete_coder = discretization(geo, 'equal_frequency', columns_for_disc)

In [39]:
# Leave-one-out validation of the hybrid Bayesian network.
#
# For every row i: learn the structure on the discretized data and the
# parameters on the raw data, both without row i, then for every column predict
# that column's value for row i from the remaining columns of the row.
#
# Results:
#   pred_param[n][i] / real_param[n][i] -- predicted / true value of columns[n]
#       for row i (both stay 0 when no prediction could be produced);
#   accuracy_dict -- accuracy per 'disc' column;
#   rmse_dict     -- RMSE per 'cont' column;
#   failed_predictions -- (row, column, error) for every failed prediction.
accuracy_dict = dict()
rmse_dict = dict()
n_rows = geo.shape[0]
pred_param = [[0 for j in range(n_rows)] for i in range(len(columns))]
real_param = [[0 for j in range(n_rows)] for i in range(len(columns))]
# predicted[n][i] is True once a prediction was stored for (columns[n], row i);
# lets the metrics ignore the 0 placeholders left by failed predictions, which
# would otherwise be scored as perfect matches.
predicted = [[False] * n_rows for _ in columns]
failed_predictions = []

for i in range(n_rows):
    test = dict(geo.iloc[i, :])
    train_data = geo_discrete.drop(index=i).reset_index(drop=True)
    param_train = geo.drop(index=i).reset_index(drop=True)
    bn = structure_learning(train_data, 'HC', geo_types, 'K2')
    params = parameter_learning(param_train, geo_types, bn)
    # The network is built from the saved-and-reloaded structure and parameters.
    save_structure(bn, 'all_net')
    skel = read_structure('all_net')
    save_params(params, 'all_net_param')
    params = read_params('all_net_param')
    all_bn = HyBayesianNetwork(skel, params)
    for n, key in enumerate(columns):
        # Evidence is the held-out row without the column being predicted.
        evidence = {name: value for name, value in test.items() if name != key}
        try:
            sample = generate_synthetics(all_bn, geo_signs, evidence=evidence)
            if geo_types[key] == 'disc':
                # Most frequent sampled category; ties resolve to the first
                # category in sorted group order.
                prediction = sample.groupby(key)[key].count().idxmax()
            else:
                # 'cont' column: predict the mean of the sampled values.
                prediction = np.mean(sample[key].values)
        except Exception as err:
            # Best effort: one failed prediction must not abort the whole run,
            # but it is recorded instead of being silently discarded.
            # KeyboardInterrupt is deliberately not caught.
            failed_predictions.append((i, key, repr(err)))
            continue
        pred_param[n][i] = prediction
        real_param[n][i] = test[key]
        predicted[n][i] = True

for n, key in enumerate(columns):
    real_values = [r for r, ok in zip(real_param[n], predicted[n]) if ok]
    pred_values = [p for p, ok in zip(pred_param[n], predicted[n]) if ok]
    if not real_values:
        # No successful prediction for this column, so no metric can be computed.
        continue
    if geo_types[key] == 'disc':
        accuracy_dict[key] = round(accuracy_score(real_values, pred_values), 2)
    else:
        # RMSE as sqrt(MSE): the `squared=False` argument is deprecated and
        # no longer accepted by recent scikit-learn releases.
        rmse_dict[key] = round(np.sqrt(mean_squared_error(real_values, pred_values)), 2)

if failed_predictions:
    print(f'{len(failed_predictions)} predictions failed; see failed_predictions')

{'V': ['Tectonic regime',
  'Period',
  'Lithology',
  'Structural setting',
  'Hydrocarbon type',
  'Gross',
  'Netpay',
  'Porosity',
  'Permeability',
  'Depth'],
 'E': [['Hydrocarbon type', 'Tectonic regime'],
  ['Hydrocarbon type', 'Period'],
  ['Hydrocarbon type', 'Lithology'],
  ['Period', 'Lithology'],
  ['Structural setting', 'Lithology'],
  ['Hydrocarbon type', 'Structural setting'],
  ['Hydrocarbon type', 'Gross'],
  ['Lithology', 'Gross'],
  ['Porosity', 'Netpay'],
  ['Gross', 'Porosity'],
  ['Tectonic regime', 'Porosity'],
  ['Gross', 'Permeability'],
  ['Period', 'Depth'],
  ['Lithology', 'Depth'],
  ['Netpay', 'Depth']]}