In [7]:
import pandas as pd
from data_process.preprocessing import discretization, get_nodes_type, code_categories
from block_learning.train_bn import structure_learning, parameter_learning
from block_learning.partial_bn_train import connect_partial_bn
from libpgm.hybayesiannetwork import HyBayesianNetwork
from libpgm.sampleaggregator import SampleAggregator
import seaborn as sns
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from block_learning.save_bn import save_structure, save_params
from block_learning.read_bn import read_structure, read_params
from kmodes.kmodes import KModes

In [8]:
# Read the prepared dataset from a path relative to the notebook's working
# directory, then preview its first ten rows.
dataset_path = 'data/final_dataset.csv'
data = pd.read_csv(dataset_path)
data.head(10)

Unnamed: 0,age,sex,is_closed,has_high_education,relation_status,number_of_relatives,len_of_about,number_of_activities,number_of_books,number_of_friends,...,90,sum_am,gamer,parent,driver,has_pets,cash_usage,gulyaka,zhavoronok,sum_act
0,34,1,0,1,1,2,14,1,1,911,...,2100.0,86523.380875,51,0,0,0,0,1,0,1
1,0,1,0,0,0,0,0,0,0,687,...,26360.0,33493.455566,2,0,1,0,0,1,0,2
2,37,1,0,0,0,0,0,0,0,24,...,1128.0,18458.058882,17,0,0,1,0,0,0,1
3,37,1,0,0,0,0,0,0,0,2535,...,1246.59,62419.014987,38,0,0,0,0,0,0,0
4,39,1,0,1,4,0,0,1,1,291,...,27617.0,53262.99101,4,1,0,0,0,1,0,2
5,34,1,0,0,0,0,0,0,0,413,...,2258.0,29671.422055,26,0,0,0,0,1,0,1
6,36,0,0,0,0,0,0,0,0,175,...,2717.0,55001.432499,26,0,1,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,982,...,1615.85,51665.131081,60,0,0,1,0,1,0,2
8,35,0,0,0,0,0,0,0,0,279,...,4090.0,19942.358601,6,0,1,0,0,0,0,1
9,41,0,0,0,0,0,0,0,0,134,...,2398.0,4406.399963,4,0,1,0,0,0,0,1


In [9]:
# Keep only the rows whose 'sex' value differs from 0.
# NOTE(review): the meaning of the 0 code is not visible here — confirm against the dataset description.
sex_is_nonzero = data['sex'] != 0
data = data.loc[sex_is_nonzero]

In [10]:
# Show the (rows, columns) shape of `data` after the 'sex' filter.
data.shape

(28190, 44)

In [11]:
# Restrict `data` to the columns used by the five modules below,
# listed in the order they are consumed.
selected_columns = [
    # profile attributes
    'age', 'sex', 'has_high_education', 'relation_status', 'number_of_relatives',
    # profile activity counters
    'len_of_about', 'number_of_activities', 'number_of_books',
    'number_of_interests', 'number_of_movies',
    # topic columns
    'Love & relation', 'Gifts & holidays', 'History & politics', 'Music',
    'Purchase & sale', 'Cooking', 'Fitness & cosmetology', 'Job & money',
    # graph characteristics
    'nodes', 'edges', 'size', 'betweenness_centrality', 'density',
    'eigenvector_centrality', 'degree_assortativity_coefficient',
    'number_of_followers',
    # transaction statistics and binary flags
    'max_tr', 'mean_tr', 'med_tr', 'parent', 'driver', 'has_pets', 'cash_usage',
]
data = data[selected_columns]

In [12]:
# Keep only the rows whose 'mean_tr' is strictly below 1000.
# NOTE(review): the threshold is a magic number with no stated rationale or unit — confirm.
mean_tr_below_limit = data['mean_tr'] < 1000
data = data.loc[mean_tr_below_limit]

In [13]:
# Show the (rows, columns) shape of `data` after the column selection and the 'mean_tr' filter.
data.shape

(18644, 33)

In [14]:
# Columns handed to the discretizer: the eight topic columns, seven graph
# characteristics and the three transaction statistics.
columns_to_bin = [
    'Love & relation', 'Gifts & holidays', 'History & politics', 'Music',
    'Purchase & sale', 'Cooking', 'Fitness & cosmetology', 'Job & money',
    'nodes', 'edges', 'size', 'betweenness_centrality', 'density',
    'eigenvector_centrality', 'degree_assortativity_coefficient',
    'max_tr', 'mean_tr', 'med_tr',
]
# Discretize those columns with the 'kmeans' strategy into 5 bins.
# NOTE(review): presumably the remaining columns are passed through unchanged — verify in data_process.preprocessing.
discrete_data = discretization(data, 'kmeans', bins=5, columns=columns_to_bin)

In [15]:
# Preview the first ten rows of the discretized frame.
discrete_data.head(10)

Unnamed: 0,age,sex,has_high_education,relation_status,number_of_relatives,len_of_about,number_of_activities,number_of_books,number_of_interests,number_of_movies,...,eigenvector_centrality,degree_assortativity_coefficient,number_of_followers,max_tr,mean_tr,med_tr,parent,driver,has_pets,cash_usage
0,34,1,1,1,2,14,1,1,1,1,...,1,4,720,1,4,2,0,0,0,0
1,37,1,0,0,0,0,0,0,0,0,...,4,1,0,0,1,0,0,0,1,0
2,37,1,0,0,0,0,0,0,0,0,...,1,4,3916,1,3,2,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,2,3,7853,0,0,0,0,0,0,0
4,43,1,1,0,0,40,1,3,1,1,...,2,4,464,0,3,1,0,0,0,0
5,34,1,1,4,1,0,1,1,4,1,...,3,3,333,1,2,1,0,1,0,0
6,35,1,1,0,1,0,1,1,1,1,...,1,4,1395,2,2,1,1,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,1,3,535,3,2,0,0,0,0,0
8,34,1,0,0,0,0,0,0,0,0,...,2,4,1490,0,2,0,0,0,1,0
9,32,1,0,0,0,0,0,0,0,0,...,3,3,1108,0,2,1,1,1,1,0


In [16]:
# Column groups that define the five modules; each module is a slice of the
# discretized frame and is later used for structure learning.
module_columns = [
    # module1
    ['age', 'sex', 'has_high_education', 'relation_status', 'number_of_relatives'],
    # module2
    ['len_of_about', 'number_of_activities', 'number_of_books',
     'number_of_interests', 'number_of_movies'],
    # module3
    ['Love & relation', 'Gifts & holidays', 'History & politics', 'Music',
     'Purchase & sale', 'Cooking', 'Fitness & cosmetology', 'Job & money'],
    # module4
    ['nodes', 'edges', 'size', 'betweenness_centrality', 'density',
     'eigenvector_centrality', 'degree_assortativity_coefficient',
     'number_of_followers'],
    # module5
    ['max_tr', 'mean_tr', 'med_tr', 'parent', 'driver', 'has_pets', 'cash_usage'],
]
module1, module2, module3, module4, module5 = [
    discrete_data[cols] for cols in module_columns
]

In [17]:
# Node types for module 1, derived from `data` rather than `discrete_data`.
module1_columns = [
    'age', 'sex', 'has_high_education', 'relation_status', 'number_of_relatives',
]
node_type1 = get_nodes_type(data[module1_columns])
node_type1

{'age': 'disc',
 'sex': 'disc',
 'has_high_education': 'disc',
 'relation_status': 'disc',
 'number_of_relatives': 'disc'}

In [18]:
# Node types for module 2, derived from `data` rather than `discrete_data`.
module2_columns = [
    'len_of_about', 'number_of_activities', 'number_of_books',
    'number_of_interests', 'number_of_movies',
]
node_type2 = get_nodes_type(data[module2_columns])
node_type2

{'len_of_about': 'disc',
 'number_of_activities': 'disc',
 'number_of_books': 'disc',
 'number_of_interests': 'disc',
 'number_of_movies': 'disc'}

In [19]:
# Node types for module 3 (topic columns), derived from `data` rather than `discrete_data`.
module3_columns = [
    'Love & relation', 'Gifts & holidays', 'History & politics', 'Music',
    'Purchase & sale', 'Cooking', 'Fitness & cosmetology', 'Job & money',
]
node_type3 = get_nodes_type(data[module3_columns])
node_type3

{'Love & relation': 'cont',
 'Gifts & holidays': 'cont',
 'History & politics': 'cont',
 'Music': 'cont',
 'Purchase & sale': 'cont',
 'Cooking': 'cont',
 'Fitness & cosmetology': 'cont',
 'Job & money': 'cont'}

In [20]:
# Node types for module 4 (graph characteristics), derived from `data` rather than `discrete_data`.
module4_columns = [
    'nodes', 'edges', 'size', 'betweenness_centrality', 'density',
    'eigenvector_centrality', 'degree_assortativity_coefficient',
    'number_of_followers',
]
node_type4 = get_nodes_type(data[module4_columns])
node_type4

{'nodes': 'disc',
 'edges': 'disc',
 'size': 'disc',
 'betweenness_centrality': 'cont',
 'density': 'cont',
 'eigenvector_centrality': 'cont',
 'degree_assortativity_coefficient': 'cont',
 'number_of_followers': 'disc'}

In [21]:
# Node types for module 5 (transaction statistics and binary flags),
# derived from `data` rather than `discrete_data`.
module5_columns = [
    'max_tr', 'mean_tr', 'med_tr', 'parent', 'driver', 'has_pets', 'cash_usage',
]
node_type5 = get_nodes_type(data[module5_columns])
node_type5

{'max_tr': 'cont',
 'mean_tr': 'cont',
 'med_tr': 'cont',
 'parent': 'disc',
 'driver': 'disc',
 'has_pets': 'disc',
 'cash_usage': 'disc'}

In [22]:
# Start the wall-clock timer; later cells print `time.time() - start`.
start = time.time()
# Learn the structure of the first module from its discretized columns.
# NOTE(review): 'MI' presumably selects a mutual-information score — confirm in block_learning.train_bn.
bn1 = structure_learning(module1, 'MI', node_type1)

In [23]:
# Display the learned structure for module 1.
bn1

{'V': ['age',
  'sex',
  'has_high_education',
  'relation_status',
  'number_of_relatives'],
 'E': [['number_of_relatives', 'sex'],
  ['age', 'has_high_education'],
  ['sex', 'has_high_education'],
  ['age', 'relation_status'],
  ['sex', 'relation_status']]}

In [24]:
# Fit the parameters of the module 1 network. The columns come from `data`
# rather than `discrete_data`, alongside the node types and the learned structure.
module1_columns = [
    'age', 'sex', 'has_high_education', 'relation_status', 'number_of_relatives',
]
param1 = parameter_learning(data[module1_columns], node_type1, bn1)

In [25]:
# Persist the module 1 structure and parameters, read them back, and build the
# hybrid network from the read-back objects.
# NOTE(review): the name strings are presumably file stems — confirm in block_learning.save_bn / read_bn.
structure_name, params_name = 'skel1', 'params1'

save_structure(bn1, structure_name)
skel1 = read_structure(structure_name)

save_params(param1, params_name)
params1 = read_params(params_name)

hybn1 = HyBayesianNetwork(skel1, params1)

In [26]:
# Report the wall-clock seconds elapsed since `start` was set.
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)

7.2928996086120605


In [27]:
# Learn the structure of the second module from its discretized columns.
# NOTE(review): 'K2' presumably selects the K2 score — confirm in block_learning.train_bn.
bn2 = structure_learning(module2, 'K2', node_type2)

  0%|          | 4/1000000 [00:00<26:11:13, 10.61it/s]


In [28]:
# Display the learned structure for module 2.
bn2

{'V': ['len_of_about',
  'number_of_activities',
  'number_of_books',
  'number_of_interests',
  'number_of_movies'],
 'E': [['number_of_activities', 'len_of_about'],
  ['number_of_books', 'number_of_interests'],
  ['number_of_interests', 'number_of_activities'],
  ['number_of_movies', 'number_of_books']]}

In [29]:
# Fit the parameters of the module 2 network. The columns come from `data`
# rather than `discrete_data`, alongside the node types and the learned structure.
module2_columns = [
    'len_of_about', 'number_of_activities', 'number_of_books',
    'number_of_interests', 'number_of_movies',
]
param2 = parameter_learning(data[module2_columns], node_type2, bn2)

In [30]:
# Persist the module 2 structure and parameters, read them back, and build the
# hybrid network from the read-back objects.
# NOTE(review): the name strings are presumably file stems — confirm in block_learning.save_bn / read_bn.
structure_name, params_name = 'skel2', 'params2'

save_structure(bn2, structure_name)
skel2 = read_structure(structure_name)

save_params(param2, params_name)
params2 = read_params(params_name)

hybn2 = HyBayesianNetwork(skel2, params2)


In [31]:
# Join the two partial networks, passing the columns of both modules taken from `data`.
# NOTE(review): 'LV1' is presumably the name of a connecting node — confirm in block_learning.partial_bn_train.
modules_1_2_columns = [
    'age', 'sex', 'has_high_education', 'relation_status', 'number_of_relatives',
    'len_of_about', 'number_of_activities', 'number_of_books',
    'number_of_interests', 'number_of_movies',
]
hybn_1_2 = connect_partial_bn(hybn1, hybn2, data[modules_1_2_columns], 'LV1')

In [32]:
# Report the wall-clock seconds elapsed since `start` was set. `start` is not
# reset between the two timing cells in this view, so the figure is cumulative.
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)

17.16846799850464


In [33]:
# Learn the structure of the third module (topic columns) from its discretized columns, then display it.
# NOTE(review): 'MI' presumably selects a mutual-information score — confirm in block_learning.train_bn.
bn3 = structure_learning(module3, 'MI', node_type3)
bn3

{'V': ['Love & relation',
  'Gifts & holidays',
  'History & politics',
  'Music',
  'Purchase & sale',
  'Cooking',
  'Fitness & cosmetology',
  'Job & money'],
 'E': [['Fitness & cosmetology', 'Love & relation'],
  ['Cooking', 'Gifts & holidays'],
  ['Purchase & sale', 'Gifts & holidays'],
  ['Fitness & cosmetology', 'History & politics'],
  ['Fitness & cosmetology', 'Music'],
  ['Music', 'Purchase & sale'],
  ['Cooking', 'Purchase & sale'],
  ['Love & relation', 'Cooking'],
  ['Cooking', 'Job & money'],
  ['Purchase & sale', 'Job & money']]}

In [34]:
# Fit the parameters of the module 3 network. The columns come from `data`
# rather than `discrete_data`, alongside the node types and the learned structure.
module3_columns = [
    'Love & relation', 'Gifts & holidays', 'History & politics', 'Music',
    'Purchase & sale', 'Cooking', 'Fitness & cosmetology', 'Job & money',
]
param3 = parameter_learning(data[module3_columns], node_type3, bn3)

In [35]:
# Persist the module 3 structure and parameters, read them back, and build the
# hybrid network from the read-back objects.
# NOTE(review): the name strings are presumably file stems — confirm in block_learning.save_bn / read_bn.
structure_name, params_name = 'skel3', 'params3'

save_structure(bn3, structure_name)
skel3 = read_structure(structure_name)

save_params(param3, params_name)
params3 = read_params(params_name)

hybn3 = HyBayesianNetwork(skel3, params3)