<a href="http://agrum.org" target="_blank"><img src="http://agrum.gitlab.io/theme/img/logoAgrum.png" align="left" style="height:100px"/></a><a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Dataset" property="dct:title" rel="dct:type">This pyAgrum notebook</span> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.

In [1]:
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

from pyAgrum.lib._utils.oslike import head

### generating data with missing values (at random)

In [2]:
# Reference network over A..F; the data used in this notebook is drawn from it.
bn_structure = "A->B<-C->D->E<-B;D->F"
src = gum.fastBN(bn_structure)

# Write 5000 rows generated from `src` to a CSV file without any missing value.
complete_csv = "EM_nomissing.csv"
gum.generateCSV(src, complete_csv, 5000, random_order=False)

# Last expression of the cell: display the reference network.
src

In [3]:
import pandas as pd
import numpy as np

def add_missing(src,dst,proba,seed=None):
  """
  Copy a CSV file, replacing each cell by '?' with probability `proba`.

  Every cell of the table read from `src` (header excluded) is masked
  independently of the other cells and of its own value. Masked cells are
  written as '?' in `dst`.

  Parameters
  ----------
  src : str
    path of the CSV file to read
  dst : str
    path of the CSV file to write
  proba : float
    probability, in [0,1], for each cell to be replaced by '?'
  seed : int, optional
    if given, the mask is drawn from a private generator seeded with this
    value, so the same call always produces the same file. If None (default),
    numpy's global random state is used.

  Raises
  ------
  ValueError
    if `proba` is not in [0,1]
  """
  if not 0 <= proba <= 1:
    raise ValueError(f"proba must be in [0,1], got {proba}")

  # a private RandomState keeps a seeded call from disturbing the global numpy state
  rng=np.random if seed is None else np.random.RandomState(seed)

  df=pd.read_csv(src)
  mask=rng.choice([True, False], size=df.shape,p=[proba,1-proba])
  # masked integer columns become float (NaN): '%.0f' writes the remaining values without decimals
  df.mask(mask).to_csv(dst,na_rep='?',index=False,float_format='%.0f')

# "EM_nomissing.csv" has already been generated from `src` in the previous cell:
# it is reused here instead of being drawn (and overwritten) a second time.
# Each cell of the copy is replaced by '?' with probability 0.1.
add_missing("EM_nomissing.csv","EM_missing.csv",proba=0.1)

In [4]:
# Show the first lines of the complete file, then of the file with '?' cells.
for caption, csv_path in (("No missng", "EM_nomissing.csv"),
                          ("Missing", "EM_missing.csv")):
  print(caption)
  head(csv_path)

No missng
A,B,C,D,E,F
1,0,1,1,0,0
1,1,0,1,0,1
1,1,1,0,0,0
1,1,0,1,0,0
1,1,0,0,0,0
1,0,1,1,0,1
1,1,1,1,0,0
0,0,1,1,0,0
1,0,1,1,0,1

Missing
A,B,C,D,E,F
1,0,1,1,0,0
1,1,0,1,0,1
1,?,1,?,0,?
1,1,?,1,0,0
1,1,0,0,0,0
1,0,?,1,0,?
1,1,?,1,0,?
0,0,1,1,0,0
1,?,1,1,0,1



### learning with missing data

In [5]:
# '?' is the marker written by add_missing (na_rep='?') for masked cells;
# it is passed to the learner as the list of missing-value symbols.
missing_symbols = ["?"]
learner = gum.BNLearner('EM_missing.csv', missing_symbols)

# Check that the learner sees missing values in the file.
print(f"Missing values in EM_missing.csv : {learner.hasMissingValues()}")

Missing values in EM_missing.csv : True


In [6]:
# The call below would fail: the database contains missing values and EM has not been enabled yet.
# learner.learnParameters(src.dag())

In [7]:
# Enable EM with epsilon=1e-3 and a smoothing prior before learning the parameters.
learner.useEM(1e-3)
learner.useAprioriSmoothing()

# The structure is known (the DAG of the reference network): only parameters are learned.
bn = learner.learnParameters(src.dag())
print(f"# iterations : {learner.nbrIterations()}")

# Compare the posteriors of the reference network (left) and of the learned one (right).
reference_view = gnb.getInference(src)
learned_view = gnb.getInference(bn)
gnb.sideBySide(reference_view, learned_view)

# iterations : 5


### learning with a smaller EM epsilon (1e-8 instead of 1e-3) and no smoothing

In [8]:
# Fresh learner on the same file: no smoothing prior is set this time.
learner = gum.BNLearner('EM_missing.csv', ["?"])
# NOTE(review): verbosity is presumably what makes history() available afterwards -- confirm in the pyAgrum docs.
learner.setVerbosity(True)
# Much smaller epsilon than the 1e-3 used before.
learner.useEM(1e-8)

bn2 = learner.learnParameters(src.dag())
print(f"# iterations : {learner.nbrIterations()}")

# Compare the posteriors of the reference network (left) and of the learned one (right).
reference_view = gnb.getInference(src)
learned_view = gnb.getInference(bn2)
gnb.sideBySide(reference_view, learned_view)

# iterations : 14


In [9]:
# history() gives one value per EM iteration (14 here); presumably the error
# measured at each iteration -- the last one is below the 1e-8 passed to useEM.
em_history = learner.history()
print(em_history)

(0.1176449666510023, 0.02634100884131496, 0.0067901780296888824, 0.0018124118788435646, 0.0004908856695220577, 0.0001338924833636206, 3.665638586804743e-05, 1.0056958859805939e-05, 2.7627688753283056e-06, 7.595939621678697e-07, 2.0895798731192673e-07, 5.750475698107303e-08, 1.582955050661122e-08, 4.35834510562542e-09)
