<a href="http://agrum.org" target="_blank"><img src="http://agrum.gitlab.io/theme/img/logoAgrum.png" align="left" style="height:100px"/></a><a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This pyAgrum notebook is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.

In [1]:
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

from pyAgrum.lib._utils.oslike import head

import os
# the databases will be saved in "out/*.csv"; the folder is created up front so
# that writing them does not depend on it already existing (fresh checkout)
os.makedirs("out",exist_ok=True)
EMnomissing=os.path.join("out","EM_nomissing.csv")
EMmissing=os.path.join("out","EM_missing.csv")

### generating data with missing values (at random)

In [2]:
# reference network: the structure is given as a string of arcs
src=gum.fastBN("A->B<-C->D->E<-B;D->F")
# sample a complete database (no missing value) from it into EMnomissing
# (5000 is presumably the number of rows — TODO confirm against generateCSV's doc)
gum.generateCSV(src,EMnomissing,5000,random_order=False)
# last expression of the cell: display the network
src

In [3]:
import pandas as pd
import numpy as np

def add_missing(src,dst,proba,seed=None):
  """Copy the CSV file `src` to `dst`, replacing cells by '?' at random.

  Every cell of the table is masked independently with probability `proba`;
  masked cells are written as '?' in the destination file.

  Parameters
  ----------
  src : str
      path of the complete CSV file to read.
  dst : str
      path of the CSV file to write.
  proba : float
      probability, in [0, 1], that any given cell becomes missing.
  seed : int, optional
      seed of the random generator used to draw the mask. Pass a value to get
      the same missing cells on every run; the default (None) draws a fresh
      mask each time.

  Raises
  ------
  ValueError
      if `proba` is not in [0, 1].
  """
  if not 0 <= proba <= 1:
    raise ValueError(f"proba must be in [0, 1], got {proba}")

  df=pd.read_csv(src)
  rng=np.random.default_rng(seed)
  # True marks a cell to blank out; rng.random() is in [0, 1), so proba=0 masks
  # nothing and proba=1 masks everything
  mask=rng.random(df.shape)<proba
  # a masked integer column is promoted to float to hold NaN: '%.0f' writes the
  # remaining values back without a decimal part
  df.mask(mask).to_csv(dst,na_rep='?',index=False,float_format='%.0f')

# EMnomissing has already been sampled from `src` in the previous cell: sampling it
# again here would only overwrite that file with a different random database.
add_missing(EMnomissing,EMmissing,proba=0.1)

In [4]:
# show the first lines of the complete database, then of the one with missing values
for title, csv_path in (("No missing", EMnomissing), ("Missing", EMmissing)):
  print(title)
  head(csv_path)

No missing
A,B,C,D,E,F
0,1,1,1,1,1
1,0,0,0,0,1
1,1,0,0,1,0
0,0,0,0,0,1
0,1,1,0,0,0
1,0,0,0,0,0
0,0,0,1,1,0
0,0,1,0,1,0
0,1,1,0,0,0

Missing
A,B,C,D,E,F
0,1,1,1,1,?
1,0,0,0,0,1
1,1,0,0,1,?
0,0,0,?,0,1
0,1,1,0,0,?
1,0,0,0,0,0
0,0,0,1,1,0
0,0,1,0,1,0
0,?,1,0,0,0



### learning with missing data

In [5]:
# open the database with missing values; "?" (the na_rep written by add_missing)
# is passed as the list of symbols that mark a missing cell
learner = gum.BNLearner(EMmissing, ["?"])
# check that the learner did detect missing values in the file
print(f"Missing values in {EMmissing} : {learner.hasMissingValues()}")

Missing values in out\EM_missing.csv : True


In [6]:
# Parameter learning as-is cannot be run on this database because it contains
# missing values. Uncomment the next line to see it fail:
# learner.learnParameters(src.dag())

In [7]:
# switch parameter learning to EM; 1e-3 is presumably the error threshold at
# which the iterations stop (TODO confirm against BNLearner.useEM's doc)
learner.useEM(1e-3)
# add a smoothing prior
learner.useAprioriSmoothing()
# the structure is not learned: the DAG of the generating network is reused and
# only the parameters are estimated from the incomplete database
bn=learner.learnParameters(src.dag())
print(f"# iterations : {learner.nbrIterations()}")
# inference in the generating network (left) next to the learned one (right)
gnb.sideBySide(gnb.getInference(src),gnb.getInference(bn))

# iterations : 5


0,1
"structs Inference in 1.00ms A  2021-05-01T22:35:10.468430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B  2021-05-01T22:35:10.556437  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  A->B E  2021-05-01T22:35:10.731429  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B->E C  2021-05-01T22:35:10.630503  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->B D  2021-05-01T22:35:10.680430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->D D->E F  2021-05-01T22:35:10.803430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  D->F","structs Inference in 1.00ms A  2021-05-01T22:35:11.077434  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B  2021-05-01T22:35:11.164437  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  A->B E  2021-05-01T22:35:11.283430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B->E C  2021-05-01T22:35:11.204431  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->B D  2021-05-01T22:35:11.244430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->D D->E F  2021-05-01T22:35:11.323439  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  D->F"


### learning with smaller error (and no smoothing)

In [8]:
# start again from a fresh learner so that no smoothing prior is set this time
learner = gum.BNLearner(EMmissing, ["?"])
# NOTE(review): verbosity is presumably what makes history() available in the
# next cell — confirm against pyAgrum's documentation
learner.setVerbosity(True)
# same EM learning as before with a much smaller value (1e-8 instead of 1e-3)
learner.useEM(1e-8)
# parameters only: the DAG of the generating network is reused
bn2=learner.learnParameters(src.dag())
print(f"# iterations : {learner.nbrIterations()}")
# inference in the generating network (left) next to the learned one (right)
gnb.sideBySide(gnb.getInference(src),gnb.getInference(bn2))

# iterations : 14


0,1
"structs Inference in 1.00ms A  2021-05-01T22:35:14.818427  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B  2021-05-01T22:35:14.875426  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  A->B E  2021-05-01T22:35:15.034432  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B->E C  2021-05-01T22:35:14.927434  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->B D  2021-05-01T22:35:14.982430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->D D->E F  2021-05-01T22:35:15.085430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  D->F","structs Inference in 1.00ms A  2021-05-01T22:35:15.413429  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B  2021-05-01T22:35:15.452430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  A->B E  2021-05-01T22:35:15.570429  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  B->E C  2021-05-01T22:35:15.491427  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->B D  2021-05-01T22:35:15.531427  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  C->D D->E F  2021-05-01T22:35:15.610430  image/svg+xml  Matplotlib v3.4.1, https://matplotlib.org/  D->F"


In [9]:
# values recorded by the learner during the EM run of the previous cell
# (presumably the error after each iteration — TODO confirm)
print(learner.history())

(0.12275211287435556, 0.03018940972953259, 0.008779541798954339, 0.0024930678105413258, 0.0007097430858201197, 0.00020312961135064928, 5.83462518669295e-05, 1.6789259746185282e-05, 4.83394284320989e-06, 1.3916303039374428e-06, 4.004515203659874e-07, 1.1516477929380269e-07, 3.309952326373988e-08, 9.507569981039353e-09)
