# Dirichlet prior as database

BNLearner gives access of many priors for the parameters and structural learning. One of them is the Dirichlet prior which needs a a prior for every possible parameter in a BN. aGrUM/pyAgrum allows to use a database as a source of Dirichlet prior.

In [1]:
%matplotlib inline
from pylab import *
import matplotlib.pyplot as plt

import os

import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

sizePrior=10000
sizeData=30000

## generating databases for Dirichlet prior and for the learning 

In [8]:
bnPrior = gum.fastBN("A->B;C;D")
gum.generateCSV(bnPrior, "dirichlet.csv", sizePrior, with_labels=True,random_order=True)

bnData = gum.fastBN("A->B->C->D")
gum.generateCSV(bnData, "database.csv", sizeData, with_labels=True,random_order=False)

gnb.sideBySide(bnData,bnPrior,
               captions=[f"Database ({sizeData} cases)",f"Prior ({sizePrior} cases)"])

0,1
G A A B B A->B C C B->C D D C->D,G A A B B A->B C C D D
Database (30000 cases),Prior (10000 cases)


## Learning databases

In [9]:
# bnPrior is used to give the variables and their domains
learnerData = gum.BNLearner("database.csv") 
learnerPrior = gum.BNLearner("dirichlet.csv") 
learnerData.useScoreBIC()
learnerPrior.useScoreBIC()
gnb.sideBySide(learnerData.learnBN(),learnerPrior.learnBN(),
              captions=["Learning from Data","Learning from Prior"])

0,1
G A A B B B->A C C C->B D D D->C,G C C D D B B A A A->B
Learning from Data,Learning from Prior


## Learning with Dirichlet prior

In [10]:
def learnWithRatio(ratio):
    # bnPrior is used to give the variables and their domains
    learner = gum.BNLearner("database.csv") #, bnPrior) 
    learner.useAprioriDirichlet("dirichlet.csv")
    learner.setAprioriWeight(ratio*sizePrior)
    learner.setDatabaseWeight((1-ratio)) #*sizeData)
    learner.useScoreBIC() # or another score with no included prior
    return learner.learnBN()

ratios=[0.0,1.0] #0.01,0.05,0.2,0.5,0.8,0.9,0.95,0.99,1.0]
gnb.sideBySide(*[learnWithRatio(r) for r in ratios],
              captions=[*[f"with ratio {r}<br/> [datasize : {r*sizePrior+(1-r)*sizeData}]" for r in ratios]])

0,1
G A A B B B->A C C C->B D D D->C,G A A B B C C D D C->D
with ratio 0.0  [datasize : 30000.0],with ratio 1.0  [datasize : 10000.0]


In [23]:
def learnWithRatio(ratio):
    # bnPrior is used to give the variables and their domains
    learner = gum.BNLearner("database.csv", bnPrior) 
    learner.useAprioriDirichlet("dirichlet.csv")
    learner.setAprioriWeight(ratio*sizePrior)
    learner.setDatabaseWeight((1-ratio)) #*sizeData)
    learner.useScoreBIC() # or another score with no included prior
    return learner.learnBN()

ratios=[0.0,0.01,0.05,0.2,0.5,0.8,0.9,0.95,0.99,1.0]
gnb.sideBySide(*[learnWithRatio(r) for r in ratios],
              captions=[*[f"with ratio {r}<br/> [datasize : {r*sizePrior+(1-r)*sizeData}]" for r in ratios]])

0,1,2,3,4,5,6,7,8,9
G A A B B B->A C C B->C D D C->D E E D->E F F E->F G G F->G,G A A B B B->A C C C->B D D D->C E E D->E F F E->F G G F->G,G A A B B B->A C C C->B D D D->B D->C E E D->E E->B F F E->F G G E->G G->F,G A A E E A->E B B B->A C C B->C D D B->D D->A D->C D->E E->C F F E->F G G F->G,G A A B B B->A C C B->C D D B->D C->D D->A E E E->A E->C E->D F F F->E G G G->B G->C G->F,G A A B B B->A C C B->C D D B->D E E B->E G G B->G C->G D->A D->C E->A E->C E->D F F E->F G->F,G A A B B B->A C C B->C D D B->D E E B->E G G B->G C->E D->A D->C D->E E->A F F E->F,G A A E E A->E B B B->A C C B->C D D B->D G G B->G D->A E->C F F,G A A D D A->D B B C C E E F F G G,G A A B B C C D D D->A E E F F G G
with ratio 0.0  [datasize : 30000.0],with ratio 0.01  [datasize : 29800.0],with ratio 0.05  [datasize : 29000.0],with ratio 0.2  [datasize : 26000.0],with ratio 0.5  [datasize : 20000.0],with ratio 0.8  [datasize : 14000.0],with ratio 0.9  [datasize : 12000.0],with ratio 0.95  [datasize : 11000.000000000002],with ratio 0.99  [datasize : 10200.0],with ratio 1.0  [datasize : 10000.0]


In [24]:
def learnWithRatio(ratio):
    # bnPrior is used to give the variables and their domains
    learner = gum.BNLearner("database.csv", bnData) 
    learner.useAprioriDirichlet("dirichlet.csv")
    learner.setAprioriWeight(ratio*sizePrior)
    learner.setDatabaseWeight((1-ratio)) #*sizeData)
    learner.useScoreBIC() # or another score with no included prior
    return learner.learnBN()

ratios=[0.0,0.01,0.05,0.2,0.5,0.8,0.9,0.95,0.99,1.0]
gnb.sideBySide(*[learnWithRatio(r) for r in ratios],
              captions=[*[f"with ratio {r}<br/> [datasize : {r*sizePrior+(1-r)*sizeData}]" for r in ratios]])

0,1,2,3,4,5,6,7,8,9
G A A B B B->A C C B->C D D C->D E E D->E F F E->F G G F->G,G A A B B B->A C C C->B D D D->C E E D->E F F E->F G G F->G,G A A B B B->A C C C->B D D D->B D->C E E D->E E->B F F E->F G G E->G G->F,G A A E E A->E B B B->A C C B->C D D B->D D->A D->C D->E E->C F F E->F G G F->G,G A A B B B->A C C B->C D D B->D C->D D->A E E E->A E->C E->D F F F->E G G G->B G->C G->F,G A A B B B->A C C B->C D D B->D E E B->E G G B->G C->G D->A D->C E->A E->C E->D F F E->F G->F,G A A B B B->A C C B->C D D B->D E E B->E G G B->G C->E D->A D->C D->E E->A F F E->F,G A A E E A->E B B B->A C C B->C D D B->D G G B->G D->A E->C F F,G A A D D A->D B B C C E E F F G G,G A A B B C C D D D->A E E F F G G
with ratio 0.0  [datasize : 30000.0],with ratio 0.01  [datasize : 29800.0],with ratio 0.05  [datasize : 29000.0],with ratio 0.2  [datasize : 26000.0],with ratio 0.5  [datasize : 20000.0],with ratio 0.8  [datasize : 14000.0],with ratio 0.9  [datasize : 12000.0],with ratio 0.95  [datasize : 11000.000000000002],with ratio 0.99  [datasize : 10200.0],with ratio 1.0  [datasize : 10000.0]
