In [2]:
import pandas as pd
import numpy as np
'''
True causal graph:
==================
family_income->hours_studied->course_grade<-family_income (i.e. hours studied is affected by family income; course grade is affected by both family income and hours studied)

family_income → hours_studied:
- family_income="high"->hours_studied="medium"
- family_income="medium"->hours_studied="high"
- family_income="low"->hours_studied="low"

family_income & hours_studied → course_grade:
- hours_studied='medium' and family_income== 'high' | hours_studied="high" -> course_grade="high"
- hours_studied='low' -> course_grade="low"
- otherwise -> course_grade="medium"
'''
print('Generating toy dataset for student grades...')
seed = 42

# Seed NumPy's global RNG so the generated dataset is reproducible
np.random.seed(seed)

# Number of synthetic student records in the full dataset
num_students = 1000
# Number of rows drawn for the smaller sample file
sample_size = 200

# Step 1: draw family_income at random from the three categories
family_income_categories = ['low', 'medium', 'high']
family_income = np.random.choice(family_income_categories, size=num_students)

# Step 2: hours_studied is a deterministic function of family_income.
# Any income other than 'high' or 'medium' falls back to 'low' hours.
hours_by_income = {'high': 'medium', 'medium': 'high'}
hours_studied = [hours_by_income.get(income, 'low') for income in family_income]


def _course_grade_for(income, hours):
    """Return the course grade label for one student.

    Args:
        income: family income label ('low', 'medium' or 'high').
        hours: hours studied label ('low', 'medium' or 'high').

    Returns:
        'high' when hours is 'high', or when hours is 'medium' and income is
        'high'; 'low' when hours is 'low'; 'medium' otherwise.
    """
    if hours == 'high' or (hours == 'medium' and income == 'high'):
        return 'high'
    if hours == 'low':
        return 'low'
    return 'medium'


# Step 3: course_grade depends on both hours_studied and family_income.
# Note: with the Step 2 mapping, 'medium' hours only ever occur together with
# 'high' income, so the 'medium' grade is never actually produced here.
course_grade = [
    _course_grade_for(income, hours)
    for income, hours in zip(family_income, hours_studied)
]

# Assemble the three variables into one table, one row per student
df = pd.DataFrame({
    'family_income': family_income,
    'hours_studied': hours_studied,
    'course_grade': course_grade
})

# Persist the full dataset
df.to_csv('student_grades_toy.csv', index=False)

# Draw a reproducible subsample and persist it as well. The sample is bound
# first and written second: to_csv returns None when given a path, so chaining
# the two calls would leave df_sample set to None.
df_sample = df.sample(sample_size, random_state=seed)
df_sample.to_csv(f'student_grades_toy_sample_size_{sample_size}.csv', index=False)

Generating toy dataset for student grades...


In [5]:
import pyAgrum as gum
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

# Bayesian network encoding the toy data-generating process
bn = gum.BayesNet('StudentPerformance')

# Every variable shares the same three ordered labels
levels = ['low', 'medium', 'high']
fi = bn.add(gum.LabelizedVariable('family_income', 'Family Income', levels))
hs = bn.add(gum.LabelizedVariable('hours_studied', 'Hours Studied', levels))
cg = bn.add(gum.LabelizedVariable('course_grade', 'Course Grade', levels))

# Structure: family_income -> hours_studied -> course_grade <- family_income
for parent, child in ((fi, hs), (fi, cg), (hs, cg)):
    bn.addArc(parent, child)

# Prior over family_income: the three levels are equally likely
bn.cpt(fi).fillWith([1/3] * 3)

# One-hot distributions over (low, medium, high), used by the deterministic CPTs
certainly_low = [1, 0, 0]
certainly_medium = [0, 1, 0]
certainly_high = [0, 0, 1]

# P(hours_studied | family_income): each income level fixes the hours level
hours_given_income = {
    'high': certainly_medium,   # high income leads to medium hours
    'medium': certainly_high,   # medium income leads to high hours
    'low': certainly_low,       # low income leads to low hours
}
for income_label, distribution in hours_given_income.items():
    bn.cpt(hs)[{'family_income': income_label}] = distribution


'''
family_income → hours_studied:
- family_income="high"->hours_studied="medium"
- family_income="medium"->hours_studied="high"
- family_income="low"->hours_studied="low"

family_income & hours_studied → course_grade:
- hours_studied='medium' and family_income== 'high' | hours_studied="high" -> course_grade="high"
- hours_studied='low' -> course_grade="low"
- otherwise -> course_grade="medium"
'''
# P(course_grade | hours_studied, family_income), keyed by (hours, income)
grade_given_parents = {
    # high hours, or medium hours with high income, give a high grade
    ('high', 'high'): certainly_high,
    ('high', 'medium'): certainly_high,
    ('high', 'low'): certainly_high,
    ('medium', 'high'): certainly_high,
    # medium hours with medium or low income give a medium grade
    ('medium', 'medium'): certainly_medium,
    ('medium', 'low'): certainly_medium,
    # low hours give a low grade whatever the income
    ('low', 'low'): certainly_low,
    ('low', 'medium'): certainly_low,
    ('low', 'high'): certainly_low,
}
for (hours_label, income_label), distribution in grade_given_parents.items():
    bn.cpt(cg)[{'hours_studied': hours_label, 'family_income': income_label}] = distribution

# Show the graph next to its three CPTs, then write the network to disk
gnb.flow.row(bn,
             bn.cpt("family_income"),
             bn.cpt("hours_studied"),
             bn.cpt("course_grade"))
gum.saveBN(bn, "./student_toy_graph.bif")

family_income,family_income,family_income
low,medium,high
0.3333,0.3333,0.3333

Unnamed: 0_level_0,hours_studied,hours_studied,hours_studied
family_income,low,medium,high
low,1.0,0.0,0.0
medium,0.0,0.0,1.0
high,0.0,1.0,0.0

Unnamed: 0_level_0,Unnamed: 1_level_0,course_grade,course_grade,course_grade
hours_studied,family_income,low,medium,high
low,low,1.0,0.0,0.0
low,medium,1.0,0.0,0.0
low,high,1.0,0.0,0.0
medium,low,0.0,1.0,0.0
medium,medium,0.0,1.0,0.0
medium,high,0.0,0.0,1.0
high,low,0.0,0.0,1.0
high,medium,0.0,0.0,1.0
high,high,0.0,0.0,1.0
