<a href="https://colab.research.google.com/github/armandordorica/MIE1516_A1_Variable_Elimination/blob/master/MIE1516_A1_Q3_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [106]:
!pip install PrettyTable
!pip install truths



In [0]:
import prettytable
import numpy as np
import pandas as pd
pretty=prettytable.PrettyTable()

## Global functions (for now)

In [0]:
def toggle(var):
  if var == 0:
    return 1
  if var == 1:
    return 0

def boolean_truth_table(num_vars):
  truth_table = []
  num_vars = num_vars
  num_cols = num_vars
  var = 1
  num_rows = 2**num_vars
  for col_num in range(0,num_cols):
    truth_table.append([])
    col_num = col_num+1
    # print("col: {} toggle every {} values".format(col_num, 2**(num_vars-col_num)))
    for row_num in range(0, num_rows):
      if row_num%2**(num_vars-col_num)==0:
        var = toggle(var)
      truth_table[col_num-1].append(var)
      # print(truth_table[col_num-1])
      # print(var)
  return truth_table

def generate_truth_table(list_of_variables):
  d = dict()
  num_vars = len(list_of_variables)
  for i in range(0, num_vars): 
    d[str(list_of_variables[i])] = boolean_truth_table(len(list_of_variables))[i]
  df = pd.DataFrame(data=d)
  return df

def multiply_tabular_cpds(tabular_cpd1, tabular_cpd2, var_to_marginalize):
  df1 = tabular_cpd1.to_pandas_df()
  df1 = tabular_cpd1.to_pandas_df()

  vars_factor_1 = tabular_cpd1.get_factor().vars_in_factor()
  vars_factor_1.remove(var_to_marginalize)

  vars_factor_2 = tabular_cpd2.get_factor().vars_in_factor()
  vars_factor_2.remove(var_to_marginalize)

  total_vars = []
  total_vars.append(vars_factor_1[0])
  total_vars.append(vars_factor_2[0])
  output_table = generate_truth_table(total_vars) 

  vars_factor_1.append('Pr')
  vars_factor_2.append('Pr')

  result = pd.merge(df1, df2, on='B', how='inner')
  r0 = result[result['B']==0].Pr_x*result[result['B']==0].Pr_y
  r1 = (result[result['B']==1].Pr_x*result[result['B']==1].Pr_y).reset_index(drop=True)
  output_table['Pr']= r0.add(r1, fill_value=0)
  output_table
  return output_table

def multiply_tabular_cpds_v2(cpd_a, cpd_b):
  """
  input: two tabular cpds which can be in either TabularCPD or dataframe format
  output: dataframe which is the product of the two cpds multiplied by 
  their common term
  """
  if isinstance(cpd_a, TabularCPD):
    left = cpd_a.to_truth_table()
  else: 
    left = cpd_a
  if isinstance(cpd_b, TabularCPD):
    right = cpd_b.to_truth_table()
  else:
    right = cpd_b
  common_columns= np.intersect1d(left.columns, right.columns)
  common_columns= list(common_columns)
  common_columns.remove('Pr')
  result = pd.merge(left, right, on=common_columns, how='inner')
  result['Pr']=result['Pr_x'] *result['Pr_y']
  return result.drop(columns=['Pr_x', 'Pr_y'])

def df_marginalize(df, vars_to_marginalize):
  """
  input: a dataframe in truth table format with a "Pr" column - typically an output of `multiply_tabular_cpds_v2`
        , a list of variables to marginalize on 
  output: a dataframe with the removed column that you marginalized on
  """
  if isinstance(df, TabularCPD):
    df = df.to_truth_table()
  df_columns = list(df.columns)
  df_columns.remove('Pr')

  l1 = df_columns
  l2 = vars_to_marginalize
  l3 = [x for x in l1 if x not in l2]
  group_by_list = l3
  if len(group_by_list)>0:
    df = df.groupby(group_by_list).sum()
    return df.drop(columns=vars_to_marginalize).reset_index()   
  else: 
    return df 

In [0]:
class Node:
  def __init__(self, name, parents='', children=''):
    self.name = name
    self.parents = parents
    self.children = children

In [0]:
class MyBayesianModel: 
  def __init__(self, list_of_edges, nodes=[]):
    self.list_of_edges = list(list_of_edges)
    self.nodes = nodes # list(set([k for i in list_of_edges for k in i]))
    self.tabular_cpds=[]
    self.suggested_order=[]

  def get_suggested_order(self):
    if len(self.suggested_order)==0:
      variables = [''] *len(self.nodes)
      parents = np.zeros(len(self.nodes))
      children= np.zeros(len(self.nodes))

      for i in range(0, len(self.nodes)):
        variables[i]=self.nodes[i].name
        parents[i]=len(self.nodes[i].parents)
        children[i]=len(self.nodes[i].children)
        # print(self.nodes[i].name, self.nodes[i].parents, self.nodes[i].children)

      df = pd.DataFrame(variables, columns =['Variables']) 
      df['Parents'] = parents
      df['Children'] = children
      df['Total'] = df['Parents']+df['Children']
      suggested_order = list(df.sort_values(by=['Total'])['Variables'])
      self.suggested_order = suggested_order 
      return suggested_order
    else:
      return self.suggested_order

  def add_cpds(self, list_of_cpds):
    
      for i in range(0, len(list_of_cpds)):
        list_of_cpds[i].model = self
        # while the model doesn't have the full list of cpds, keep appending
        if (len(self.tabular_cpds) <= len(list_of_cpds)):
          self.tabular_cpds.append(list_of_cpds[i])
          edges_with_children=[item for item in list_of_cpds[i].model.list_of_edges if item[0] == list_of_cpds[i].variable]
    
        #adding nodes and parents to the model that it belongs to
          if len(edges_with_children)>0: 
            list_of_cpds[i].model.nodes.append(Node(list_of_cpds[i].variable, 
                              list_of_cpds[i].evidence, 
                              [item for item in list_of_cpds[i].model.list_of_edges if item[0] == list_of_cpds[i].variable][0][1]))
            
          elif len(edges_with_children)==0: 
            list_of_cpds[i].model.nodes.append(Node(list_of_cpds[i].variable, 
                              list_of_cpds[i].evidence ))

  def print_all_factors(self):
    for i in range(0, len(self.tabular_cpds)):
      self.tabular_cpds[i].print_factor()

  def get_variables(self):
      list_of_tuples = list(self.list_of_edges)
      list_of_items = [item for t in list_of_tuples for item in t] 
      list_set = set(list_of_items) 
      # convert the set to the list 
      unique_list_of_vars = (list(list_set))
      print(unique_list_of_vars)

  def available_cpds(self):
    for i in range(0, len(self.tabular_cpds)):
      self.tabular_cpds[i].print_factor()

  def eliminate_variable(self, variable_to_eliminate):
    print("Marginalize on {}".format(variable_to_eliminate))
    print("Multiply factors:")
    for i in range(0,len(self.tabular_cpds)):
      if self.tabular_cpds[i].get_factor().contains_var(variable_to_eliminate):
        self.tabular_cpds[i].print_factor()

    print("CPDs are:\n")
    for i in range(0,len(self.tabular_cpds)):
      if self.tabular_cpds[i].get_factor().contains_var(variable_to_eliminate):
        self.tabular_cpds[i].print_cpd()

In [0]:
class Factor:
  def __init__(self, indep_var, dep_vars=[]):
    self.indep_var = indep_var
    self.dep_vars = dep_vars
  
  def print_factor(self):
    if len(self.dep_vars)>0:
      self.dep_vars = set(self.dep_vars)
      self.dep_vars = list(self.dep_vars)
      self.dep_vars.sort()
      #print(self.dep_vars)
      dep_vars = str(self.dep_vars[0])
      for i in range (1, len(self.dep_vars)):
        dep_vars = dep_vars + "," + self.dep_vars[i]
      print("P({}|{})".format(self.indep_var, dep_vars))
    if len(self.dep_vars)==0:
      print("P({})".format(self.indep_var))

  def vars_in_factor(self):
    factors = list()
    factors.append(str(self.indep_var))
    for i in range (0, len(self.dep_vars)):
      factors.append(self.dep_vars[i])
    
    list_set = set(factors) 
    # convert the set to the list 
    unique_list_of_vars = list(list_set)
    unique_list_of_vars.sort(reverse=False)
    # print(unique_list_of_vars)
    return unique_list_of_vars

  def contains_var(self, variable):
    if variable in self.vars_in_factor():
      return True
    else:
      return False

  

In [0]:
class TabularCPD: 
  """
  input to initialize: 
        * variable - dependent variable
        * variable_card - how many possible values for dependent variable 
        * values - tabular probabilities 
        * evidence - independent variable 
        * evidence_card - list of possibilities for each of the dependent variables
  """
  def __init__(self, variable, variable_card, values, evidence='', evidence_card=''): 
    self.variable = variable 
    self.variable_card = variable_card
    self.values = values
    self.evidence = evidence
    self.evidence_card = evidence_card

  # Initializing factors of the CPD depending on the format (whether evidence is provided or not)
    if len(self.evidence)>0:
      self.factors = []
      self.factors.append(Factor(self.variable, self.evidence))
    
    if len(self.evidence)==0:
      self.factors = []
      self.factors.append(Factor(self.variable))
      
    self.all_variables = []
    self.all_variables.append(self.variable)
    for i in range (0, len(self.evidence)):
      self.all_variables.append(self.evidence[i])

    self.truth_table = generate_truth_table(self.all_variables)
    self.probabilities = []

    for i in range(0, len(self.values)):
      for j in range(0, len(self.values[i])):
        self.probabilities.append(self.values[i][j])

    probs = np.asarray(self.probabilities, dtype=np.float32)
    self.truth_table['Pr'] = probs

  def get_factor(self):
    """Returns all the factors associated with a TabularCPD"""
    return self.factors[0]

  def print_factor(self):
    self.factors[0].print_factor()
  
  def to_truth_table(self):
    self.truth_table = generate_truth_table(self.all_variables)

    self.probabilities = []

    for i in range(0, len(self.values)):
      for j in range(0, len(self.values[i])):
        self.probabilities.append(self.values[i][j])

    probs = np.asarray(self.probabilities, dtype=np.float32)
    self.truth_table['Pr'] = probs

    return self.truth_table

  def marginalize(self, var_to_marginalize): 
    #marginalize by "E"
    #var_to_marginalize = 'E'
    #keeping only the values to group by that are not the variable you want to marginalize on 
    group_by_list = list(filter(lambda a: a != var_to_marginalize, self.all_variables))
    df = self.truth_table.groupby(group_by_list).sum()
    df = df.drop(columns=[var_to_marginalize])
    return df 

  def to_pandas_df(self): 
    df = pd.DataFrame.from_records(self.values)
    if self.evidence!='':
      df = df.transpose()

    # for every column append the name of the tabular self variable 
    cols_in_df = len(df.columns)
    rows_in_df = len(df.index)


    for i in range(0, cols_in_df):
      df.rename(columns={ df.columns[i]: str(self.variable)+"_"+str(i) }, inplace = True)

    df.reset_index(inplace=True)

    if self.evidence!='':
      for i in range(0, rows_in_df):
        df.iloc[i,0] = str(self.evidence[0])+"_"+str(i)


    df.set_index('index')

    if self.evidence_card == [2,2]:
      df = pd.DataFrame.from_records(self.values)
      df = df.transpose()

      # for every column append the name of the tabular self variable 
      cols_in_df = len(df.columns)
      rows_in_df = len(df.index)

      for i in range(0, cols_in_df):
        df.rename(columns={ df.columns[i]: str(self.variable)+"_"+str(i) }, inplace = True)

      data = df
      df =generate_truth_table(self.evidence) 
      df['index']= str(self.evidence[0]) + "_" + df[str(self.evidence[0])].astype(str) + "_" + str(self.evidence[1]) + "_"+df[str(self.evidence[1])].astype(str)
      df = df[['index']]
      df_final = pd.concat([df, data], axis=1)
      df_final.set_index('index')
      return df_final.set_index('index')
    return df.set_index('index')

In [0]:
class VariableElimination:
  def __init__(self, model):
    self.model = model

### **Test 1**

In [0]:
model = []
model = MyBayesianModel([('E', 'A'), ('B', 'A'), ('A', 'J'), ('A', 'M')])
cpd_e = TabularCPD(variable='E', variable_card = 2, values=[[0.998,0.002]])
cpd_b = TabularCPD(variable='B', variable_card = 2, values=[[0.999,0.001]])

cpd_j = TabularCPD(variable='J', variable_card=2, 
                   values=[[0.95, 0.1], 
                           [0.05, 0.9]],
                   evidence = ['A'], 
                   evidence_card=[2] 
                   )

cpd_m = TabularCPD(variable='M', variable_card=2, 
                   values=[[0.99, 0.3], 
                           [0.01, 0.7]],
                   evidence = ['A'], 
                   evidence_card=[2] 
                   )

cpd_a =  TabularCPD(variable='A', variable_card=2, 
                   values=[[0.999, 0.06, 0.71, 0.05], 
                           [0.001, 0.94, 0.29, 0.95]],
                   evidence = ['E','B'], 
                   evidence_card=[2,2] 
                   )
model.add_cpds([cpd_a, cpd_b, cpd_e, cpd_j, cpd_m])

In [355]:
model.print_all_factors()

P(A|B,E)
P(B)
P(E)
P(J|A)
P(M|A)


In [327]:
f1 = df_marginalize(cpd_m, ['M'])
f1

Unnamed: 0,A,Pr
0,0,1.0
1,1,1.0


In [0]:
#f2(B,E)
f2 = df_marginalize(multiply_tabular_cpds_v2(cpd_a, cpd_j),['A'])
f2 = f2[f2['J']==1].drop(columns=['J'])

In [0]:
#f3(B) = P(E) over E * f2(B,E)
f3 = df_marginalize(multiply_tabular_cpds_v2(cpd_e,f2),['E'])

In [341]:
f3

Unnamed: 0,B,Pr
0,0,0.051341
1,1,0.849017


In [0]:
answer = df_marginalize(multiply_tabular_cpds_v2(cpd_b,f3),['B'])

In [346]:
answer

Unnamed: 0,B,Pr
0,0,0.05129
1,1,0.000849


In [0]:
def multiply_tabular_cpds_v2(cpd_a, cpd_b):
  """
  input: two tabular cpds 
  output: dataframe which is the product of the two cpds multiplied by 
  their common term
  """
  if isinstance(cpd_a, TabularCPD):
    left = cpd_a.to_truth_table()
  else: 
    left = cpd_a
  if isinstance(cpd_b, TabularCPD):
    right = cpd_b.to_truth_table()
  else:
    right = cpd_b
  common_columns= np.intersect1d(left.columns, right.columns)
  common_columns= list(common_columns)
  common_columns.remove('Pr')
  result = pd.merge(left, right, on=common_columns, how='inner')
  result['Pr']=result['Pr_x'] *result['Pr_y']
  return result.drop(columns=['Pr_x', 'Pr_y'])

In [298]:
f3 = df_marginalize(multiply_tabular_cpds_v2(f2, cpd_e),['E'])


ValueError: ignored

In [277]:
df_marginalize(cpd_e.to_truth_table(), ['E'])

Unnamed: 0,Pr
0,0.998
1,0.002


In [278]:
df_marginalize(cpd_b.to_truth_table(), ['B'])

Unnamed: 0,Pr
0,0.999
1,0.001


True

### **Test 2**

In [0]:
model = MyBayesianModel([('C', 'B'), ('B', 'A')])
cpd_c = TabularCPD(variable='C', variable_card = 2, values=[[0.8,0.2]])
cpd_a = TabularCPD(variable='A', variable_card=2, 
                   values=[[0.3, 0.9], 
                           [0.7, 0.1]],
                   evidence = ['B'], 
                   evidence_card=[2] 
                   )
cpd_b = TabularCPD(variable='B', variable_card=2, 
                   values=[[0.5, 0.4], 
                           [0.5, 0.6]], 
                   evidence = ['C'], 
                   evidence_card=[2])
model.add_cpds([cpd_a, cpd_b, cpd_c])

In [29]:
model.get_suggested_order()

['A', 'C', 'B']

In [30]:
model.print_all_factors()

P(A|B)
P(B|C)
P(C)


In [0]:
def print_current_factors(factors_list):
  for i in range(0, len(factors_list)):
    factors_list[i].print_factor()

In [0]:
def dedupe_list(factors_list):
  list_set = set(factors_list) 
  # convert the set to the list 
  factors_list = (list(list_set))

In [33]:
print_current_factors(factors_list)
factors_list = model.tabular_cpds
order = model.get_suggested_order()
order

['A', 'C', 'B']

In [34]:
print_current_factors(factors_list)

P(A|B)
P(B|C)
P(C)


In [35]:
desired_variable = 'A'
order = model.get_suggested_order()
order.remove(desired_variable)
order
i = 0
print(order,i)

['C', 'B'] 0


In [37]:
current_variable = order[i]
current_variable

'C'

In [38]:
#factors that contain the first variable 
cpds_to_multiply = []
for i in range(0, len(model.tabular_cpds)):
  if model.tabular_cpds[i].get_factor().contains_var(current_variable):
    model.tabular_cpds[i].get_factor().print_factor()
    cpds_to_multiply.append(model.tabular_cpds[i])

P(B|C)
P(C)


In [39]:
for i in range(0, len(cpds_to_multiply)):
  cpds_to_multiply[i].print_factor()

P(B|C)
P(C)


In [0]:
factors_list.append(df_marginalize(multiply_tabular_cpds_v2(cpds_to_multiply[0], cpds_to_multiply[1]), [order[0]]))
factors_list.remove(cpds_to_multiply[0])
factors_list.remove(cpds_to_multiply[1])

In [41]:
factors_list

[<__main__.TabularCPD at 0x7f349fa35ba8>,    B    Pr
 0  0  0.48
 1  1  0.52]

In [42]:
i=1
current_variable = order[i]
current_variable

'B'

In [50]:
cpds_to_multiply = []
for i in range(0, len(model.tabular_cpds)):
  if isinstance(model.tabular_cpds[i], TabularCPD):
    if model.tabular_cpds[i].get_factor().contains_var(current_variable):
      model.tabular_cpds[i].get_factor().print_factor()
      cpds_to_multiply.append(model.tabular_cpds[i])
##see if dataframe contains a variable and add it to the cpds_to_multiply list
  elif not isinstance(model.tabular_cpds[i], TabularCPD):
    df_vars = list(model.tabular_cpds[1].columns)
    if current_variable in df_vars: 
      cpds_to_multiply.append(model.tabular_cpds[i])

P(A|B)


In [0]:
factors_list.append(df_marginalize(multiply_tabular_cpds_v2(cpds_to_multiply[0], cpds_to_multiply[1]), [order[1]]))
factors_list.remove(cpds_to_multiply[0])
factors_list.remove(cpds_to_multiply[1])

In [58]:
factors_list[0]

Unnamed: 0,A,Pr
0,0,0.612
1,1,0.388


Next steps:
1. Redo Assignment 1 Q1 by hand. 
1.1 Look at an example of multiplying two CPDs
2. Piece the algorithm together to try to compute it
3. Test the algorithm again with the diagram from quiz 1 and compare against results from PGMPY 
4. Convert all pandas functions to numpy 
5. Write documentation 

Does variable elimination happen once for each model or everytime you try to execute a different query? 

In [123]:
model.get_suggested_order()

['C', 'B']

In [122]:
query(model, ['A'])

ValueError: ignored

In [0]:
def query(model, desired_variable, evidence=''):
  factors_list = []
  desired_variable =desired_variable[0]
  factors_list = model.tabular_cpds
  order = model.get_suggested_order()
  order.remove(desired_variable)

  for i in range(0, len(order)):
    eliminate_variable(factors_list, order[i])
  return factors_list[0].reset_index(drop=True)


def eliminate_variable(factors_list, variable_to_eliminate): 
  cpds_to_multiply = get_cpds_to_multiply(factors_list, variable_to_eliminate)
  product = multiply_tabular_cpds_v2(cpds_to_multiply[0], cpds_to_multiply[1])
  resulting_factor = df_marginalize(product, variable_to_eliminate)
  factors_list.append(resulting_factor)
  factors_list.remove(cpds_to_multiply[0])
  factors_list.remove(cpds_to_multiply[1])


def get_cpds_to_multiply(factors_list, variable_to_eliminate):
  """
  input: list of all pending factors and variable to eliminate 
  output: returns list of pending factors that contain the variable of interest 
  """
  cpds_to_multiply = []
  factors_list = factors_list
  current_variable = variable_to_eliminate
  for i in range(0, len(model.tabular_cpds)):
    if isinstance(model.tabular_cpds[i], TabularCPD):
      if model.tabular_cpds[i].get_factor().contains_var(current_variable):
        model.tabular_cpds[i].get_factor().print_factor()
        cpds_to_multiply.append(model.tabular_cpds[i])
  ##see if dataframe contains a variable and add it to the cpds_to_multiply list
    elif not isinstance(model.tabular_cpds[i], TabularCPD):
      df_vars = list(model.tabular_cpds[1].columns)
      if current_variable in df_vars: 
        cpds_to_multiply.append(model.tabular_cpds[i])
  return cpds_to_multiply   
