In [2]:
import pandas as pd

# Load the cleaned Adult dataset from the working directory.
cleaned_data = pd.read_csv("Adult_clean.csv")

# --- 80/20 train/test split ---
# Shuffle every row with a fixed seed so the split is reproducible,
# then renumber the index so positional slicing below is straightforward.
shuffled_data = cleaned_data.sample(frac=1.0, random_state=42).reset_index(drop=True)

# The first 80% of the shuffled rows become the training set, the rest the test set.
split_idx = int(0.8 * len(shuffled_data))
train, test = (
    shuffled_data.iloc[:split_idx].copy(),
    shuffled_data.iloc[split_idx:].copy(),
)

# Name of the class column; every other column is treated as a feature.
income = "income"
features = [col for col in cleaned_data.columns if col != "income"]
        

# --- Naive Bayes model: income is the single parent of every feature ---
# Class prior P(income), stored as {income_value: probability}.
train_len = len(train)
income_count = train[income].value_counts()
p_income = (income_count / train_len).to_dict()

# Sorted list of the values each variable takes in the training set.
feature_state = {f: sorted(train[f].unique()) for f in features}
income_state = sorted(train[income].unique())

# CPT[feature][income_value][feature_value] = P(feature_value | income_value)
# Plain maximum-likelihood estimates (relative frequencies, no smoothing).
CPT = {f: {} for f in features}
for i in income_state:
    # Rows belonging to this income class, selected once and reused for all features.
    income_group = train[train[income] == i]
    total_i = len(income_group)

    for f in features:
        # Frequency of each value of feature f inside the income group.
        f_given_i = income_group[f].value_counts()

        # An income class with no rows gets probability 0.0 for every value.
        CPT[f][i] = {
            v: (f_given_i.get(v, 0) / total_i if total_i > 0 else 0.0)
            for v in feature_state[f]
        }

In [6]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import product
from sklearn.metrics import confusion_matrix, accuracy_score

# ============================================================
# 1. LOAD DATA
# ============================================================
df = pd.read_csv("Adult_clean.csv")

nodes = list(df.columns)
target = "income"

# ============================================================
# 2. TRAIN/TEST SPLIT (80/20)
# ============================================================
# Seeded shuffle followed by a positional cut at the 80% mark.
shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
split = int(0.8 * len(shuffled))
train, test = shuffled.iloc[:split].copy(), shuffled.iloc[split:].copy()

# ============================================================
# 3. IMPROVED BAYESIAN NETWORK STRUCTURE
# ============================================================
# Maps each node to the list of its parent nodes (empty list = root).
parents = dict(
    age_bin=[],
    education=["age_bin"],
    workclass=["education"],
    occupation=["education"],
    hours_bin=["education"],
    income=["occupation", "hours_bin"],
)

# helper: sorted list of states for each variable, taken from the full dataset
states = {}
for col in df.columns:
    states[col] = sorted(df[col].unique())

# ============================================================
# 4. CPT LEARNING FUNCTION
# ============================================================
def learn_cpt(train, child, parents, laplace=1.0):
    """
    Compute CPT: P(child | parents) using Laplace smoothing.

    Parameters
    ----------
    train : DataFrame holding the training rows.
    child : column name of the node whose distribution is estimated.
    parents : list of parent column names (empty for a root node).
    laplace : pseudo-count added to every (parent combo, child state) cell.

    Returns: dict: parents_combo_tuple → {child_state: prob}

    The table has one entry for every combination of parent states listed in
    the module-level ``states`` mapping (a single ``()`` entry for a root
    node), including combinations that never occur in ``train``.

    If ``laplace`` is 0 and a parent combination has no training rows, the
    estimate would be 0/0; that combination gets a uniform distribution over
    the child states instead of raising ``ZeroDivisionError``.
    """
    child_states = states[child]
    n_child_states = len(child_states)

    # Key every training row by its tuple of parent values. A root node has no
    # parents, so every row shares the empty tuple as its key.
    if parents:
        parent_keys = zip(*(train[p].tolist() for p in parents))
    else:
        parent_keys = [()] * len(train)

    # Single pass over the data: count rows per parent combo and per
    # (parent combo, child value) pair, instead of re-filtering the whole
    # frame once for every combination.
    combo_totals = defaultdict(int)
    joint_counts = defaultdict(int)
    for combo, child_val in zip(parent_keys, train[child].tolist()):
        combo_totals[combo] += 1
        joint_counts[combo, child_val] += 1

    cpt = {}
    # product() over zero iterables yields exactly one empty tuple, which
    # covers the root-node case with the same code path.
    for combo in product(*(states[p] for p in parents)):
        denom = combo_totals.get(combo, 0) + laplace * n_child_states
        if denom == 0:
            # Only reachable with laplace == 0 and an unseen parent combo.
            cpt[combo] = {cs: 1.0 / n_child_states for cs in child_states}
        else:
            cpt[combo] = {
                cs: (joint_counts.get((combo, cs), 0) + laplace) / denom
                for cs in child_states
            }

    return cpt

# ============================================================
# 5. LEARN CPTs FOR ALL NODES
# ============================================================
# One smoothed CPT per node of the network, keyed by node name.
CPTs = {
    child: learn_cpt(train, child, parents[child], laplace=1.0)
    for child in parents
}

# ============================================================
# 6. INFERENCE: P(income | evidence) for test rows
# ============================================================
def predict_income(row):
    """
    Predict the most probable income state for one row of evidence.

    For every candidate income state the network's joint probability is
    evaluated with ``income`` set to that candidate and every other node set
    to the value found in ``row``; the candidate with the highest normalised
    score is returned.

    The value of ``row["income"]`` is never read, so the function works on
    unlabeled rows and the true label cannot enter the score.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) giving a value for every
        non-income node in the module-level ``parents`` structure.

    Returns
    -------
    The income state (an element of ``states["income"]``) with the highest
    score. Ties resolve to the state listed first in ``states["income"]``.
    """
    # Factor used when a parent combination or a node value is missing from
    # the learned CPT.
    unseen_prob = 1e-9

    scores = {}
    for inc in states["income"]:
        score = 1.0

        # Joint probability (up to a constant) under the hypothesis income = inc.
        for node, pa in parents.items():
            # Substitute the hypothesised state wherever income appears,
            # both as the node itself and as a parent of another node.
            node_value = inc if node == "income" else row[node]
            combo = tuple(inc if p == "income" else row[p] for p in pa)

            probs = CPTs[node].get(combo)
            if probs is None:
                # unseen combo → assign tiny probability
                score *= unseen_prob
            else:
                score *= probs.get(node_value, unseen_prob)

        scores[inc] = score

    # Normalise to a distribution over income states. This does not change
    # the argmax; skip it when every score is zero to avoid dividing by zero.
    total = sum(scores.values())
    if total > 0:
        scores = {state: value / total for state, value in scores.items()}

    return max(scores, key=scores.get)

# ============================================================
# 7. RUN PREDICTIONS
# ============================================================
# Predict an income state for every test row and score against the labels.
test_pred = test.apply(predict_income, axis=1)
acc = accuracy_score(test[target], test_pred)

for line in (
    "====================================",
    " Improved Bayesian Network Results",
    "====================================",
    f"Accuracy: {acc:.4f}",
):
    print(line)

# ============================================================
# 8. CONFUSION MATRIX
# ============================================================
# Rows are true labels, columns are predictions, both in states["income"] order.
income_labels = states["income"]
cm = confusion_matrix(test[target], test_pred, labels=income_labels)
print("\nConfusion Matrix (income states):")
print(pd.DataFrame(cm, index=income_labels, columns=income_labels))

# ============================================================
# 9. BIC + CPT SPARSITY
# ============================================================
def compute_loglik_and_params(train, CPTs, parents, free_params_only=False):
    """
    Compute the training log-likelihood, parameter count and CPT sparsity.

    Parameters
    ----------
    train : DataFrame with one column per node named in ``parents``.
    CPTs : dict node -> {parent_combo_tuple: {node_state: probability}}.
        Root nodes use the empty tuple ``()`` as their only combo key.
    parents : dict node -> list of parent node names.
    free_params_only : when False (default) every CPT cell counts as one
        parameter. When True each parent combination contributes
        ``number_of_states - 1`` parameters, because the last probability of
        a distribution is fixed by the others summing to one.

    Returns
    -------
    (loglik, params, sparsity)
        loglik : sum over rows and nodes of log P(node value | parent values),
            with probabilities below 1e-12 clamped to 1e-12.
        params : number of parameters, counted as described above.
        sparsity : dict node -> number of CPT cells with probability < 1e-12.

    Raises
    ------
    KeyError if a training row contains a parent combination or node value
    that is absent from the corresponding CPT.
    """
    # Probabilities below this are reported as sparse and clamped before log().
    prob_floor = 1e-12

    params = 0
    sparsity = {}
    for node in parents:
        sparse_cells = 0
        for dist in CPTs[node].values():
            sparse_cells += sum(1 for p in dist.values() if p < prob_floor)
            params += len(dist) - 1 if free_params_only else len(dist)
        sparsity[node] = sparse_cells

    # Every column the likelihood needs: the nodes plus any parent they name.
    needed = list(dict.fromkeys(
        list(parents) + [p for pa in parents.values() for p in pa]
    ))
    position = {name: idx for idx, name in enumerate(needed)}

    # compute LL
    # itertuples yields plain tuples, avoiding a Series object per row.
    loglik = 0.0
    for values in train[needed].itertuples(index=False, name=None):
        for node, pa in parents.items():
            combo = tuple(values[position[p]] for p in pa)
            p = CPTs[node][combo][values[position[node]]]
            loglik += np.log(max(p, prob_floor))

    return loglik, params, sparsity

# Score the learned network on the training data.
loglik, params, sparsity = compute_loglik_and_params(train, CPTs, parents)

# BIC = -2 * log-likelihood + (number of parameters) * ln(number of training rows)
n_train_rows = len(train)
bic = -2 * loglik + params * np.log(n_train_rows)

print("\nLog-Likelihood:", loglik)
print("Parameters:", params)
print("BIC:", bic)
print("Sparse CPT cells:")
for node_name in sparsity:
    print(f"  {node_name}: {sparsity[node_name]} sparse cells")


 Improved Bayesian Network Results
Accuracy: 0.7781

Confusion Matrix (income states):
       <=50K  >50K
<=50K   7006   420
>50K    1748   595

Log-Likelihood: -272051.10812160466
Parameters: 259
BIC: 546840.6716685322
Sparse CPT cells:
  age_bin: 0 sparse cells
  education: 0 sparse cells
  workclass: 0 sparse cells
  occupation: 0 sparse cells
  hours_bin: 0 sparse cells
  income: 0 sparse cells


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the data and cast every column to string so all variables are
# handled as categorical.
df = pd.read_csv("Adult_clean.csv").astype(str)

# Seeded 80/20 split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Network structure: each node mapped to the list of its parents.
parents = dict(
    age_bin=[],
    education=['age_bin'],
    workclass=['education'],
    occupation=['education'],
    hours_bin=['occupation'],
    income=['hours_bin', 'occupation'],
)

# Nodes in the order they are declared in the structure above.
nodes = list(parents)

# Filled below: CPT[node][parent_key][node_value] = probability
CPT = {}

def get_counts(subset, col):
    """Build {value: relative frequency} for column `col` of `subset`.

    Each distinct value of the column is mapped to its count divided by the
    total of all counts.
    """
    column = subset[col]
    frequency = column.value_counts()
    n_counted = frequency.sum()

    table = {}
    for value in column.unique():
        table[value] = frequency.get(value, 0) / n_counted
    return table

# Learn one table per node from relative frequencies in the training set.
for node in nodes:
    node_parents = parents[node]
    table = {}

    if not node_parents:
        # Root node: a single unconditional distribution stored under the key None.
        table[None] = train_df[node].value_counts(normalize=True).to_dict()
    else:
        # One conditional distribution per parent-value combination that
        # actually occurs in the training data.
        observed_combos = train_df[node_parents].drop_duplicates()

        for key in observed_combos.itertuples(index=False, name=None):
            # Select the rows that match every parent value of this combination.
            matches = pd.Series(True, index=train_df.index)
            for p, parent_value in zip(node_parents, key):
                matches &= train_df[p] == parent_value
            subset = train_df[matches]

            if len(subset) > 0:
                table[key] = subset[node].value_counts(normalize=True).to_dict()

    CPT[node] = table


def predict_row(row):
    """Compute P(income | evidence) via BN factorization and return the argmax.

    For each candidate income state, multiplies the CPT entry of every node
    given its parents, using the candidate wherever ``income`` appears and the
    values from ``row`` everywhere else. Parent combinations or node values
    missing from the CPT contribute a factor of 1e-9.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with a value for every
        non-income node in the module-level ``nodes`` list.

    Returns
    -------
    The income state with the highest product. Ties resolve to the state
    encountered first when scanning ``CPT['income']``.

    Raises
    ------
    ValueError if ``CPT['income']`` contains no income state at all.
    """
    # Candidate states are the union over ALL parent combinations. A single
    # combination may have seen only one class during training, so reading
    # the states from just one entry could drop a class entirely.
    # dict.fromkeys keeps first-seen order, so the result is deterministic.
    income_states = list(dict.fromkeys(
        state
        for dist in CPT['income'].values()
        for state in dist
    ))
    if not income_states:
        raise ValueError("CPT['income'] contains no income states")

    # Factor applied when a parent combination or value was never observed.
    missing_prob = 1e-9

    scores = {}
    for income_state in income_states:
        prob = 1.0

        # Evaluate BN probability product
        for node in nodes:
            node_parents = parents[node]

            # income gets manually set to the tested state
            node_value = income_state if node == 'income' else row[node]

            # Root nodes are stored under the key None, others under the
            # tuple of their parent values.
            if len(node_parents) == 0:
                key = None
            else:
                key = tuple(
                    income_state if p == 'income' else row[p]
                    for p in node_parents
                )

            dist = CPT[node].get(key)
            if dist is None or node_value not in dist:
                prob *= missing_prob  # smoothing for missing combinations
            else:
                prob *= dist[node_value]

        scores[income_state] = prob

    return max(scores, key=scores.get)


# Predict an income state for every held-out row and keep it beside the label.
test_df['pred'] = test_df.apply(predict_row, axis=1)

# Compare predictions against the true income labels.
y_true, y_pred = test_df['income'], test_df['pred']
acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

print("Accuracy:", acc)
print("\nConfusion matrix:")
print(cm)


Accuracy: 0.7794042378953834

Confusion matrix:
[[6968  446]
 [1709  646]]
