In [66]:
import pandas as pd
import numpy as np

class DecisionTreeID3:
    """ID3 decision tree classifier for categorical (discrete) features.

    After ``fit`` the model is stored in ``self.tree`` as nested dictionaries
    of the form ``{feature_name: {feature_value: subtree_or_label}}``. Any
    value that is not a ``dict`` is a leaf holding a class label; the whole
    tree is a bare label when every training label is identical.
    """

    def __init__(self):
        self.tree = None
        # Label returned when a row carries a feature value that has no branch
        # in the tree. It starts as 1 and is replaced by the most frequent
        # training label as soon as ``fit`` runs.
        self.default_label = 1

    def fit(self, X, y):
        """Learn the tree from training data.

        Args:
            X: ``pandas.DataFrame`` with one column per categorical feature.
            y: Class labels, one per row of ``X``. Anything ``pandas.Series``
                accepts (list, array, Series) works; rows are matched to
                labels by position, not by index label.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        # Give both inputs the same fresh positional index so the boolean
        # masks built from X can be applied to y without alignment surprises.
        features = X.reset_index(drop=True)
        labels = pd.Series(y).reset_index(drop=True)

        if len(features) != len(labels):
            raise ValueError(
                f"X and y must have the same length, got {len(features)} and {len(labels)}"
            )
        if labels.empty:
            raise ValueError("Cannot fit a decision tree on an empty training set")

        self.default_label = labels.value_counts().idxmax()
        self.tree = self._build_tree(features, labels)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: ``pandas.DataFrame`` containing the feature columns used in ``fit``.

        Returns:
            list: One predicted label per row, in row order.

        Raises:
            RuntimeError: If no tree is available (``fit`` was never called).
        """
        if self.tree is None:
            raise RuntimeError("DecisionTreeID3 must be fitted before calling predict()")
        return [self._predict_single(self.tree, row) for _, row in X.iterrows()]

    def _build_tree(self, X, y):
        """Recursively build the subtree for the rows in ``X`` / ``y``.

        ``X`` and ``y`` must share the same index. Returns either a class
        label (leaf) or a ``{feature: {value: subtree}}`` dictionary.
        """
        # Pure node: every remaining sample has the same label.
        if len(set(y)) == 1:
            return y.iloc[0]

        # Nothing left to split on: fall back to the majority label.
        if X.empty:
            return y.value_counts().idxmax()

        # Select the best feature to split
        best_feature = self._select_best_feature(X, y)
        branches = {}

        # Recursively build subtrees, one per observed value of the feature.
        column = X[best_feature]
        for value in column.unique():
            mask = column == value
            # The chosen feature is dropped so it is not reused further down.
            branches[value] = self._build_tree(
                X[mask].drop(columns=best_feature), y[mask]
            )

        return {best_feature: branches}

    def _select_best_feature(self, X, y):
        """Return the column of ``X`` with the highest information gain.

        Ties are resolved in favour of the earliest column in ``X``.
        """
        return max(
            X.columns,
            key=lambda feature: self._information_gain(X[feature], y),
        )

    def _information_gain(self, feature, y):
        """Entropy of ``y`` minus its weighted entropy after splitting on ``feature``."""
        total_entropy = self._entropy(y)
        n_rows = len(feature)
        # One mask per distinct value gives both the partition weight and the
        # label subset, without recomputing value_counts() on every iteration.
        masks = (feature == value for value in feature.unique())
        weighted_entropy = sum(
            (mask.sum() / n_rows) * self._entropy(y[mask]) for mask in masks
        )
        return total_entropy - weighted_entropy

    def _entropy(self, y):
        """Shannon entropy (base 2) of the label distribution in ``y``."""
        probabilities = y.value_counts(normalize=True)
        # Categorical dtypes can report unobserved categories with probability
        # 0, and 0 * log2(0) would turn the whole sum into NaN.
        probabilities = probabilities[probabilities > 0]
        return -sum(probabilities * np.log2(probabilities))

    def _predict_single(self, tree, x):
        """Walk ``tree`` for one row ``x`` and return the label reached.

        A feature value that has no branch at the current node resolves to
        ``self.default_label``.
        """
        node = tree
        while isinstance(node, dict):
            feature = next(iter(node))
            node = node[feature].get(x[feature], self.default_label)
        return node



In [67]:
def read_data(file_name):
    """Load a ``', '``-separated census file into features and binary labels.

    Every non-blank line is expected to hold, in order: the label followed by
    workclass, education, marital-status, occupation, relationship, race, sex
    and native-country. Fields beyond the ninth are ignored; blank lines are
    skipped.

    Args:
        file_name: Path of the text file to read.

    Returns:
        tuple: ``(df, labels)`` where ``df`` is a ``pandas.DataFrame`` with the
        eight feature columns (string values) and ``labels`` is a list of
        ints: ``0`` when the label field is exactly ``'>50K'``, ``1`` for any
        other label.

    Raises:
        IndexError: If a non-blank line has fewer than nine fields.
    """
    feature_names = (
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
    )
    n_fields = 1 + len(feature_names)  # label + features

    labels = []
    rows = []
    with open(file_name, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank (typically trailing) lines carry no record.
                continue

            values = stripped.split(', ')
            if len(values) < n_fields:
                # IndexError is kept because that is what short rows have
                # always raised here; the message now says where.
                raise IndexError(
                    f"{file_name}, line {line_number}: expected at least "
                    f"{n_fields} fields, got {len(values)}"
                )

            # Process the label and convert to 0 or 1
            labels.append(0 if values[0] == '>50K' else 1)
            rows.append(values[1:n_fields])

    df = pd.DataFrame(rows, columns=list(feature_names))
    return df, labels

In [68]:
import numpy as np
import pandas as pd

# Read the train file first, then the test file; each call yields (features, labels).
(X_train, Y_train), (X_test, Y_test) = (
    read_data(path)
    for path in ('adult.train.10k.discrete', 'adult.test.10k.discrete')
)




In [69]:
# Train the ID3 tree; the label list is wrapped in a Series before fitting.
dt = DecisionTreeID3()
dt.fit(X=X_train, y=pd.Series(Y_train))


In [71]:
# Predict on the test split first, then on the training split.
y_pred_test, y_pred_train = map(dt.predict, (X_test, X_train))

In [72]:
from sklearn.metrics import accuracy_score

# Report accuracy on both splits, one line each (train first).
print(
    f'Accuracy on train: {accuracy_score(Y_train, y_pred_train)}',
    f'Accuracy on test: {accuracy_score(Y_test, y_pred_test)}',
    sep='\n',
)

Accuracy on train: 0.8754
Accuracy on test: 0.8067
