In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import sklearn

In [2]:
from math import log2,sqrt 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing 
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV

In [4]:
from six import StringIO
from IPython.display import Image
import pydotplus
from tqdm.notebook import tqdm_notebook as tqdm

In [5]:
# Column labels for the CSV, listed in file order (the file is read without a
# header row, so the labels are supplied explicitly).
column_names = [
    "age",
    "workclass",
    "fnlwg",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "country",
    "target",
]
data = pd.read_csv("adult_data.csv", names=column_names, header=None)
data.head()

FileNotFoundError: ignored

In [None]:
# Show descriptive statistics of `data` as the cell output.
data.describe()

In [None]:
# Print pandas' concise summary of `data`.
data.info()



In [None]:
# Report the shape of the loaded frame; the space before the value keeps the
# message from running into the tuple ("is(…").
print(f'dataset shape is {data.shape}')

In [None]:
def entropy(class_y):
    """Return the normalised Shannon entropy of a sequence of class labels.

    Labels are counted with ``np.bincount``, so ``class_y`` must contain
    non-negative integers. The entropy is divided by the logarithm of the
    number of distinct labels present, which scales the result to [0, 1].

    Args:
        class_y: Sequence of non-negative integer class labels.

    Returns:
        ``0`` when there is at most one sample or only one distinct label;
        otherwise the entropy divided by ``log(k)``, ``k`` being the number of
        distinct labels present.
    """
    n_samples = len(class_y)
    if n_samples <= 1:
        return 0

    label_counts = np.bincount(class_y)
    # Keep only labels that actually occur so log(0) is never evaluated.
    probs = label_counts[label_counts > 0] / n_samples
    n_present = len(probs)
    if n_present <= 1:
        return 0

    return -np.sum(probs * np.log(probs)) / np.log(n_present)


In [None]:
# Scratch check of the counting step used by entropy(): bincount gives the
# number of occurrences of each integer label.
X=[0,0,0,1,1,1]
z=np.bincount(X)
print(z)
# Counts of the labels that occur, divided by the sample count.
z[np.nonzero(z)]/len(X)

In [None]:
def information_gain(previous_y, current_y):
    """Return the entropy reduction achieved by splitting ``previous_y``.

    Args:
        previous_y: Labels before the split.
        current_y: Iterable of label sequences, one per partition produced by
            the split.

    Returns:
        ``entropy(previous_y)`` minus the sum of each partition's entropy
        weighted by ``len(partition) / len(previous_y)``.
    """
    parent_size = len(previous_y)
    weighted_child_entropy = sum(
        entropy(part) * len(part) / parent_size for part in current_y
    )
    return entropy(previous_y) - weighted_child_entropy


In [None]:
# Sanity check: print the entropy of a small, unevenly split label list.
test_class_y = [0,0,0,1,1,1,1,1]
print(entropy(test_class_y))

# Print the information gain of dividing previous_y into the two partitions
# held in the tuple current_y.
previous_y=[0,0,0,1,1,1]
current_y=[0,0],[1,1,1,0]
print(information_gain(previous_y,current_y))


In [None]:
def partition_classes(X, y, split_attribute, split_val):
    """Split samples and their labels in two on a single attribute.

    When ``split_val`` is a ``str`` the attribute is treated as categorical and
    rows whose value equals ``split_val`` go left. Otherwise rows whose value
    is ``<= split_val`` go left. All remaining rows go right.

    Args:
        X: 2-D sequence/array of samples (rows) by attributes (columns).
        y: Labels, indexable by row position.
        split_attribute: Column index of the attribute to split on.
        split_val: Threshold (non-``str``) or category (``str``) to split at.

    Returns:
        ``(X_left, X_right, y_left, y_right)`` as four lists; the ``X`` lists
        hold row arrays and the ``y`` lists hold the matching labels. All four
        are empty when ``X`` has no rows.
    """
    X = np.array(X)
    # An empty list becomes a 1-D array that cannot be column-indexed; with no
    # rows there is nothing to partition.
    if len(X) == 0:
        return [], [], [], []

    categorical = isinstance(split_val, str)
    X_left, X_right, y_left, y_right = [], [], [], []

    for index, value in enumerate(X[:, split_attribute]):
        goes_left = (value == split_val) if categorical else (value <= split_val)
        if goes_left:
            X_left.append(X[index])
            y_left.append(y[index])
        else:
            X_right.append(X[index])
            y_right.append(y[index])

    return X_left, X_right, y_left, y_right


In [None]:
def find_best_split(X, y, split_attribute):
    """Find the value of one attribute whose split gives the highest gain.

    Every distinct value in the column is tried as a split point via
    ``partition_classes`` and scored with ``information_gain``.

    Args:
        X: 2-D sequence/array of samples by attributes.
        y: Labels aligned with the rows of ``X``.
        split_attribute: Column index of the attribute to examine.

    Returns:
        ``(best_split_val, best_info_gain)``. When no candidate achieves a gain
        above zero, the first value returned by ``np.unique`` for the column
        and a gain of ``0`` are returned.
    """
    X = np.array(X)
    candidates = np.unique(X[:, split_attribute])

    # Defaults used when no candidate beats a gain of zero.
    best_split_val = candidates[0]
    best_info_gain = 0

    for candidate in candidates:
        _, _, y_left, y_right = partition_classes(X, y, split_attribute, candidate)
        gain = information_gain(y, [y_left, y_right])
        if gain > best_info_gain:
            best_split_val, best_info_gain = candidate, gain

    return best_split_val, best_info_gain


In [None]:
def find_best_feature(X, y):
    """Pick the attribute (and its split value) with the highest gain.

    Args:
        X: Sequence of samples; ``len(X[0])`` gives the number of attributes.
        y: Labels aligned with the rows of ``X``.

    Returns:
        ``(best_feature, best_split_val)``: the column index and the split
        value reported by ``find_best_split`` for it. Both are ``0`` when no
        attribute achieves a gain above zero.
    """
    best_feature, best_split_val, best_info_gain = 0, 0, 0
    n_features = len(X[0])

    for feature_index in range(n_features):
        split_val, gain = find_best_split(X, y, feature_index)
        # Strict comparison: on ties the earliest attribute is kept.
        if gain > best_info_gain:
            best_feature, best_split_val, best_info_gain = feature_index, split_val, gain

    return best_feature, best_split_val


In [None]:
class MyDecisionTree(object):
    """Binary decision tree stored as nested dictionaries.

    An internal node is a dict with a single key, the question string
    ``"<feature index> <= <value>"`` (non-string split) or
    ``"<feature index> == <value>"`` (string split), mapped to the list
    ``[yes_subtree, no_subtree]``. A leaf is a bare class label.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which growth stops and a majority-label leaf
                is produced. ``None`` means no depth limit.
        """
        self.tree = {}
        # Last internal node visited by predict(); predict(record, flag=0)
        # resumes its walk from here.
        self.residual_tree = {}
        self.max_depth = max_depth

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (first one on ties)."""
        unique_labels, counts = np.unique(y, return_counts=True)
        return unique_labels[counts.argmax()]

    def fit(self, X, y, depth=0):
        """Grow the tree from ``X``/``y`` and store it in ``self.tree``.

        Args:
            X: 2-D sequence/array of samples by attributes.
            y: Labels aligned with the rows of ``X``.
            depth: Depth assigned to the root; defaults to ``0``.

        Returns:
            The root of the fitted tree: a nested dict, or a bare label when
            the data does not get split at all.
        """
        node = self._build(X, y, depth)
        # Assigned once, at the root, so that a tree consisting of a single
        # leaf is stored as well.
        self.tree = node
        return node

    def _build(self, X, y, depth):
        """Recursively build and return the subtree for ``X``/``y``."""
        # Use the instance's own limit rather than a same-named global.
        depth_reached = self.max_depth is not None and depth >= self.max_depth
        if len(np.unique(y)) == 1 or depth_reached:
            return self._majority_label(y)

        best_feat, _ = find_best_feature(X, y)
        # find_best_feature reports a placeholder split of 0 when nothing has
        # positive gain; asking find_best_split again always yields a value
        # taken from the chosen column.
        best_split, _ = find_best_split(X, y, best_feat)
        X_left, X_right, y_left, y_right = partition_classes(X, y, best_feat, best_split)

        # A one-sided split cannot separate the labels any further, and
        # recursing into the empty side would fail.
        if len(y_left) == 0 or len(y_right) == 0:
            return self._majority_label(y)

        if isinstance(best_split, str):
            question = "{} == {}".format(best_feat, best_split)
        else:
            question = "{} <= {}".format(best_feat, best_split)

        yes_answer = self._build(X_left, y_left, depth + 1)
        no_answer = self._build(X_right, y_right, depth + 1)

        # Both branches agree: the question adds nothing, so collapse the node.
        if yes_answer == no_answer:
            return yes_answer
        return {question: [yes_answer, no_answer]}

    def predict(self, record, flag=1):
        """Return the label the tree assigns to one sample.

        Args:
            record: Indexable sample; attribute values are looked up by the
                integer feature index stored in each question.
            flag: ``1`` (default) starts at the root ``self.tree``; any other
                value resumes from ``self.residual_tree``.

        Returns:
            The class label stored at the leaf that is reached.

        Raises:
            ValueError: If the walk starts at an empty dict (tree not fitted)
                or a question uses an unknown comparison operator.
        """
        node = self.tree if flag == 1 else self.residual_tree

        while isinstance(node, dict):
            if not node:
                raise ValueError("the decision tree has not been fitted")
            self.residual_tree = node
            question = next(iter(node))
            # Split on the first two single spaces only, so a categorical
            # value keeps its own leading or embedded spaces.
            feature, comparison, value = question.split(" ", 2)
            observed = record[int(feature)]

            if comparison == "==":
                matches = observed == value
            elif comparison == "<=":
                matches = observed <= float(value)
            else:
                raise ValueError("unknown comparison in question: %r" % question)

            yes_answer, no_answer = node[question]
            node = yes_answer if matches else no_answer

        return node

In [None]:
def DecisionTreeEvaluation(id3, X, y, verbose=False):
    """Return the fraction of samples in ``X`` that ``id3`` labels as in ``y``.

    Args:
        id3: Object exposing ``predict(record)``.
        X: Iterable of samples passed one at a time to ``id3.predict``.
        y: True labels, paired with the predictions by position.
        verbose: When true, also print the accuracy with four decimals.

    Returns:
        The accuracy as a float.
    """
    predictions = [id3.predict(row) for row in X]
    matches = [guess == actual for guess, actual in zip(predictions, y)]

    accuracy = float(matches.count(True)) / float(len(matches))
    if verbose:
        print("accuracy: %.4f" % accuracy)
    return accuracy

In [None]:
# Column labels for the headerless training CSV, in file order.
column_names = [
    "age",
    "workclass",
    "fnlwg",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "country",
    "target",
]
# Read the training file and discard the "fnlwg" column in one expression.
df_train = pd.read_csv("adult_data.csv", names=column_names, header=None).drop(columns="fnlwg")
df_train.head()

In [None]:
# Column labels for the headerless test CSV, in file order.
column_names = [
    "age",
    "workclass",
    "fnlwg",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "country",
    "target",
]
# Read the test file and discard the "fnlwg" column in one expression.
df_test = pd.read_csv("adult_test.csv", names=column_names, header=None).drop(columns="fnlwg")
df_test.head()

In [None]:
# Drop the first row of df_test (selected by position).
# NOTE(review): df_test is reassigned from itself, so each re-run of this cell
# removes one more row; run it exactly once after loading.
df_test = df_test.iloc[1:]
df_test.head()

In [None]:
# Convert the 'age' column of df_test to a numeric dtype.
df_test['age'] = pd.to_numeric(df_test['age'])

In [None]:
# Print pandas' concise summary of df_test.
df_test.info()

In [None]:
# Features are every column except the last; the last column is the label.
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]
print(y_train.head())
# Replace the raw label values with integer codes.
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(y_train)
print(y_train)

In [None]:
# Features are every column except the last; the last column is the label.
X_test = df_test.iloc[:,: -1]
y_test = df_test.iloc[:,-1]

# NOTE(review): a fresh encoder is fitted on the test labels, so the integer
# codes agree with the training codes only if both label sets sort into the
# same order — confirm against the data.
label_enc = LabelEncoder()
y_test = label_enc.fit_transform(y_test)
y_test

In [None]:
# Plain NumPy views of the feature frames for the hand-written tree.
X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()
max_depth = 3
initial_depth = 0
id3_dt = MyDecisionTree(max_depth)
print("fitting the decision tree")
id3_dt.fit(X_train_np, y_train, initial_depth)
# Score on the test split: the samples and the labels must come from the same
# set, otherwise predictions are compared against unrelated rows.
DecisionTreeEvaluation(id3_dt, X_test_np, y_test, True)

In [None]:
# Display the tree attribute of the fitted model.
id3_dt.tree

In [None]:
# Show the first three rows of the training features.
X_train.head(3)

In [None]:
# Work on a copy so X_train keeps its original values.
X_train_encoded = X_train.copy()
# Replace each listed column with integer codes from its own LabelEncoder.
for key in ['workclass', "education", 'marital_status',
            "occupation", "relationship", "race", "sex", 'country']:
    label_enc_x = LabelEncoder()
    X_train_encoded[key] = label_enc_x.fit_transform(X_train[key])

In [None]:
# Work on a copy so X_test keeps its original values.
X_test_encoded = X_test.copy()
for key in ['workclass', "education", 'marital_status',
            "occupation", "relationship", "race", "sex", 'country']:
    # Fit on the training column so every category receives the same integer
    # code in both splits; an encoder fitted on the test column alone numbers
    # the categories differently whenever the two sets of values differ.
    # transform() raises ValueError for a category absent from training.
    label_enc_x = LabelEncoder()
    label_enc_x.fit(X_train[key])
    X_test_encoded[key] = label_enc_x.transform(X_test[key])

In [None]:
# Display the unencoded training features.
X_train

In [None]:
# Reference model: scikit-learn's tree with the entropy criterion and the same
# max_depth value, fitted on the integer-encoded training features.
clf = tree.DecisionTreeClassifier( criterion='entropy', max_depth = max_depth)
clf.fit(X_train_encoded, y_train)
# Predict the encoded test features and report accuracy against y_test.
y_pred = clf.predict(X_test_encoded)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Column labels of X_train, used to name the features in the rendered tree.
feature_names = list(X_train)

# Write the fitted sklearn tree as Graphviz DOT text into an in-memory buffer,
# build a graph from it, save it as a PNG and display the image inline.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, feature_names = feature_names,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# NOTE(review): the file is named 'Iris_tree.png' although this notebook loads
# adult_data.csv / adult_test.csv — confirm the intended file name.
graph.write_png('Iris_tree.png')
Image(graph.create_png())