In [24]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Sample dataset (replace with your project data)
data = {
    'Age': ['<=30', '<=30', '31-40', '>40', '>40', '>40', '31-40', '<=30', '<=30', '>40', '<=30', '31-40', '31-40', '>40'],
    'Income': ['High', 'High', 'High', 'Medium', 'Low', 'Low', 'Low', 'Medium', 'Low', 'Medium', 'Medium', 'Medium', 'High', 'Medium'],
    'Student': ['No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No'],
    'Credit_Rating': ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', 'Excellent', 'Excellent', 'Fair', 'Fair', 'Fair', 'Excellent', 'Fair', 'Fair', 'Excellent'],
    'Buys_Computer': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)

# Convert categorical columns to integer codes using Label Encoding.
# A separate encoder is fitted for every column and kept in `label_encoders`,
# so each column's category <-> code mapping stays available afterwards
# (e.g. label_encoders['Age'].classes_ or .inverse_transform(...)).
# Re-fitting one shared encoder would overwrite the mapping on every iteration
# and leave only the last column decodable.
label_encoders = {}

for column in df.columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Separate features (X) and target (y)
X = df.drop(columns=['Buys_Computer'])
y = df['Buys_Computer']

# Split the data into training and testing sets.
# test_size=0.2 holds out 20% of the rows; random_state=42 makes the split repeatable.
Tr_X, Te_X, Tr_y, Te_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to calculate entropy
def calculate_entropy(data):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    Args:
        data: Class labels as a pandas Series or any array-like that
            ``pd.Series`` accepts (list, tuple, NumPy array, ...).

    Returns:
        float: ``-sum(p * log2(p))`` over the observed classes. Missing
        labels (NaN/None) are ignored. The result is ``0.0`` when there are
        no labels left or when only one class is present.
    """
    # Drop missing labels up front: a NaN never compares equal to itself, so
    # counting it via equality yields p == 0 and 0 * log2(0) == NaN.
    labels = pd.Series(data).dropna()
    if labels.empty:
        return 0.0

    # One pass over the data gives the relative frequency of every class.
    probabilities = labels.value_counts(normalize=True).to_numpy()

    # Categorical data reports unobserved categories with frequency 0;
    # they contribute nothing to the entropy and must not reach log2.
    probabilities = probabilities[probabilities > 0]

    # Subtract from 0.0 (instead of negating) so a pure node yields 0.0, not -0.0.
    return float(0.0 - np.sum(probabilities * np.log2(probabilities)))

# Function to calculate information gain
def calculate_information_gain(data, feature, target):
    """Return the information gain obtained by splitting `target` on `data`.

    The gain is the entropy of `target` minus the size-weighted entropy of
    the target subsets produced by each distinct value in `data`.

    Args:
        data: Series holding one feature's values.
        feature: Name of the feature; accepted but not used in the computation.
        target: Series of class labels. Subsets are selected through the
            index labels of `data`, so both Series must share the same index.

    Returns:
        Entropy before the split minus the weighted entropy after it.
    """
    parent_entropy = calculate_entropy(target)
    n_rows = len(data)

    def _weighted_branch_entropy(category):
        # Index labels of the rows whose feature value equals this category.
        rows = data[data == category].index
        return (len(rows) / n_rows) * calculate_entropy(target[rows])

    children_entropy = sum(
        _weighted_branch_entropy(category) for category in data.unique()
    )
    return parent_entropy - children_entropy

# Calculate the information gain of each feature with respect to the target.
# NOTE: the gains are computed on the full dataset (X, y), while the model
# below is trained on the training split only.
features = X.columns  # Use the column names of X as features
information_gains = {}

for feature in features:
    information_gains[feature] = calculate_information_gain(X[feature], feature, y)
    print(f"Information Gain for {feature}: {information_gains[feature]}")

# Find the feature with the highest information gain (the root node)
root_node = max(information_gains, key=information_gains.get)

# Look up the winning gain directly instead of scanning the values a second time
highestinfo_gain = information_gains[root_node]

print(f"The first feature to select for constructing the decision tree is: {root_node}")
print(f"The highest information gain is: {highestinfo_gain}")

# Create and fit the Decision Tree model.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, its depth and the reported accuracies are reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(Tr_X, Tr_y)

# Calculate the accuracy on the training set
trainaccuracy = model.score(Tr_X, Tr_y)
print(f"Training Set Accuracy: {trainaccuracy}")

# Calculate the accuracy on the test set
testaccuracy = model.score(Te_X, Te_y)
print(f"Test Set Accuracy: {testaccuracy}")

# Calculate the depth of the constructed tree
treedepth = model.get_depth()
print(f"The depth of the constructed Decision Tree is: {treedepth}")

# Visualize the constructed tree; feature_names labels each split with the
# column name instead of the default positional "x[i]" placeholder.
plt.figure(figsize=(20, 10))
plot_tree(model, filled=True, feature_names=list(X.columns))
plt.show()



Information Gain for Age: 0.24674981977443933
Information Gain for Income: 0.02922256565895487
Information Gain for Student: 0.15183550136234159
Information Gain for Credit_Rating: 0.10224356360985076
The first feature to select for constructing the decision tree is: Age
The highest information gain is: 0.24674981977443933
Training Set Accuracy: 1.0
Test Set Accuracy: 1.0
The depth of the constructed Decision Tree is: 4
