In [None]:
# Group Number: 2
# Group Members:
# Kartik Pontula (20CS10031)
# Adarsh (19EC39002)
# Project Title: Credit Approval using Decision Tree based Learning Model
# Project Code: CADT

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [None]:
# Function definitions
def maxCountArg(y):
	"""Return the most frequent value in ``y``.

	``np.unique`` yields the distinct values in sorted order and ``argmax``
	picks the first maximum, so a tie goes to the smallest value.
	"""
	values, frequencies = np.unique(y, return_counts=True)
	return values[frequencies.argmax()]

def calculate_entropy(y):
	"""Return the Shannon entropy (in bits) of the label collection ``y``.

	Computed as ``-sum(p * log2(p))`` over the relative frequency ``p`` of
	each distinct label.
	"""
	_, counts = np.unique(y, return_counts=True)
	p = counts / len(y)
	return -(p * np.log2(p)).sum()

def gini_index(labels):
	"""Return the Gini impurity ``1 - sum(p_c ** 2)`` of ``labels``.

	Args:
		labels: Sequence or array of class labels.

	Returns:
		The impurity as a float. An empty input yields 0.0 (nothing to be
		impure about) rather than the maximum impurity of 1.0.
	"""
	total = len(labels)
	if total == 0:
		return 0.0
	# One pass to count every class instead of re-scanning the labels per class.
	_, counts = np.unique(labels, return_counts=True)
	return 1.0 - sum((count / total) ** 2 for count in counts)

In [None]:
# Decision tree implementation
class DecisionTree:
	"""Binary decision tree classifier grown by greedy threshold splits.

	The fitted tree is stored in ``self.tree`` as nested dicts. A leaf is
	``{'class': label}``. An internal node is
	``{'feature_index': j, 'threshold': t, 'left': node, 'right': node}``;
	samples with ``x[j] <= t`` follow ``'left'``, all others ``'right'``.
	"""

	def __init__(self, max_depth=None, scoring=gini_index):
		"""Configure the tree.

		Args:
			max_depth: Depth at which growth stops and a leaf is emitted;
				``None`` means no depth limit.
			scoring: Impurity function called with an array of labels and
				returning a number (e.g. ``gini_index`` or ``calculate_entropy``).
		"""
		self.max_depth = max_depth
		self.scoring = scoring
		# Populated by fit(); None marks a model that has not been fitted.
		self.tree = None

	def fit(self, X, y):
		"""Grow the tree from samples ``X`` (2-D) and labels ``y`` (1-D).

		Raises:
			ValueError: If ``y`` is empty.
		"""
		# Accept DataFrames / lists as well as arrays: the growth code relies on
		# ``X[:, j]`` and boolean-mask indexing, which need ndarrays.
		X = np.asarray(X)
		y = np.asarray(y)
		if len(y) == 0:
			raise ValueError("Cannot fit a decision tree on an empty dataset")
		self.tree = self._grow_tree(X, y)

	def _grow_tree(self, X, y, depth=0):
		"""Recursively build and return the node dict for the samples ``X, y``."""
		# Stop when the node is pure or the depth limit has been reached.
		if len(np.unique(y)) == 1 or (self.max_depth is not None and depth == self.max_depth):
			return {'class': maxCountArg(y)}

		best_feature, best_threshold = self._best_split(X, y)
		# No candidate split at all (e.g. X has no columns).
		if best_feature is None:
			return {'class': maxCountArg(y)}

		left_indices = X[:, best_feature] <= best_threshold
		# A split that sends every sample to one side separates nothing.
		if left_indices.all() or not left_indices.any():
			return {'class': maxCountArg(y)}
		right_indices = ~left_indices

		left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
		right_subtree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

		return {'feature_index': best_feature, 'threshold': best_threshold,
				'left': left_subtree, 'right': right_subtree}

	def _best_split(self, X, y):
		"""Return ``(feature_index, threshold)`` with the largest impurity reduction.

		Every distinct value of every feature is tried as a ``<=`` threshold.
		On equal gain the first candidate encountered is kept. Returns
		``(None, None)`` when no candidate was evaluated.
		"""
		n_samples, n_features = X.shape
		best_gain = -float('inf')
		best_feature = None
		best_threshold = None

		impurity_parent = self.scoring(y)

		for feature in range(n_features):
			column = X[:, feature]
			for threshold in np.unique(column):
				left_indices = column <= threshold
				right_indices = ~left_indices

				impurity_left = self.scoring(y[left_indices])
				impurity_right = self.scoring(y[right_indices])

				# Parent impurity minus the size-weighted impurity of the children.
				gain = impurity_parent - ((np.sum(left_indices) / n_samples) * impurity_left +
										  (np.sum(right_indices) / n_samples) * impurity_right)

				if gain > best_gain:
					best_gain = gain
					best_feature = feature
					best_threshold = threshold

		return best_feature, best_threshold

	def predict(self, X):
		"""Return an array with the predicted label for each row of ``X``.

		Raises:
			AttributeError: If called before ``fit``.
		"""
		tree = getattr(self, 'tree', None)
		if tree is None:
			raise AttributeError("DecisionTree has no fitted tree; call fit() before predict()")
		return np.array([self._predict_sample(x, tree) for x in np.asarray(X)])

	def _predict_sample(self, sample, node):
		"""Walk from ``node`` down to a leaf for one sample and return its label."""
		while 'class' not in node:
			if sample[node['feature_index']] <= node['threshold']:
				node = node['left']
			else:
				node = node['right']
		return node['class']

	def prune(self, X, y):
		"""Collapse subtrees in place wherever a single leaf is at least as accurate on ``X, y``.

		Raises:
			AttributeError: If called before ``fit``.
		"""
		tree = getattr(self, 'tree', None)
		if tree is None:
			raise AttributeError("DecisionTree has no fitted tree; call fit() before prune()")
		self._prune(tree, np.asarray(X), np.asarray(y))

	def _prune(self, node, X, y):
		"""Bottom-up pruning of ``node`` using only the samples routed to it."""
		# Nothing to do at a leaf. When no sample reaches this node there is no
		# evidence to compare against and no majority class, so leave it untouched.
		if 'class' in node or len(y) == 0:
			return

		left_indices = X[:, node['feature_index']] <= node['threshold']
		right_indices = ~left_indices

		# Children first, so this node is judged against its already-pruned subtree.
		self._prune(node['left'], X[left_indices], y[left_indices])
		self._prune(node['right'], X[right_indices], y[right_indices])

		# X holds exactly the samples routed here, so evaluating from this node
		# gives the same predictions as a full pass from the root.
		subtree_pred = np.array([self._predict_sample(x, node) for x in X])
		subtree_accuracy = self._accuracy(y, subtree_pred)

		majority = maxCountArg(y)
		leaf_accuracy = np.sum(y == majority) / len(y)

		# Replace the subtree with a majority-class leaf unless that loses accuracy.
		if leaf_accuracy >= subtree_accuracy:
			node.clear()
			node['class'] = majority

	def _accuracy(self, y_true, y_pred):
		"""Return the fraction of positions where ``y_true`` equals ``y_pred``."""
		return np.sum(y_true == y_pred) / len(y_true)

In [None]:
# Load the UCI credit-screening data; column names are supplied explicitly
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"
names = [f"A{i}" for i in range(1, 16)] + ["class"]
data = pd.read_csv(url, names=names)

# Missing entries are written as '?' in the file; turn them into NaN
data = data.replace('?', np.nan)

# Numeric attributes: coerce to numbers (unparseable -> NaN), fill the gaps with
# the column median, then bucket the values into three bins labelled 0, 1, 2
continuous_features = ["A2", "A3", "A8", "A11", "A14", "A15"]
for feature in continuous_features:
	data[feature] = pd.to_numeric(data[feature], errors='coerce')
	median = data[feature].median()
	data[feature] = pd.cut(data[feature].fillna(median), bins=3, labels=[0, 1, 2])

# Categorical attributes: one-hot encode, dropping the first level of each
categorical_features = ["A1", "A4", "A5", "A6", "A7", "A9", "A10", "A12", "A13"]
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

# Separate features from the target, then hold out 20% of the rows for testing
X = data.drop(columns='class').values
y = data['class'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the custom tree with entropy as the impurity measure (information gain);
# scoring accepts either calculate_entropy or gini_index
dt = DecisionTree(scoring=calculate_entropy, max_depth=10)
dt.fit(X_train, y_train)

# Predictions of the unpruned tree on both splits
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

# Per-class metrics before pruning
print(
	"Classification Report for Decision Tree (Information Gain):",
	"Train Set:",
	classification_report(y_train, y_pred_train),
	"Test Set:",
	classification_report(y_test, y_pred_test),
	sep="\n",
)

print("*" * 75)

# Prune the tree in place.
# NOTE(review): pruning is scored on the training split itself; a held-out
# validation split is the usual choice for this kind of pruning.
dt.prune(X_train, y_train)

# Predictions of the pruned tree on both splits
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

# Per-class metrics after pruning
print(
	"Classification Report for Decision Tree (Information Gain) after Pruning:",
	"Train Set:",
	classification_report(y_train, y_pred_train),
	"Test Set:",
	classification_report(y_test, y_pred_test),
	sep="\n",
)

Classification Report for Decision Tree (Information Gain):
Train Set:
              precision    recall  f1-score   support

           +       0.91      0.95      0.93       237
           -       0.96      0.93      0.95       315

    accuracy                           0.94       552
   macro avg       0.94      0.94      0.94       552
weighted avg       0.94      0.94      0.94       552

Test Set:
              precision    recall  f1-score   support

           +       0.87      0.74      0.80        70
           -       0.77      0.88      0.82        68

    accuracy                           0.81       138
   macro avg       0.82      0.81      0.81       138
weighted avg       0.82      0.81      0.81       138

***************************************************************************
Classification Report for Decision Tree (Information Gain) after Pruning:
Train Set:
              precision    recall  f1-score   support

           +       0.94      0.92      0.93      

In [None]:
# Fit the custom tree with the Gini index as the impurity measure
dt = DecisionTree(scoring=gini_index, max_depth=10)
dt.fit(X_train, y_train)

# Predictions of the unpruned tree on both splits
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

# Per-class metrics before pruning
print(
	"Classification Report for Decision Tree (Gini Index):",
	"Train Set:",
	classification_report(y_train, y_pred_train),
	"Test Set:",
	classification_report(y_test, y_pred_test),
	sep="\n",
)

print("*" * 75)

# Prune the tree in place.
# NOTE(review): pruning is scored on the training split itself; a held-out
# validation split is the usual choice for this kind of pruning.
dt.prune(X_train, y_train)

# Predictions of the pruned tree on both splits
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

# Per-class metrics after pruning
print(
	"Classification Report for Decision Tree (Gini Index) after Pruning:",
	"Train Set:",
	classification_report(y_train, y_pred_train),
	"Test Set:",
	classification_report(y_test, y_pred_test),
	sep="\n",
)

Classification Report for Decision Tree (Gini Index):
Train Set:
              precision    recall  f1-score   support

           +       0.94      0.95      0.94       237
           -       0.96      0.95      0.96       315

    accuracy                           0.95       552
   macro avg       0.95      0.95      0.95       552
weighted avg       0.95      0.95      0.95       552

Test Set:
              precision    recall  f1-score   support

           +       0.88      0.74      0.81        70
           -       0.77      0.90      0.83        68

    accuracy                           0.82       138
   macro avg       0.83      0.82      0.82       138
weighted avg       0.83      0.82      0.82       138

***************************************************************************
Classification Report for Decision Tree (Gini Index) after Pruning:
Train Set:
              precision    recall  f1-score   support

           +       0.94      0.94      0.94       237
       

In [None]:
# Baseline: scikit-learn's DecisionTreeClassifier with a fixed seed, fitted on the same training split
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

# Predictions on both splits
y_pred_train = dt_clf.predict(X_train)
y_pred_test = dt_clf.predict(X_test)

# Per-class metrics for the baseline
print(
	"Classification Report for scikit-learn DecisionTreeClassifier:",
	"Train Set:",
	classification_report(y_train, y_pred_train),
	"Test Set:",
	classification_report(y_test, y_pred_test),
	sep="\n",
)

Classification Report for scikit-learn DecisionTreeClassifier:
Train Set:
              precision    recall  f1-score   support

           +       0.95      0.99      0.97       237
           -       0.99      0.96      0.98       315

    accuracy                           0.97       552
   macro avg       0.97      0.98      0.97       552
weighted avg       0.97      0.97      0.97       552

Test Set:
              precision    recall  f1-score   support

           +       0.84      0.74      0.79        70
           -       0.76      0.85      0.81        68

    accuracy                           0.80       138
   macro avg       0.80      0.80      0.80       138
weighted avg       0.80      0.80      0.80       138

