## Importing Necessary Packages and Libraries 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt

## Reading Dataset from the Local 

In [2]:
col_names = ['Variance', 'Skewness', 'Curtosis', 'Entropy', 'Class']
data = pd.read_csv(r"D:\1-1 sem\Machine Learning\banknote_dataset\data_banknote_authentication.txt", skiprows=1, header=None, names=col_names)

In [3]:
data.head(10)

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,4.5459,8.1674,-2.4586,-1.4621,0
1,3.866,-2.6383,1.9242,0.10645,0
2,3.4566,9.5228,-4.0112,-3.5944,0
3,0.32924,-4.4552,4.5718,-0.9888,0
4,4.3684,9.6718,-3.9606,-3.1625,0
5,3.5912,3.0129,0.72888,0.56421,0
6,2.0922,-6.81,8.4636,-0.60216,0
7,3.2032,5.7588,-0.75345,-0.61251,0
8,1.5356,9.1772,-2.2718,-0.73535,0
9,1.2247,8.7779,-2.2135,-0.80647,0


# Decision Tree Classification

In [4]:
X_decision_tree = data.iloc[:, :-1].values

In [11]:
Y_decision_tree = data.iloc[:, -1].values

In [12]:
X_train_decision_tree, X_test_decision_tree, y_train_decision_tree, y_test_decision_tree = train_test_split(X_decision_tree, Y_decision_tree, test_size=0.2, random_state=40)

In [17]:
class Node():
    def __init__(self, feature = None, threshold = None, left = None, right = None, *, value = None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        
    def is_leaf_node(self):
        return self.value is not None

In [18]:
def entropy(y):
    unique_labels, label_counts = np.unique(y, return_counts=True)
    probabilities = label_counts / len(y)
    entropy_value = -np.sum(probabilities * np.log2(probabilities))
    return entropy_value

In [19]:
class DecisionTree():
    def __init__(self, min_sample_split = 2, max_depth = 100, no_of_features = None):
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.no_of_features = no_of_features
        self.root = None
        
    def fit(self, X, y):
        self.no_of_features = X.shape[1] if not self.no_of_features else min(self.no_of_features, X.shape[1])
        self.root = self._grow_tree(X, y)
    
    def _grow_tree(self, X, y, depth = 0):
        no_of_samples, no_of_features = X.shape
        no_of_labels = len(np.unique(y))
        
        #checking for the stopping criteria
        if(depth >= self.max_depth or no_of_labels == 1 or no_of_samples < self.min_sample_split):
            leaf_value = self._most_common_label(y)
            return Node(value = leaf_value)
        
        #Selecting an array of feature indices randomly from the given features
        feature_index = np.random.choice(no_of_features, self.no_of_features, replace = False)
        
        #greedy search for best splitting criteria
        best_feature, best_threshold = self._best_criteria(X, y, feature_index)
        left_idxs, right_idxs = self._split(X[:, best_feature], best_threshold)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        
        return Node(best_feature, best_threshold, left, right)
    
    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_index, split_threshold = None, None
        
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)
                
                if gain > best_gain:
                    best_gain = gain
                    split_index = feat_idx
                    split_threshold = threshold
                    
        return split_index, split_threshold
    
    def _information_gain(self, y, X_column, split_threshold):
        #calculate the parents entropy
        parent_entropy = entropy(y)
        
        #generate split
        left_idxs, right_idxs = self._split(X_column, split_threshold)
        
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        
        #calculate [weighted average]E(children)
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l/n)*e_l + (n_r/n)*e_r
        
        #return information gain = E(parent) - [weighted average]E(children)
        information_gain = parent_entropy - child_entropy
        
        return information_gain
        
    def _split(self, X_column, split_threshold):
        #gives an array of all the indices in X_column that follow the splitting criteria
        left_idxs = np.argwhere(X_column <= split_threshold).flatten()
        right_idxs = np.argwhere(X_column > split_threshold).flatten()
        
        return left_idxs, right_idxs
        
    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common
        
    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])
    
    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        
        return self._traverse_tree(x, node.right)

In [20]:
Decision_Tree_Classifier = DecisionTree(max_depth = 10)

In [21]:
Decision_Tree_Classifier.fit(X_train_decision_tree, y_train_decision_tree)

In [22]:
y_predicted_decision_tree = Decision_Tree_Classifier.predict(X_test_decision_tree)

In [23]:
y_pred=pd.DataFrame(y_predicted_decision_tree)

In [24]:
y_test=pd.DataFrame(y_test_decision_tree)

## Evaluation Measurement

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[146   2]
 [  3 124]]


In [26]:
tp = conf_matrix[0][0]
tn = conf_matrix[1][1]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]

In [27]:
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

In [28]:
mismatches =  conf_matrix.sum() - conf_matrix.trace()

In [29]:
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Number of Mismatches:", mismatches)


Accuracy: 0.9818181818181818
Precision: 0.9864864864864865
Recall: 0.9798657718120806
F1 Score: 0.9831649831649831
Number of Mismatches: 5


## Logistic Regression

In [31]:
from sklearn.model_selection import train_test_split
x_logistic_train, x_logistic_test, y_logistic_train, y_logistic_test = train_test_split(X_decision_tree, Y_decision_tree, test_size=0.2, random_state=0)

In [32]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(x_logistic_train)
x_logistic_train_ = sc.transform(x_logistic_train)
x_logistic_test = sc.transform(x_logistic_test)

In [33]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression( random_state=0)
lr.fit(x_logistic_train_, y_logistic_train)

In [34]:
y_pred = lr.predict(x_logistic_test)

In [35]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
conf_matrix = confusion_matrix(y_logistic_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[154   6]
 [  0 115]]


In [38]:
tp = conf_matrix[0][0]
tn = conf_matrix[1][1]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)
mismatches =  conf_matrix.sum() - conf_matrix.trace()

In [39]:
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Number of Mismatches:", mismatches)


Accuracy: 0.9781818181818182
Precision: 0.9625
Recall: 1.0
F1 Score: 0.980891719745223
Number of Mismatches: 6
