In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import plotly.express as px
import pprint



In [None]:

import warnings
# Suppress every Python warning for the rest of the session (the filter is global,
# so library deprecation/runtime warnings raised by later cells are hidden too).
warnings.filterwarnings('ignore')

# Load dataset
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot be
# re-run on another machine without editing this line.
df = pd.read_csv(r'E:\247\ML\VS_Code_ML\breast_cancer_data.csv')

# Plain-text preview of the first five rows.
print(df.head())


In [None]:
# Quick structural overview: dimensions, both ends of the table, dtypes, column names.
print("Shape:", df.shape)
for label, preview in (("Head:", df.head()), ("Tail:", df.tail())):
    print(label)
    print(preview)
df.info()

print(df.columns)

In [None]:

# Column dtypes, then summary statistics in both orientations,
# each followed by a 100-character rule.
separator = '--' * 50
for summary in (df.dtypes, df.describe(), df.describe().T):
    print(summary)
    print(separator)


In [None]:

# Missing values: absolute count per column, then the percentage for
# every column that has at least one gap.
null_counts = df.isnull().sum()
print("cheking missing values ", null_counts)
print('--'*50)
missing_values = null_counts / len(df) * (100)
print(missing_values[missing_values > 0])

In [None]:
print("Dropping the redundant\n ")

# Remove the identifier column and the 'Unnamed: 32' column.
# `errors='ignore'` keeps the cell idempotent: re-running it (or loading a CSV that
# lacks one of these columns) no longer raises KeyError. The result is re-assigned
# instead of mutated in place, and the redundant `axis=1` is gone because
# `columns=` already selects the axis.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

df.shape

In [None]:
# Output label / Target variable / Y-label : data distribution 

# pie-plot : proportion of M v/s B

# Share of each diagnosis class (M vs B) in the dataset.
px.pie(
    df,
    names='diagnosis',
    color='diagnosis',
    color_discrete_sequence=['#007500', '#5CFF5C'],
    title="Data Distribution",
)


In [None]:
# Inferences :

# dataset is imbalanced (B : M ≈ 63:37)

# there are more cases of benign tumors than malignant tumors

# for imbalanced datasets, accuracy can be a misleading metric

# for example, if 90% of the cases are benign, a model that always predicts "benign" still scores 90% accuracy while never detecting a malignant case

# in such cases, we need "Balanced accuracy"

In [None]:
# Visually compare the distribution of each feature for malignant versus benign
# tumours: for a given feature, do its values tend to differ between the two classes?
# Only the first five feature columns (everything except the target) are plotted.
for column in df.columns.drop("diagnosis")[:5]:
    fig = px.box(
        df,
        x='diagnosis',
        y=column,
        color='diagnosis',
        color_discrete_sequence=['#007500', '#5CFF5C'],
        orientation='v',
    )
    fig.show()

In [None]:
# diagnosis : M or B : categorical
# encode    : 1 or 0 : M = 1, B = 0

# Encode only while the column still holds the raw labels. Without this guard a
# second run of the cell compares the already-encoded 0/1 values to 'M' and
# silently turns every label into 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# Pairwise correlation between all columns (values range from -1 to 1).
corr = df.corr()

plt.figure(figsize = (20,20))

# Annotated heatmap of the correlation matrix.
sns.heatmap(corr , cmap='viridis_r' , annot=True)

plt.show()

In [None]:
# Display the full pairwise correlation table (Pearson is the pandas default).
df.corr(method="pearson")

In [None]:

# Filter step: keep only the features that are good enough predictors of the target.

# Absolute correlation of every column with the target.
cor_target = corr['diagnosis'].abs()

# Keep the columns whose absolute correlation exceeds 0.25
# (a user-chosen cut-off, i.e. a hyper-parameter of this selection step).
relevant_features = cor_target[cor_target > 0.25]

# Collect the surviving column names ...
names = list(relevant_features.index)

# ... and drop the target variable itself from the result.
names.remove("diagnosis")

pprint.pprint(names)

In [None]:
# Feature matrix: one column per selected feature.
x = df[names].to_numpy()
# Target as a column vector: the 1-D label array of shape (n_samples,)
# is reshaped to (n_samples, 1).
y = df['diagnosis'].to_numpy().reshape(-1, 1)

In [None]:
print(
    "Input features are:", x.shape,
    "Output Label shape is: ", y.shape,
)

In [None]:
# we need to scale

# Standardize / Z-score normalization
# apply on X

def scale(X):
    """
    Standardise each column to zero mean and unit variance (z-score).

    Parameters : X (array-like of shape (n_samples, n_features))
    Returns : numpy.ndarray of floats with the same shape as X

    Columns with zero variance are mapped to all zeros instead of NaN.
    """
    X = np.asarray(X, dtype=float)
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant column has std == 0; dividing by it would give 0/0 = NaN.
    # Its centred values are already 0, so dividing by 1 keeps them at 0.
    safe_std = np.where(std == 0, 1.0, std)
    x_scaled = (X - mean) / safe_std
    return x_scaled

# Standardise the whole feature matrix; the scaled `X` is what the later split consumes.
# NOTE(review): mean/std are computed over all rows before the train/test split, so
# rows that end up in the test set influence the scaling — consider fitting the
# statistics on the training split only.
X= scale(x)

Model implementation

In [None]:
# Node Class for Decision Tree

class Node:
    """
    A single vertex of the decision tree.

    A decision node carries the split (`feature`, `threshold`), its two children
    (`left`, `right`) and the `gain` achieved by the split. A leaf node carries
    only `value`, the predicted class label.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, gain=None, value=None):
        """
        Store the node's attributes; every argument defaults to None.

        Parameters:
        - feature: index of the feature this node splits on.
        - threshold: value the feature is compared against.
        - left: child Node for the left branch.
        - right: child Node for the right branch.
        - gain: information gain obtained from the split.
        - value: class label when the node is a leaf.
        """
        # The question asked at a decision node.
        self.feature, self.threshold = feature, threshold
        # Links to the sub-trees.
        self.left, self.right = left, right
        # Split quality, and the prediction stored at a leaf.
        self.gain, self.value = gain, value

"""
Explanation:

- self.feature & self.threshold  
  Used by **decision nodes** to store the question being asked.  
  Example: "Is radius_mean < 15.5?"

- self.left & self.right  
  Used by decision nodes to point to child nodes.  
  These act like **pointers** in a tree.

- self.value  
  Used by **leaf nodes** to store the class label (final prediction).  
  Example: 0 (Benign) or 1 (Malignant).

- self.gain  
  Used by decision nodes to store the **Information Gain** (or **Gini reduction**) from the split.
"""








In [None]:
'''Explanation :
self.threshold = threshold
self.feature = feature
The above two are used by Decision Nodes.
They store the question being asked at this node
For example, "Is the radius_mean < 15.5 ? "
self.left = left and self.right = right
Used by decision nodes to point to the left and right child nodes.
They are also called pointer nodes.
'''

In [None]:
# Building the Decision Tree

class DecisionTree:
    '''
    Binary decision tree classifier grown greedily by maximising information gain
    (entropy reduction).

    Internally the tree works on a single 2-D array whose last column is the
    label; `fit` builds that array from X and y.
    '''

    def __init__(self, min_samples=2, max_depth=3):
        '''
        Parameters:
        - min_samples: minimum number of rows a node needs to be considered for a split.
        - max_depth: maximum number of split levels between the root and any leaf.

        Both hyper-parameters limit the growth of the tree to prevent overfitting.
        '''
        self.min_samples = min_samples
        self.max_depth = max_depth
        # Root Node of the fitted tree; stays None until fit() is called.
        self.root = None

    def split_data(self, dataset, feature, threshold):
        '''
        Splits the given dataset based on the feature and threshold.

        Parameters:
        - dataset: 2-D array to split (rows are samples).
        - feature: index of the column to split on.
        - threshold: the threshold value for the split.

        Returns:
        left_dataset : rows whose feature value is less than or equal to the threshold
        right_dataset : rows whose feature value is greater than the threshold
        '''
        dataset = np.asarray(dataset)
        # One vectorised comparison replaces the per-row Python loop; this method
        # runs once per candidate threshold, so it dominates training time.
        goes_left = dataset[:, feature] <= threshold
        return dataset[goes_left], dataset[~goes_left]

    def entropy(self, y):
        '''
        Computes the Shannon entropy (in bits) of the given labels.
        Entropy measures the impurity or disorder of the label set.

        Parameters:
        - y: 1-D array of labels.

        Returns : float : entropy value; 0.0 for an empty or single-class input.
        '''
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        # Frequency of every distinct label; all labels contribute to the sum.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / y.size
        # `+ 0.0` turns the -0.0 produced for a pure node into 0.0.
        return float(-np.sum(probabilities * np.log2(probabilities))) + 0.0

    def information_gain(self, parent, left, right):
        '''
        Computes the information gain from splitting the parent labels into two subsets.

        Parameters:
        - parent: labels before the split.
        - left: labels that went to the left child.
        - right: labels that went to the right child.

        Returns : float : parent entropy minus the size-weighted entropy of the children.
        '''
        # Pre-split impurity.
        parent_entropy = self.entropy(parent)
        # Each child is weighted by the share of the parent's rows it received.
        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        # Post-split impurity.
        weighted_entropy = (weight_left * self.entropy(left)) + (weight_right * self.entropy(right))
        return parent_entropy - weighted_entropy

    def best_split(self, dataset, num_samples, num_features):
        """
        Finds the best split for the given dataset.

        Args:
        dataset (ndarray): The dataset to split; the last column holds the labels.
        num_samples (int): The number of samples in the dataset (currently unused).
        num_features (int): The number of feature columns to search.

        Returns:
        dict: best split 'feature' index, 'threshold', 'gain', 'left_dataset' and
              'right_dataset'. When no threshold separates the rows into two
              non-empty parts, 'gain' stays at -1 and the other entries are None.
        """
        best_split = {'gain': -1, 'feature': None, 'threshold': None,
                      'left_dataset': None, 'right_dataset': None}
        # Labels of the parent node do not depend on the candidate split.
        y = dataset[:, -1]
        for feature_index in range(num_features):
            # Every distinct value of the feature is a candidate threshold.
            thresholds = np.unique(dataset[:, feature_index])
            for threshold in thresholds:
                left_dataset, right_dataset = self.split_data(dataset, feature_index, threshold)
                # A split that leaves one side empty separates nothing; skip it.
                if len(left_dataset) and len(right_dataset):
                    left_y, right_y = left_dataset[:, -1], right_dataset[:, -1]
                    information_gain = self.information_gain(y, left_y, right_y)
                    # Strict '>' keeps the first split found among equally good ones.
                    if information_gain > best_split["gain"]:
                        best_split["feature"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["left_dataset"] = left_dataset
                        best_split["right_dataset"] = right_dataset
                        best_split["gain"] = information_gain
        return best_split

    def calculate_leaf_value(self, y):
        """
        Calculates the most occurring value in the given y values.

        Args:
            y (iterable): The labels that reached the leaf.

        Returns:
            The most occurring value; on a tie, the value that appears first in y.
        """
        y = list(y)
        most_occuring_value = max(y, key=y.count)
        return most_occuring_value

    def build_tree(self, dataset, current_depth=0):
        """
        Recursively builds a decision tree from the given dataset.

        Args:
        dataset (ndarray): The dataset to build the tree from (labels in the last column).
        current_depth (int): Depth of the node being built; the root is at depth 0.

        Returns:
        Node: The root node of the (sub)tree built from `dataset`.
        """
        X, y = dataset[:, :-1], dataset[:, -1]
        n_samples, n_features = X.shape
        # Split only while the node is large enough and another level is allowed.
        # '<' (not '<=') keeps every leaf at depth <= max_depth.
        if n_samples >= self.min_samples and current_depth < self.max_depth:
            best_split = self.best_split(dataset, n_samples, n_features)
            # Require a strictly positive gain: 0 means the split does not reduce
            # impurity, and the sentinel -1 means no valid split exists at all
            # (its child datasets are None, so recursing would fail).
            if best_split["gain"] > 0:
                left_node = self.build_tree(best_split["left_dataset"], current_depth + 1)
                right_node = self.build_tree(best_split["right_dataset"], current_depth + 1)
                return Node(best_split["feature"], best_split["threshold"],
                            left_node, right_node, best_split["gain"])

        # Stopping condition reached: predict the majority class of this node.
        leaf_value = self.calculate_leaf_value(y)
        return Node(value=leaf_value)

    def fit(self, X, y):
        """
        Builds and fits the decision tree to the given X and y values.

        Args:
        X (ndarray): The feature matrix of shape (n_samples, n_features).
        y (ndarray): The target values, shape (n_samples,) or (n_samples, 1).
        """
        X = np.asarray(X)
        # Accept a flat label vector as well as a column vector.
        y = np.asarray(y).reshape(-1, 1)
        dataset = np.concatenate((X, y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        """
        Predicts the class labels for each instance in the feature matrix X.

        Args:
        X (ndarray): The feature matrix to make predictions for.

        Returns:
        numpy.ndarray: The predicted class label for every row of X.
        """
        predictions = [self.make_prediction(x, self.root) for x in X]
        # Return the array itself; the conversion result used to be discarded.
        return np.array(predictions)

    def make_prediction(self, x, node):
        """
        Traverses the decision tree to predict the target value for the given feature vector.

        Args:
        x (ndarray): The feature vector to predict the target value for.
        node (Node): The current node being evaluated.

        Returns:
        The predicted target value for the given feature vector.
        """
        # A node that stores a value is a leaf. `is not None` is an identity check,
        # so a legitimate label of 0 is still recognised as a leaf value.
        if node.value is not None:
            return node.value
        # Decision node: follow the branch selected by the stored feature/threshold.
        if x[node.feature] <= node.threshold:
            return self.make_prediction(x, node.left)
        return self.make_prediction(x, node.right)


In [None]:
# Evaluation 

# X_train,y_train, X_test,y_test = train_test_split(X, y, random_state=41, test_size=0.2)

def train_test_split(X, y, random_state=41, test_size=0.2):
    """
    Shuffle the rows reproducibly and divide them into a training and a test part.

    Parameters:
        X (numpy.ndarray): Features array of shape (n_samples, n_features).
        y (numpy.ndarray): Target array with one entry (or row) per sample.
        random_state (int): Seed for NumPy's global random number generator. Default is 41.
        test_size (float): Proportion of samples to include in the test set. Default is 0.2.

    Returns:
        Tuple[numpy.ndarray]: A tuple containing X_train, X_test, y_train, y_test.
    """
    # Rows are samples.
    total_rows = X.shape[0]

    # Seeding the global generator makes the permutation below repeatable
    # (and also fixes the global random state for whatever runs afterwards).
    np.random.seed(random_state)
    order = np.random.permutation(np.arange(total_rows))

    # Number of test rows; the fraction is truncated towards zero.
    n_test = int(total_rows * test_size)

    # The first `n_test` shuffled positions form the test set, the rest the training set.
    test_rows, train_rows = order[:n_test], order[n_test:]

    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]




In [None]:
def accuracy(y_true, y_pred):
    """
    Computes the accuracy of a classification model.

    Parameters:
    ----------
        y_true (array-like): True labels for each data point; any shape with
            one label per sample, e.g. (n,) or (n, 1).
        y_pred (array-like): Predicted labels for each data point, same number
            of samples as y_true.

    Returns:
    ----------
        float: The fraction of samples whose prediction equals the true label.

    Raises:
    ----------
        ValueError: If y_true and y_pred contain a different number of labels.
    """
    # Flatten BOTH inputs: comparing a flat (n,) array with an (n, 1) column
    # broadcasts to an (n, n) matrix and silently yields a wrong score.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of samples, "
            f"got {y_true.size} and {y_pred.size}"
        )
    total_samples = len(y_true)
    # Number of positions where the prediction matches the actual label exactly.
    correct_predictions = np.sum(y_true == y_pred)
    return (correct_predictions / total_samples)

In [None]:
def balanced_accuracy(y_true, y_pred):
    """Calculate the balanced accuracy for a binary or multi-class classification problem.

    Balanced accuracy is the recall (true positive rate) of each class, averaged
    over the classes present in y_true. In the binary case this equals
    (sensitivity + specificity) / 2.

    Parameters
    ----------
        y_true (array-like): True labels for each data point; (n,) or (n, 1).
        y_pred (array-like): Predicted labels for each data point, same number
            of samples as y_true.

    Returns
    -------
        balanced_acc : float, the balanced accuracy of the model

    Raises
    ------
        ValueError: If y_true and y_pred contain a different number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of samples, "
            f"got {y_true.size} and {y_pred.size}"
        )

    # Iterate over the labels that actually occur rather than range(n_classes),
    # so labels need not be exactly 0..n-1 and every class has at least one sample.
    recalls = []
    for label in np.unique(y_true):
        is_label = y_true == label
        # Recall of this class: TP / (TP + FN).
        recalls.append(np.mean(y_pred[is_label] == label))

    balanced_acc = float(np.mean(recalls))

    return balanced_acc

In [None]:
# Hold out 20% of the scaled rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41
)

In [None]:
# sklearn implementation

from sklearn.tree import DecisionTreeClassifier

# Baseline model: scikit-learn's decision tree with its default hyper-parameters.
decision_tree_classifier = DecisionTreeClassifier()

# Fit on the training split, then label the held-out rows
# (`fit` returns the estimator itself, so the two calls can be chained).
predictions = decision_tree_classifier.fit(X_train, y_train).predict(X_test)

# Evaluate with the hand-written metrics defined earlier in the notebook.
print(f" Model's Accuracy: {accuracy(y_test, predictions)}")
print(f"Model's Balanced Accuracy: {balanced_accuracy(y_test, predictions)}")

In [None]:
# Task by chirantan sir 

# classification report 
# confusion matrix
# TP,TN,FP,FN
# precision , recall