In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the dataset from the CSV file in the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='./data.csv')

In [None]:
 # Show only the object-typed columns of the frame.
 dataset.select_dtypes(include=['object'])

In [None]:
# Names of the columns stored as 64-bit integers or 64-bit floats.
dataset.select_dtypes(['int64', 'float64']).columns

In [None]:
# Drop the 'Unnamed: 32' column. errors='ignore' keeps this cell re-runnable:
# once the column is gone, a second run would otherwise raise KeyError.
dataset = dataset.drop(columns='Unnamed: 32', errors='ignore')

# One-hot encode the remaining non-numeric columns, dropping the first level of
# each. dtype=int pins the dummy columns to 0/1 integers regardless of the
# installed pandas version (pandas >= 2.0 would otherwise emit booleans).
dataset = pd.get_dummies(data=dataset, drop_first=True, dtype=int)

In [None]:
# Bar chart of how often each value of the encoded target occurs.
# The series is passed as `x=` explicitly: in seaborn >= 0.12 the first
# positional parameter of countplot is `data`, so a positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=dataset['diagnosis_M'], label='count')

In [None]:
# Copy of the frame without the 'diagnosis_M' column.
dataset_2 = dataset.drop('diagnosis_M', axis=1)

In [None]:
# Correlation of every column in dataset_2 with 'diagnosis_M', drawn as bars.
(
    dataset_2
    .corrwith(dataset['diagnosis_M'])
    .plot.bar(figsize=(20, 10), title=' correlation', rot=45, grid=True)
)

In [None]:
# Pairwise correlation matrix of the encoded frame (Pearson is the pandas default).
corr = dataset.corr(method='pearson')

In [None]:
# Annotated heat map of the correlation matrix on a 20 x 10 inch figure.
plt.figure(figsize=(20, 10))
sns.heatmap(data=corr, annot=True)

In [None]:
# Features: every column except the first and the last one.
x = dataset.iloc[:, 1:-1].to_numpy()
# Target: the last column.
y = dataset.iloc[:, -1].to_numpy()

In [None]:
import numpy as np

class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    The fitted tree is kept in ``self.tree`` as nested tuples
    ``(feature, threshold, left_subtree, right_subtree)``; a leaf is stored as
    the predicted class label itself. Samples with ``x[feature] < threshold``
    follow the left branch, all other samples the right branch.

    Parameters
    ----------
    max_depth : int or None, default=None
        Depth at which a node is turned into a leaf; ``None`` disables the limit.
    min_samples_split : int, default=2
        Nodes holding fewer samples than this are turned into leaves.

    Attributes
    ----------
    tree : tuple, label or None
        Root of the fitted tree (``None`` before ``fit`` or after fitting on
        zero samples).
    feature_order_ : list of int
        Feature index of every split that was actually made, in creation order.
    split_features, split_values : list
        Feature index and threshold of every split, in creation order.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None
        self.feature_order_ = []
        # Feature indices and threshold values recorded at every split.
        self.split_features = []
        self.split_values = []

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def information_gain(self, x, y, split_feature, split_value, entropy_parent=None):
        """Return the entropy reduction obtained by splitting on one threshold.

        Parameters
        ----------
        x : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)
        split_feature : int
            Column of ``x`` to split on.
        split_value : scalar
            Rows with ``x[:, split_feature] < split_value`` go left, rows with
            ``>=`` go right.
        entropy_parent : float, optional
            Pre-computed ``self.entropy(y)``. Lets callers that evaluate many
            thresholds on the same ``y`` avoid recomputing it each time.
        """
        if entropy_parent is None:
            entropy_parent = self.entropy(y)
        column = x[:, split_feature]
        y_left = y[column < split_value]
        y_right = y[column >= split_value]
        n_total = len(y)
        entropy_children = (len(y_left) / n_total) * self.entropy(y_left) + \
                           (len(y_right) / n_total) * self.entropy(y_right)
        return entropy_parent - entropy_children

    def find_best_split(self, x, y):
        """Return ``(feature, threshold)`` with the highest information gain.

        Every unique value of every feature is tried as a threshold. Ties keep
        the first candidate encountered (lowest feature index, then lowest
        value). Returns ``(None, None)`` when ``x`` has no columns.
        """
        best_feature = None
        best_value = None
        best_information_gain = -1
        # The parent entropy is identical for every candidate: compute it once.
        entropy_parent = self.entropy(y)
        for feature in range(x.shape[1]):
            for value in np.unique(x[:, feature]):
                gain = self.information_gain(x, y, feature, value, entropy_parent)
                if gain > best_information_gain:
                    best_feature = feature
                    best_value = value
                    best_information_gain = gain
        return best_feature, best_value

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        # np.unique also handles negative or non-integer labels, which
        # np.bincount rejects.
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def build_tree(self, x, y, depth=0):
        """Recursively grow a (sub)tree for ``x`` / ``y`` and return its root.

        Returns ``None`` for an empty sample set, a class label for a leaf and
        a ``(feature, threshold, left, right)`` tuple for an internal node.
        Each realised split is appended to ``feature_order_``,
        ``split_features`` and ``split_values``.
        """
        if len(y) == 0:
            return None

        if depth == self.max_depth or len(y) < self.min_samples_split:
            return self._majority_label(y)

        # A pure node cannot be improved; skip the exhaustive split search.
        if np.all(y == y[0]):
            return self._majority_label(y)

        best_feature, best_value = self.find_best_split(x, y)

        indices_left = x[:, best_feature] < best_value
        indices_right = x[:, best_feature] >= best_value

        # The best candidate does not separate the samples: make a leaf.
        if np.all(indices_left) or np.all(indices_right):
            return self._majority_label(y)

        # Record only splits that are really part of the tree.
        self.feature_order_.append(best_feature)
        self.split_features.append(best_feature)
        self.split_values.append(best_value)

        left_subtree = self.build_tree(x[indices_left], y[indices_left], depth + 1)
        right_subtree = self.build_tree(x[indices_right], y[indices_right], depth + 1)

        return (best_feature, best_value, left_subtree, right_subtree)

    def fit(self, x, y):
        """Grow the tree on ``x`` (2-D array) and ``y`` (1-D labels); return ``self``.

        The split bookkeeping lists are cleared first so that refitting the
        same instance does not accumulate entries from earlier fits.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        self.feature_order_ = []
        self.split_features = []
        self.split_values = []
        self.tree = self.build_tree(x, y)
        return self

    def predict(self, x):
        """Return a float array with the predicted label of every row of ``x``."""
        x = np.asarray(x)
        y_pred = np.zeros(x.shape[0])
        for i, sample in enumerate(x):
            node = self.tree
            # Internal nodes are tuples; anything else is a leaf label.
            while isinstance(node, tuple):
                feature, value, left_subtree, right_subtree = node
                node = left_subtree if sample[feature] < value else right_subtree
            y_pred[i] = node
        return y_pred


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the samples; random_state=45 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=45,
)


In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Fit a depth-2 tree on the training split.
dt = DecisionTree(max_depth=2, min_samples_split=2)
dt.fit(x_train, y_train)

print("Order of feature selection:", dt.feature_order_)

# List every recorded split, numbered from 1.
split_features = dt.split_features
split_values = dt.split_values
for i, (feature_idx, threshold) in enumerate(zip(split_features, split_values)):
    print("Node {}: split feature={}, split value={}".format(i+1, feature_idx, threshold))

# Accuracy of the tree on the test split.
y_pred = dt.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 for the predictions currently held in y_pred.
precision, recall, f1 = (
    metric_fn(y_test, y_pred)
    for metric_fn in (precision_score, recall_score, f1_score)
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

In [None]:
# Random Forest class for Breast Cancer Detection Dataset

import numpy as np

class RandomForest():
    """Bagging ensemble of ``DecisionTree`` classifiers with majority voting.

    Every tree is trained on a bootstrap sample (drawn with replacement, same
    size as the training set) and sees all features. Votes are counted with
    ``np.bincount``, so predicted labels must be non-negative integers.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to grow.
    max_depth : int or None, default=2
        Passed to every ``DecisionTree``.
    min_samples_split : int, default=2
        Passed to every ``DecisionTree``.
    random_state : int or None, default=None
        Seed for the bootstrap sampling. ``None`` keeps drawing from NumPy's
        global random state, so ``np.random.seed`` still applies.
    """

    def __init__(self, n_estimators=100, max_depth=2, min_samples_split=2, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_state = random_state
        self.trees = []

    def fit(self, x, y):
        """Grow ``n_estimators`` trees on bootstrap samples of ``x`` / ``y``.

        Trees from a previous ``fit`` are discarded first, so refitting never
        yields more than ``n_estimators`` trees. Returns ``self``.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.trees = []
        n_samples = x.shape[0]
        for _ in range(self.n_estimators):
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            X_subset = x[indices]
            y_subset = y[indices]

            # An empty bootstrap (empty training set) cannot train a tree.
            if len(X_subset) > 0 and len(y_subset) > 0:
                tree = DecisionTree(max_depth=self.max_depth,
                                    min_samples_split=self.min_samples_split)
                tree.fit(X_subset, y_subset)
                self.trees.append(tree)
        return self

    def predict(self, x):
        """Return the majority vote of all trees for every row of ``x``.

        Ties go to the smallest label. Raises ``ValueError`` when the forest
        holds no trees.
        """
        if not self.trees:
            raise ValueError("RandomForest has no fitted trees; call fit() first.")
        x = np.asarray(x)

        # One column of predictions per tree.
        y_preds = np.zeros((x.shape[0], len(self.trees)))
        for i, tree in enumerate(self.trees):
            y_preds[:, i] = tree.predict(x)

        y_pred = np.zeros(x.shape[0])
        for i in range(x.shape[0]):
            counts = np.bincount(y_preds[i, :].astype('int'))
            y_pred[i] = np.argmax(counts)

        return y_pred

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (scikit-learn protocol)."""
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'random_state': self.random_state,
        }

    def set_params(self, **params):
        """Set the given attributes on the instance and return ``self``."""
        for param, value in params.items():
            setattr(self, param, value)
        return self

    def score(self, x, y):
        """Return the fraction of rows of ``x`` whose prediction equals ``y``."""
        # Computed with NumPy so the class does not depend on an
        # accuracy_score import made in some other cell.
        y_pred = self.predict(x)
        return float(np.mean(np.asarray(y) == y_pred))

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# 10-fold evaluation of a shallow forest on the full x / y arrays.
kfold = KFold(n_splits=10)

accuracy_scores, f1_scores, precision_scores, recall_scores = [], [], [], []

for train_index, test_index in kfold.split(x):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    rf = RandomForest(n_estimators=100, max_depth=2, min_samples_split=2)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)

    # Store this fold's value of each metric in its own list.
    for metric_fn, bucket in (
        (accuracy_score, accuracy_scores),
        (f1_score, f1_scores),
        (precision_score, precision_scores),
        (recall_score, recall_scores),
    ):
        bucket.append(metric_fn(y_test, y_pred))

# Mean and standard deviation of every metric across the folds.
mean_accuracy, std_accuracy = np.mean(accuracy_scores), np.std(accuracy_scores)
mean_f1, std_f1 = np.mean(f1_scores), np.std(f1_scores)
mean_precision, std_precision = np.mean(precision_scores), np.std(precision_scores)
mean_recall, std_recall = np.mean(recall_scores), np.std(recall_scores)

for caption, value in (
    ('Mean accuracy:', mean_accuracy),
    ('SD of accuracy:', std_accuracy),
    ('Mean F1 score:', mean_f1),
    ('SD of F1 score:', std_f1),
    ('Mean precision:', mean_precision),
    ('SD of precision:', std_precision),
    ('Mean recall:', mean_recall),
    ('SD of recall:', std_recall),
):
    print(caption, value)

In [None]:
from sklearn.metrics import accuracy_score

# Deeper forest (max_depth=20) trained and scored on the splits currently
# bound to x_train / y_train and x_test / y_test.
rf = RandomForest(n_estimators=100, max_depth=20, min_samples_split=2)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values: 100..500 trees, depth 5..25, min_samples_split 2..10.
param_grid = dict(
    n_estimators=list(range(100, 501, 100)),
    max_depth=list(range(5, 26, 5)),
    min_samples_split=list(range(2, 11, 2)),
)

# Exhaustive 10-fold search over the grid, starting from a default forest.
rf = RandomForest()
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(x_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Weighted-average precision, recall and F1 for the predictions in y_pred.
precision, recall, f1 = (
    metric_fn(y_test, y_pred, average='weighted')
    for metric_fn in (precision_score, recall_score, f1_score)
)

print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated scores of a shallow forest on the full x / y arrays.
rf = RandomForest(n_estimators=100, max_depth=2, min_samples_split=2)
scores = cross_val_score(estimator=rf, X=x, y=y, cv=10)


In [None]:
from sklearn.model_selection import cross_validate

# One-at-a-time sweep: each hyperparameter is varied on its own while the
# others stay at the RandomForest defaults.
hyperparameters = {
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 10, 15, 20, 25],
    'min_samples_split': [2, 4, 6, 8, 10]
}

cv = 10

scoring_matrix = ['accuracy', 'f1', 'precision', 'recall']

for hyperparameter_name, hyperparameter_values in hyperparameters.items():

    mean_scores = {metric: [] for metric in scoring_matrix}
    std_scores = {metric: [] for metric in scoring_matrix}

    for value in hyperparameter_values:
        # Build the forest straight from the name being swept. An unsupported
        # name now raises TypeError instead of silently reusing a stale `rf`.
        rf = RandomForest(**{hyperparameter_name: value})

        # A single cross-validation run scores every metric on the same
        # fitted forests, instead of retraining all folds once per metric.
        cv_results = cross_validate(rf, x, y, cv=cv, scoring=scoring_matrix)
        for metric in scoring_matrix:
            fold_scores = cv_results[f'test_{metric}']
            mean_scores[metric].append(np.mean(fold_scores))
            std_scores[metric].append(np.std(fold_scores))

    fig, axs = plt.subplots(1, len(scoring_matrix), figsize=(20, 5), gridspec_kw={'wspace': 0.3})
    fig.suptitle(f"{hyperparameter_name.capitalize()}")

    for i, metric in enumerate(scoring_matrix):
        axs[i].errorbar(hyperparameter_values, mean_scores[metric],
                        yerr=std_scores[metric], fmt='o-', capsize=5)
        axs[i].set_xlabel(hyperparameter_name.capitalize())
        axs[i].set_ylabel(metric)
        axs[i].set_title(f"{metric} vs. {hyperparameter_name.capitalize()}")

    # Save through the figure handle rather than the implicit current figure.
    fig.savefig(f'hyperparameter_tuning_feature{hyperparameter_name}.png')

# Kept from the original cell: saves whichever figure is current at this
# point (the last one created in the loop) under a generic file name.
plt.savefig('hyperparameter_tuning_feature.png', dpi=100)


In [None]:
# Random Forest class for new artificial dataset
class RandomForest_new():
    """Bagged ``DecisionTree`` ensemble that exposes each tree's individual vote.

    Unlike a majority-voting forest, ``predict`` takes ONE sample and returns
    the list of per-tree predictions for it.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to grow.
    max_depth : int or None, default=2
        Passed to every ``DecisionTree``.
    min_samples_split : int, default=2
        Passed to every ``DecisionTree``.
    """

    def __init__(self, n_estimators=100, max_depth=2, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, x, y):
        """Grow ``n_estimators`` trees, each on a bootstrap sample of ``x`` / ``y``.

        Trees from a previous ``fit`` are discarded first, so refitting never
        yields more than ``n_estimators`` trees. Bootstrap indices come from
        NumPy's global random state. Returns ``self``.
        """
        self.trees = []
        n_samples = x.shape[0]
        for _ in range(self.n_estimators):
            indices = np.random.choice(n_samples, size=n_samples, replace=True)
            tree = DecisionTree(max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split)
            tree.fit(x[indices], y[indices])
            self.trees.append(tree)
        return self

    def predict(self, x):
        """Return a list with every tree's prediction for the single sample ``x``.

        ``x`` is reshaped to one row, so it may be given as a 1-D feature
        vector or as a ``(1, n_features)`` array.
        NOTE(review): an input holding several rows is flattened into that one
        row as well, so it is not predicted row by row.
        """
        sample = np.asarray(x).reshape(1, -1)
        return [tree.predict(sample)[0] for tree in self.trees]


In [None]:
# Importing artificial dataset
# Load the second dataset from the CSV file in the working directory.
dataset_new = pd.read_csv(filepath_or_buffer='./my_dataset.csv')

In [None]:
# Features: every column but the last.
X = dataset_new.iloc[:, :-1].to_numpy()
# Target: the last column.
y = dataset_new.iloc[:, -1].to_numpy()

In [None]:
import numpy as np

# Seed NumPy's global generator so the sampled rows are reproducible.
np.random.seed(42)

# Pick 12 distinct row indices for training.
train_indices = np.random.choice(X.shape[0], size=12, replace=False)
X_train = X[train_indices]
y_train = y[train_indices]

# One hand-written test point, [4, 4], with label 2.
X_test = np.array([[4, 4]])
y_test = np.array([2])

In [None]:
# Train 20 shallow trees and print what each of them predicts for the
# single test sample.
rf = RandomForest_new(n_estimators=20, max_depth=2, min_samples_split=2)
rf.fit(X_train, y_train)

x = X_test[0]
tree_preds = rf.predict(x)

for i, pred in enumerate(tree_preds):
    print(f"Tree {i+1} prediction: {pred}")

In [None]:
from sklearn.metrics import accuracy_score

# Fresh 20-tree ensemble; y_pred receives the value returned by predict()
# for the test array.
rf = RandomForest_new(n_estimators=20, max_depth=2, min_samples_split=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)



In [None]:
# Display the predictions stored in y_pred as the cell output.
y_pred