In [None]:
# Mustafa Can Caliskan, 150200097

In [8]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Fetch the Iris dataset bundled with scikit-learn
iris = load_iris()

# Assemble one table: the feature columns followed by a trailing 'target' column
iris_table = np.column_stack((iris['data'], iris['target']))
iris_columns = [*iris['feature_names'], 'target']
iris_df = pd.DataFrame(data=iris_table, columns=iris_columns)

# Persist the table (without the row index) so later cells can read it back
iris_df.to_csv('iris_dataset.csv', index=False)


In [9]:
class Node:
    """One vertex of a decision tree.

    An internal node carries a split rule (``feature``/``threshold``) and two
    children; a leaf carries only ``value``. ``value`` is keyword-only so the
    positional arguments always describe a split.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Payload of a leaf; stays None on internal nodes
        self.value = value
        # Split rule of an internal node
        self.feature = feature      # index of the feature compared at this node
        self.threshold = threshold  # cut point the feature value is compared against
        # Sub-trees of an internal node
        self.left = left
        self.right = right

    def isLeafNode(self):
        """Return True when this node stores a value, i.e. it is a leaf."""
        return not (self.value is None)

class DecisionTree:
    """Classification tree with binary splits chosen by information gain.

    Each internal node tests ``x[feature] <= threshold``; samples satisfying
    the test go to the left child, the rest to the right child. Tree vertices
    are instances of the ``Node`` class defined alongside this class.
    """

    def __init__(self, minSamplesSplit=2, maxDepth=100, nFeatures=None):
        """Store the hyper-parameters; the tree itself is built by ``train``.

        Args:
            minSamplesSplit: a node with fewer samples than this becomes a leaf.
            maxDepth: nodes at this depth (root is depth 0) become leaves.
            nFeatures: number of randomly drawn features examined per split;
                a falsy value means "use every feature".
        """
        self.minSamplesSplit = minSamplesSplit
        self.maxDepth = maxDepth
        self.nFeatures = nFeatures
        self.root = None  # set by train()

    def train(self, X, y):
        """Fit the tree to samples ``X`` (2-D, one row per sample) and labels ``y``.

        Raises:
            ValueError: if there are no samples or ``X`` and ``y`` disagree in length.
        """
        # Accept any array-like (lists, DataFrames, ...) - the growing code
        # relies on NumPy-style ``X[:, j]`` and ``y[idxs]`` indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot train a decision tree on an empty dataset")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.nFeatures = X.shape[1] if not self.nFeatures else min(X.shape[1], self.nFeatures)
        self.root = self.growTree(X, y)

    def growTree(self, X, y, depth=0):
        """Recursively build and return the sub-tree for the samples ``X``/``y``."""
        nSamples, nFeats = X.shape
        nLabels = len(np.unique(y))
        # Stop when the depth budget is spent, the node is pure, or it is too small
        if depth >= self.maxDepth or nLabels == 1 or nSamples < self.minSamplesSplit:
            return Node(value=self.mostCommon(y))
        # Random feature subset (a permutation of all features when nFeatures == nFeats)
        featIdxs = np.random.choice(nFeats, self.nFeatures, replace=False)
        bestFeat, bestThreshold = self.bestSplit(X, y, featIdxs)
        if bestFeat is None:
            # No threshold separates the samples (the drawn features are constant
            # here), so splitting would leave one child empty: emit a leaf instead.
            return Node(value=self.mostCommon(y))
        leftIdxs, rightIdxs = self.split(X[:, bestFeat], bestThreshold)
        left = self.growTree(X[leftIdxs, :], y[leftIdxs], depth + 1)
        right = self.growTree(X[rightIdxs, :], y[rightIdxs], depth + 1)
        return Node(bestFeat, bestThreshold, left, right)

    def mostCommon(self, y):
        """Return the most frequent label in ``y`` (the smallest one on ties).

        Raises:
            ValueError: if ``y`` is empty.
        """
        labels, counts = np.unique(y, return_counts=True)
        if labels.size == 0:
            raise ValueError("mostCommon() requires at least one label")
        # np.unique sorts the labels and argmax returns the first maximum,
        # so ties resolve to the smallest label.
        return labels[np.argmax(counts)]

    def bestSplit(self, X, y, featIdxs):
        """Return ``(feature index, threshold)`` of the highest-gain split.

        Only thresholds that leave samples on both sides are considered.
        Returns ``(None, None)`` when no such threshold exists among ``featIdxs``.
        """
        bestGain = -np.inf
        splitIdx, splitThreshold = None, None
        for featIdx in featIdxs:
            XColumn = X[:, featIdx]
            # The largest value would send every sample left (``<=``), leaving
            # the right child empty, so it is never a usable threshold.
            thresholds = np.unique(XColumn)[:-1]
            for thr in thresholds:
                gain = self.informationGain(y, XColumn, thr)
                if gain > bestGain:
                    bestGain = gain
                    splitIdx = featIdx
                    splitThreshold = thr
        return splitIdx, splitThreshold

    def informationGain(self, y, XColumn, threshold):
        """Return parent entropy minus the size-weighted entropy of the two children.

        A split that leaves either side empty has a gain of 0.
        """
        parentEntropy = self.entropy(y)
        leftIdxs, rightIdxs = self.split(XColumn, threshold)
        if len(leftIdxs) == 0 or len(rightIdxs) == 0:
            return 0
        n = len(y)
        nL, nR = len(leftIdxs), len(rightIdxs)
        eL, eR = self.entropy(y[leftIdxs]), self.entropy(y[rightIdxs])
        childEntropy = (nL / n) * eL + (nR / n) * eR
        return parentEntropy - childEntropy

    def split(self, XColumn, splitThresh):
        """Return the index arrays of samples with value ``<=`` / ``>`` the threshold."""
        leftIdxs = np.flatnonzero(XColumn <= splitThresh)
        rightIdxs = np.flatnonzero(XColumn > splitThresh)
        return leftIdxs, rightIdxs

    def entropy(self, y):
        """Return the Shannon entropy (natural log) of the labels in ``y``.

        An empty or single-class label set has entropy 0.
        """
        _, counts = np.unique(y, return_counts=True)
        if counts.size == 0:
            return 0.0
        # Every count is >= 1, so every proportion is > 0 and log() is finite
        ps = counts / counts.sum()
        return -np.sum(ps * np.log(ps))

    def predict(self, X):
        """Return an array with the predicted label of every row of ``X``."""
        # Iterate rows of an array, not e.g. the column labels of a DataFrame
        X = np.asarray(X)
        return np.array([self.traverseTree(x, self.root) for x in X])

    def traverseTree(self, x, node):
        """Walk from ``node`` down to a leaf following sample ``x``; return its value."""
        while not node.isLeafNode():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value


# Fit the tree on every row of the CSV and predict those same rows
data = pd.read_csv('iris_dataset.csv')
y = data['target'].to_numpy()
X = data.drop(columns='target').to_numpy()
dt = DecisionTree()
dt.train(X, y)
predictions = dt.predict(X)

# Store the actual/predicted pairs, then reload them to compute the match rate
output_df = pd.DataFrame({'Actual': y, 'Predicted': predictions})
output_df.to_csv('decision_tree_predictions.csv', index=False)
df = pd.read_csv('decision_tree_predictions.csv')
trueProportion = df['Actual'].eq(df['Predicted']).mean()
print(f"The proportion of true cases is: {trueProportion}")


The proportion of true cases is: 1.0


In [6]:
class Autoencoder:
    """Bias-free autoencoder with one tanh hidden layer and a linear decoder.

    ``encode`` computes ``tanh(X @ W1)`` and ``decode`` computes ``H @ W2``.
    ``train`` runs full-batch gradient descent on the reconstruction error.
    """

    def __init__(self, input_dim, hidden_dim, learning_rate=0.01, epochs=10000):
        """Store the hyper-parameters and draw the initial weights.

        Args:
            input_dim: number of input (and therefore output) features.
            hidden_dim: width of the hidden (encoded) layer.
            learning_rate: step size applied to the gradients in ``train``.
            epochs: number of full-batch update steps performed by ``train``.
        """
        self.inputDim = input_dim
        self.hiddenDim = hidden_dim
        self.outputDim = input_dim  # the decoder reconstructs the input
        self.learningRate = learning_rate
        self.epochs = epochs

        # Weights are drawn from a standard normal using NumPy's global RNG
        self.W1 = np.random.normal(size=(input_dim, hidden_dim))        # encoder
        self.W2 = np.random.normal(size=(hidden_dim, self.outputDim))   # decoder

    def tanh(self, x):
        """Hyperbolic tangent activation."""
        return np.tanh(x)

    def forward(self, X):
        """Return ``(hidden, output)``: the encoding of ``X`` and its reconstruction."""
        # Reuse encode/decode so the forward pass cannot drift from them
        hidden = self.encode(X)
        output = self.decode(hidden)
        return hidden, output

    def backward(self, X, hidden, output):
        """Return ``(dW1, dW2)``, the weight gradients for one batch.

        These are the gradients of ``0.5 * sum((output - X) ** 2)``; they are
        summed over the samples, not averaged.
        """
        error = output - X
        dW2 = np.dot(hidden.T, error)
        # Back-propagate through the decoder, then through tanh (derivative 1 - h^2)
        dW1 = np.dot(X.T, np.dot(error, self.W2.T) * (1 - hidden**2))
        return dW1, dW2

    def normalize(self, X):
        """Min-max scale each column of ``X`` to [0, 1] and remember the ranges.

        The per-column minimum and maximum are stored for ``denormalize``.
        A constant column maps to 0 instead of dividing by zero.
        """
        # Keep the statistics as plain float arrays regardless of the input type
        self.originalMin = np.asarray(np.min(X, axis=0), dtype=float)
        self.originalMax = np.asarray(np.max(X, axis=0), dtype=float)
        span = self.originalMax - self.originalMin
        # Constant column: (x - min) is already 0, so any non-zero divisor works
        span = np.where(span == 0, 1.0, span)
        return (X - self.originalMin) / span

    def denormalize(self, XNormalized):
        """Map min-max scaled data back to the scale seen by ``normalize``."""
        originalMin = np.reshape(self.originalMin, (1, -1))
        originalMax = np.reshape(self.originalMax, (1, -1))
        return XNormalized * (originalMax - originalMin) + originalMin

    def encode(self, X):
        """Return the hidden-layer representation ``tanh(X @ W1)``."""
        return self.tanh(np.dot(np.asarray(X), self.W1))

    def decode(self, encodedData):
        """Return the reconstruction ``encodedData @ W2``."""
        return np.dot(encodedData, self.W2)

    def train(self, X):
        """Run ``epochs`` full-batch gradient-descent steps on ``X``, updating W1 and W2."""
        # Convert once so the loop does pure NumPy maths even for pandas input
        X = np.asarray(X, dtype=float)
        for _ in range(self.epochs):
            hidden, output = self.forward(X)
            dW1, dW2 = self.backward(X, hidden, output)
            self.W1 -= self.learningRate * dW1
            self.W2 -= self.learningRate * dW2


# Reconstruct the Iris features through a 2-unit bottleneck and save the result
data = pd.read_csv('iris_dataset.csv')
X = data.drop('target', axis=1)
input_size = X.shape[1]
hidden_size = 2

# NOTE(review): the weights are drawn from NumPy's unseeded global RNG, so the
# saved reconstruction differs between runs - consider seeding before this line.
ae = Autoencoder(input_size, hidden_size)

# Hand the model a plain float array; the DataFrame is kept only for its column names
normalized_X = ae.normalize(X.to_numpy(dtype=float))
ae.train(normalized_X)
encoded_data = ae.encode(normalized_X)
decoded_data = ae.decode(encoded_data)
decoded_data = ae.denormalize(decoded_data)  # back to the original feature scale

output_df = pd.DataFrame(decoded_data, columns=X.columns)
output_df.to_csv('autoencoder_output.csv', index=False)
