In [None]:
from collections import Counter
from copy import deepcopy
import csv
from math import inf, log2
from numpy import array_split
from random import choice, shuffle

In [None]:
# Notebook configuration. The trailing `#@param` comments appear to be Colab
# form annotations, so they are kept exactly as written.

# Name of the dataset to work with.
DATASET_NAME = 'Car'  #@param ['Chess', 'Car', 'Tennis']

# Depth of the trees
D = 3 #@param {type: "slider", min: 2, max: 10}

# Percentage of the dataset examples used when building the trees
P = 100 #@param {type: "slider", min: 1, max: 100}
# Module-level list of examples; starts out empty.
examples = []

In [None]:
class Node:
    """One vertex of a decision tree."""

    def __init__(self, label):
        """Create a node without children.

        Args:
            label: the attribute name for an internal node, or the class
                for a leaf.
        """
        self.label = label

        # Maps each attribute value to the child node reached through it.
        self.children = {}

    def display(self, string):
        """Print the subtree rooted at this node.

        Args:
            string (str): prefix printed before this node's label; branch
                values get one extra tab and child nodes two extra tabs.
        """
        print(string + self.label)
        branch_prefix = string + "\t"
        child_prefix = branch_prefix + "\t"
        # Iterating an empty dict is a no-op, so leaves need no special case.
        for branch_value, subtree in self.children.items():
            print(branch_prefix + branch_value)
            subtree.display(child_prefix)


def getArchive(dataSetName):
    """Make sure the file of a known dataset exists locally; download it if not.

    The local path is built from the last two components of the dataset URL
    (for example ``datasets/car``) and is relative to the current working
    directory. The path is printed, followed by a status message.

    Args:
        dataSetName (str): the dataset name; one of "Car", "Chess", "Tennis"

    Raises:
        AssertionError: if `dataSetName` is not a known dataset, or if the
            file is still missing after the download.
    """
    # Imported locally: the original referenced `os` and `urllib.request`
    # without ever binding those names.
    import os
    import urllib.request

    # Datasets hosted in the author's GitHub repository.
    # NOTE(review): these are GitHub "blob" URLs, which normally serve an HTML
    # page rather than the raw file contents -- confirm that what gets
    # downloaded is really the dataset.
    datasets_url = {
        "Car": "https://github.com/anonimous334/IA2/blob/main/datasets/car",
        "Chess": "https://github.com/anonimous334/IA2/blob/main/datasets/chess",
        "Tennis": "https://github.com/anonimous334/IA2/blob/main/datasets/tennis"
    }

    assert dataSetName in datasets_url

    dataset_url = datasets_url[dataSetName]
    # Keep only "<directory>/<file>" from the URL, using the local separator.
    dataset_file = os.path.join(*dataset_url.split("/")[-2:])
    print(dataset_file)

    if os.path.isfile(dataset_file):
        print(f"{dataset_file} already in the local directory")
        return

    print("Downloading...")
    # urlretrieve does not create missing directories.
    os.makedirs(os.path.dirname(dataset_file), exist_ok=True)
    urllib.request.urlretrieve(dataset_url, filename=dataset_file)
    assert os.path.isfile(dataset_file)
    print("Got the archive")


def getDataSet(dataSetName):
    """Read a dataset from ``datasets/<lower-cased name>``.

    The file is parsed as comma-separated values. The first row is the
    header and the last column of every row is the class. Blank lines are
    skipped.

    Args:
        dataSetName (str): Name for the dataset

    Returns:
        A tuple containing (classes, attributes, examples):
        classes (set): the classes that are found in the dataset
        attributes (list of strings): the attributes for the dataset
        examples (list of dictionaries): one example contains an entry as
            (attribute name, attribute value), plus the class under the
            key "CLASS"
    """

    dataset_file = f'datasets/{dataSetName.lower()}'

    examples = []
    classes = set()

    # `with` closes the file even if parsing fails; newline='' lets the csv
    # module do its own newline handling, as its documentation recommends.
    with open(dataset_file, 'r', newline='') as f_in:
        csv_reader = csv.reader(f_in, delimiter=",")

        # Header row: every column but the last one names an attribute.
        attributeNames = next(csv_reader)[:-1]

        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; unpacking it below
                # would raise ValueError.
                continue
            *attributes, label = row
            classes.add(label)
            example = dict(zip(attributeNames, attributes))
            example["CLASS"] = label
            examples.append(example)

    return classes, attributeNames, examples

In [None]:
def randomTree(d, X, A):
    """Build a decision tree whose split attributes are chosen at random.

    At every internal node one attribute is drawn at random from the
    remaining candidates. The examples are partitioned by their value for
    that attribute, and one child is built per value from its partition.
    A leaf is produced when the depth budget reaches 0 or no attribute is
    left; it is labelled with the most frequent class of its examples.

    Args:
        d (int): remaining depth; 0 yields a leaf
        X (list of dictionaries): examples, each holding its class under
            the key "CLASS"
        A (iterable of strings): candidate attributes; not modified

    Returns:
        Node: the root of the generated tree

    Raises:
        ValueError: if `X` is empty (there is no class to predict)
    """
    if len(X) == 0:
        raise ValueError("randomTree needs at least one example")

    # Work on a copy so the caller's attribute collection is never consumed.
    candidates = list(A)

    if d == 0 or not candidates:
        majority = Counter(x["CLASS"] for x in X).most_common(1)[0][0]
        return Node(majority)

    chosen_attribute = choice(candidates)
    remaining = [a for a in candidates if a != chosen_attribute]

    # Group the examples by their value for the chosen attribute.
    partitions = {}
    for x in X:
        partitions.setdefault(x[chosen_attribute], []).append(x)

    n = Node(chosen_attribute)
    for attribute_value, subset in partitions.items():
        n.children[attribute_value] = randomTree(d - 1, subset, remaining)

    return n

In [None]:
def mostFrequentClass(X):
    """Return the class that occurs most often among the examples in `X`.

    On a tie, the class that was encountered first wins.
    """
    tally = Counter(example['CLASS'] for example in X)
    top = tally.most_common(1)
    return top[0][0]


def entropy(X):
    """Return the entropy, in bits, of the "CLASS" labels of the examples in `X`.

    An empty `X` yields 0.
    """
    class_counts = Counter(example['CLASS'] for example in X)

    weighted_logs = 0
    for count in class_counts.values():
        share = count / len(X)
        weighted_logs += share * log2(share)

    return -weighted_logs


def gain(X, a):
    """Return the information gain of splitting the examples `X` on attribute `a`.

    This is the entropy of `X` minus the size-weighted entropies of the
    subsets of `X` that share the same value for `a`.
    """
    distinct_values = {example[a] for example in X}
    total = len(X)
    remaining = entropy(X)

    for value in distinct_values:
        subset = [example for example in X if example[a] == value]
        remaining -= len(subset) * entropy(subset) / total

    return remaining


def getMaxGainAttrib(X, A):
    """Return the attribute from `A` with the highest information gain on `X`.

    On a tie, the attribute that comes first in `A` wins.

    Args:
        X (list of dictionaries): the examples
        A (iterable of strings): candidate attributes

    Returns:
        The attribute with the largest `gain(X, attribute)`.

    Raises:
        ValueError: if `A` is empty. The original loop left its result
            unbound in that case and failed with UnboundLocalError.
    """
    candidates = list(A)
    if not candidates:
        raise ValueError("getMaxGainAttrib needs at least one candidate attribute")

    # max() keeps the first maximal element, like the original strict `>`.
    return max(candidates, key=lambda attr: gain(X, attr))


def id3(X, A):
    """Build a tree from the examples `X` and the attributes `A`.

    Delegates to `decisionTree`, passing -1 as the first argument, the tag
    'id3' and `getMaxGainAttrib` as the last argument.
    """
    # Requirement 2 of the assignment.
    # NOTE(review): -1 presumably means "no depth limit" and the last argument
    # is presumably the attribute selector -- confirm against decisionTree.
    return decisionTree(-1, X, A, 'id3', getMaxGainAttrib)


def evaluate(tree, example):
    """Return the class predicted by the tree `tree` for the example `example`.

    Walks down from the root, following the child that matches the
    example's value for the node's attribute. When that value has no
    matching child, the first child of the node is followed instead.
    """
    node = tree
    while node.children:
        branch = example[node.label]
        if branch in node.children:
            node = node.children[branch]
        else:
            # Unseen attribute value: fall back to the first child.
            node = next(iter(node.children.values()))
    return node.label

In [None]:
def randomForest(X, A, n, d):
    """Build `n` random trees of depth `d`, each from its own slice of `X`.

    A shuffled copy of `X` is split into `n` chunks with `array_split`,
    and one random tree is built per chunk.

    Args:
        X (list of dictionaries): the examples; left in their original order
        A (list of strings): candidate attributes; left unmodified
        n (int): number of trees (and of chunks)
        d (int): depth passed to `randomTree`

    Returns:
        list: one tree per chunk
    """
    # Shuffle a copy so the caller's dataset is not reordered as a side effect.
    shuffled = list(X)
    shuffle(shuffled)

    # Every tree gets its own copy of A, so a builder that consumes its
    # attribute list cannot starve the trees built after it.
    return [
        randomTree(d, list(chunk), deepcopy(A))
        for chunk in array_split(shuffled, n)
    ]


def randomID3Forest(X, A, n):
    """Build `n` trees with `id3`, each from its own slice of `X`.

    A shuffled copy of `X` is split into `n` chunks with `array_split`,
    and `id3` is called once per chunk.

    Args:
        X (list of dictionaries): the examples; left in their original order
        A (list of strings): candidate attributes; left unmodified
        n (int): number of trees (and of chunks)

    Returns:
        list: one tree per chunk
    """
    # Shuffle a copy so the caller's dataset is not reordered as a side effect.
    shuffled = list(X)
    shuffle(shuffled)

    # Every tree gets its own copy of A, so a builder that consumes its
    # attribute list cannot affect the trees built after it.
    return [
        id3(list(chunk), deepcopy(A))
        for chunk in array_split(shuffled, n)
    ]


def evaluateForest(forest, x):
    """Return the class most trees of `forest` predict for the example `x`.

    Each tree votes through `evaluate`. On a tie, the class that received
    its first vote earliest wins.
    """
    votes = Counter(evaluate(tree, x) for tree in forest)
    winner = votes.most_common(1)
    return winner[0][0]


def precision(tree, X, c, type):
    """Return the precision of `tree` for class `c` on the examples `X`.

    This is the fraction of the examples predicted as `c` whose real class
    is `c`. It is 0 when nothing is predicted as `c`.

    Args:
        tree: a single tree when `type` is 'tree', otherwise a forest
        X (list of dictionaries): the examples
        c: the class of interest
        type (str): 'tree' selects `evaluate`; anything else selects
            `evaluateForest`
    """
    predict = evaluate if type == 'tree' else evaluateForest

    # One flag per example predicted as c: True when the prediction is right.
    outcomes = [ex['CLASS'] == c for ex in X if predict(tree, ex) == c]

    if not outcomes:
        return 0
    return sum(outcomes) / len(outcomes)


def recall(tree, X, c, type):
    """Return the recall of `tree` for class `c` on the examples `X`.

    This is the fraction of the examples whose real class is `c` that are
    also predicted as `c`. It is 0 when `X` has no example of class `c`,
    the same convention `precision` uses for an empty denominator. The
    original divided by zero in that case.

    Args:
        tree: a single tree when `type` is 'tree', otherwise a forest
        X (list of dictionaries): the examples
        c: the class of interest
        type (str): 'tree' selects `evaluate`; anything else selects
            `evaluateForest`
    """
    relevant = [ex for ex in X if ex['CLASS'] == c]
    if not relevant:
        return 0

    evaluator = evaluate if type == 'tree' else evaluateForest
    found = sum(1 for ex in relevant if evaluator(tree, ex) == c)
    return found / len(relevant)


def accuracy(tree, X, type):
    """Return the fraction of the examples in `X` that `tree` classifies correctly.

    An empty `X` yields 0.0; the original divided by zero in that case.

    Args:
        tree: a single tree when `type` is 'tree', otherwise a forest
        X (list of dictionaries): the examples
        type (str): 'tree' selects `evaluate`; anything else selects
            `evaluateForest`
    """
    if len(X) == 0:
        return 0.0

    evaluator = evaluate if type == 'tree' else evaluateForest
    correct = sum(1 for x in X if evaluator(tree, x) == x['CLASS'])
    return correct / len(X)


def test_algs(root, X, type, tabs):
    """Print per-class precision and recall for `root` on `X`, then the accuracy.

    Args:
        root: a single tree when `type` is 'tree', otherwise a forest
        X (list of dictionaries): the examples to evaluate on
        type (str): forwarded to `precision`, `recall` and `accuracy`
        tabs (int): number of tab characters printed before each metric line
    """
    labels = {example['CLASS'] for example in X}
    for label in labels:
        indent = tabs * '\t'
        print(indent + f'prec for class {label} = {precision(root, X, label, type)}')
        print(indent + f'recall for class {label} = {recall(root, X, label, type)}')
        print()
    print(tabs * '\t' + f'acc = {accuracy(root, X, type)}')