# In [127]:
import pandas as pd
import graphviz
import uuid

class Gini():
    """Binary decision tree grown with the Gini index and drawn with graphviz.

    The data comes from a CSV file that must contain a ``Y`` column whose
    class labels are ``'A'`` and ``'B'``. The first column is never used as a
    split variable (the expected layout is ``Y, X1, X2, ...``); every other
    column is a candidate.

    Instance attributes:
        df: DataFrame loaded from the CSV file.
        list_max_ginis: one ``[level, variable, seuil, gini, noeud_id]`` list
            per split node, in creation order, so ``noeud_id`` is also the
            position of the entry in the list.
        tree: maps ``noeud_id`` to a dict with the keys ``'parent'`` (id of
            the parent node or ``None``), ``'enfant'`` (ids of the child
            split nodes), ``'mg'`` and ``'md'`` (labels falling on the left
            and right side of the split).
    """

    def __init__(self, file) -> None:
        """Load ``file`` with ``pandas.read_csv``, then build and render the tree.

        Note that construction runs ``main()``, which writes the rendered
        image and asks graphviz to open it.
        """
        self.df = pd.read_csv(file)
        self.list_max_ginis = []  # best split of every node, see class docstring
        self.tree = {}
        self.main()

    @staticmethod
    def _impurity(labels):
        """Return ``(gini_impurity, size)`` for a Series of class labels.

        Only ``'A'`` and ``'B'`` rows are counted. An empty or pure set has
        an impurity of ``0.0``.
        """
        count_a = int((labels == 'A').sum())
        count_b = int((labels == 'B').sum())
        total = count_a + count_b
        if total == 0:
            return 0.0, 0
        return 1 - ((count_a / total) ** 2 + (count_b / total) ** 2), total

    @staticmethod
    def _should_split(modalites, autres):
        """Tell whether the side holding ``modalites`` must be split again.

        A side is split when it still mixes ``'A'`` and ``'B'`` and its
        sibling ``autres`` is not empty. An empty sibling means the best
        split separated nothing, so splitting again would recurse forever on
        the very same rows.
        """
        return 'A' in modalites and 'B' in modalites and len(autres) > 0

    def division(self, df):
        """Find the split of ``df`` that yields the largest Gini gain.

        Every distinct non-missing value of every candidate column is tried
        as a threshold; rows with ``value <= seuil`` go left, rows with
        ``value > seuil`` go right. When several candidates reach the same
        gain, the first one met (column order, then row order) is kept.

        Returns:
            A pandas Series with the entries ``'variable'`` (column name),
            ``'seuil'`` (threshold) and ``'gini'`` (impurity of ``df`` minus
            the size-weighted impurities of both sides).

        Raises:
            ValueError: if ``df`` has no ``'A'``/``'B'`` row or no usable
                threshold.
        """
        labels = df['Y']
        parent_impurity, n = self._impurity(labels)
        if n == 0:
            raise ValueError("cannot split a node without any 'A' or 'B' row")

        best = None  # (variable, seuil, gain) of the best candidate so far
        for variable in df.columns[1:]:
            if variable == 'Y':
                continue  # the target itself is never a split variable
            column = df[variable]
            # A NaN threshold would send every row to neither side and look
            # like a perfect split, so missing values are not candidates.
            for seuil in column.dropna().drop_duplicates():
                left_impurity, left_n = self._impurity(labels[column <= seuil])
                right_impurity, right_n = self._impurity(labels[column > seuil])
                weighted = (left_impurity * (left_n / n)
                            + right_impurity * (right_n / n))
                gain = parent_impurity - weighted
                if best is None or gain > best[2]:
                    best = (variable, seuil, gain)

        if best is None:
            raise ValueError("no candidate threshold found in the feature columns")
        return pd.Series(
            {'variable': best[0], 'seuil': best[1], 'gini': best[2]}, name=0
        )

    def recursion(self, df, level=0, parent_noeud_id=None):
        """Grow the tree below ``df`` and record it in the instance state.

        Appends the best split of ``df`` to ``list_max_ginis``, stores the
        node in ``tree``, then recurses into each side that still mixes both
        classes, provided the split actually separated some rows.

        Args:
            df: rows reaching this node.
            level: depth of this node, 0 for the root.
            parent_noeud_id: id of the parent node, ``None`` for the root.
        """
        result = self.division(df)
        variable, seuil = result['variable'], result['seuil']

        # The id is the position of the node in list_max_ginis.
        noeud_id = len(self.list_max_ginis)
        self.list_max_ginis.append(
            [level, variable, seuil, result['gini'], noeud_id]
        )

        gauche = df[df[variable] <= seuil]
        droite = df[df[variable] > seuil]
        mg = list(gauche['Y'])  # labels on the left side
        md = list(droite['Y'])  # labels on the right side
        self.tree[noeud_id] = {
            'parent': parent_noeud_id, 'enfant': [], 'mg': mg, 'md': md,
        }

        if parent_noeud_id is not None:
            self.tree[parent_noeud_id]['enfant'].append(noeud_id)

        for sous_df, modalites, autres in ((gauche, mg, md), (droite, md, mg)):
            if self._should_split(modalites, autres):
                self.recursion(sous_df, level + 1, noeud_id)

    def display(self):
        """Render the tree to ``arbre_decision.png`` and open it.

        Every split node is drawn with its gain (a box for the root, an oval
        otherwise) and linked to its parent. Every non-empty side that was
        not split further is drawn as a leaf showing its labels.
        """
        dot = graphviz.Digraph(comment='Arbre de décision', graph_attr={'size': '15,15!'})
        for level, variable, seuil, gini, noeud_id in self.list_max_ginis:
            noeud = self.tree[noeud_id]
            shape = "box" if level == 0 else "oval"
            dot.node(f"{noeud_id}", label=f"G = {gini:.4f}", shape=shape)

            parent_id = noeud['parent']
            if parent_id is not None:
                # Node ids are list positions, so the parent is a direct lookup.
                _, parent_variable, parent_seuil, _, _ = self.list_max_ginis[parent_id]
                dot.edge(f"{parent_id}", f"{noeud_id}", label=f"{parent_variable} , {parent_seuil}")

            for cote, autre in (('mg', 'md'), ('md', 'mg')):
                modalites = noeud[cote]
                # Empty sides have nothing to show; sides that were split
                # again are drawn when their own node is visited.
                if not modalites or self._should_split(modalites, noeud[autre]):
                    continue
                feuille_id = f"leaf_{noeud_id}_{cote}"
                dot.node(feuille_id, label=f"{modalites}", shape="oval")
                dot.edge(f"{noeud_id}", feuille_id, label=f"{variable} , {seuil}")
        dot.render("arbre_decision", view=True, format="png")

    def main(self):
        """Build the tree from ``self.df`` and display it with graphviz."""
        self.recursion(self.df)
        self.display()

if __name__ == "__main__":
    # Script entry point: build the tree from the CSV file and render it.
    Gini(file='data.csv')
