In [144]:
import numpy as np 
import pandas as pd 
from math import log
import random
from treelib import Tree , Node
from sklearn.model_selection import train_test_split

## loading the data 

In [225]:
path_to_data =  "~/Downloads/prison_dataset.csv"
data = pd.read_csv(path_to_data)



In [226]:
data.head()


Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Race - Ethnicity,Age At Release,Convicting Offense Classification,Convicting Offense Type,Convicting Offense Subtype,Main Supervising District,Release Type,Part of Target Population,Recidivism - Return to Prison numeric
0,2010,2013,White,<45,D Felony,Violent,Other,3JD,Parole,Yes,1
1,2010,2013,White,>45,D Felony,Other,Other,3JD,Parole,Yes,1
2,2010,2013,White,<45,D Felony,Other,Other,5JD,Parole,Yes,1
3,2010,2013,White,>45,Other Felony,Drug,Trafficking,3JD,Parole,Yes,1
4,2010,2013,Black,<45,D Felony,Drug,Trafficking,3JD,Parole,Yes,1


In [227]:
class_name = "Recidivism - Return to Prison numeric" 


## Entropy and information Gain functions 

In [228]:
def Entropy(x):
    values = np.unique(x)
    proportions = []

    # now we capture the amount of times each value is repeating in each feature 
    # basically the proportion of p   in sum of - p * log p  (the entropy formula)

    
    for element in values:
        proportions = np.append(proportions , np.sum(x == element ))
    
    proportions /= sum(proportions)
 
    return sum([-p * log(p,2) for p in proportions])


In [229]:
Entropy(data[class_name])

0.9885815658559858

In [230]:
def Info_gain(var , labels):
    var = np.array(var)
    labels = np.array(labels)
    values = np.unique(var)
    num_rows = np.size(var)
    H_sum=0
    # calculation of conditional entropy of each of the value that the variable can take
    # we rescue the rows where the variable take that valu and that class column
    for v in values:
        L = np.array([labels[k] for k in np.where(var == v)])
        H_sum += (np.size(L)/num_rows)*Entropy(L)


    return Entropy(labels) - H_sum

In [231]:
Info_gain(data["Release Type"] , data[class_name])

0.009453934793683172

## stopping criterion function

In [232]:
def same_data(data , class_name) :
    vars = [item for item in data.columns if item  != class_name]
    if len(vars) == 0:
        return True


    cont = 0
    for var in vars :
        if np.size(np.unique(data[var])) == 1 :
            cont += 1 


    if cont == np.size(data.columns) - 1 :
        return True

    return False 

In [233]:
data_test = data.loc[[1,2] , [c for c in data.columns if c not in ["Age At Release" , "Convicting Offense Classification" , "Main Supervising District"]]]
data_test 

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Race - Ethnicity,Convicting Offense Type,Convicting Offense Subtype,Release Type,Part of Target Population,Recidivism - Return to Prison numeric
1,2010,2013,White,Other,Other,Parole,Yes,1
2,2010,2013,White,Other,Other,Parole,Yes,1


In [234]:
print(same_data(data_test, class_name))

True


In [235]:
class MyNode:
    def __init__(self, value,parent_value):
        self.value = value
        self.parent_value = parent_value

In [236]:
def Learn_DTree(data, class_name, out_str, _root, id):
    out_str += "iteration : {}\n".format(id)
    out_str += "root: {}\n".format(_root)

    # check if the labels are the same
    if np.size(np.unique(data[class_name])) == 1:
        T = Tree()
        T.create_node(str(data[class_name].values[0]) + "{} = {}".format(_root["name"], _root["value"]), str(id),
                     data= MyNode(value= str(data[class_name].values[0]),parent_value=_root["value"]))
        out_str += "all the classes are the same, leaf = {}\n".format(str(data[class_name].values[0]))
        out_str += "-" * 50 + "\n"
        return T, id + 1, out_str
 
    # check if all the rows are the same
    elif same_data(data, class_name):
        values = np.unique(data[class_name])
        _max = 0
        prediction = np.random.choice(values)
        for element in values:
            num_votes = len(np.where(data[class_name] == element))
            if num_votes > _max:
                _max = num_votes
                prediction = element
    
        T = Tree()
        T.create_node(str(prediction) + "| {}= {}".format(_root["name"], _root["value"]), str(id),
                     data= MyNode(value= str(prediction),parent_value=_root["value"]))
        out_str += "all the rows have the same value: leaf = {}\n".format(str(prediction))
        out_str += "-" * 50 + "\n"
        return T, id + 1, out_str

     # we can add aditional stoping conditions here 
     


    # choose variable with the highest info_gain
    else:
        list_of_vars = data.columns[np.where(data.columns != class_name)].to_list()
        max_gain = -np.infty

        # case when we have only one remaining variable
        if len(list_of_vars) == 1:
            winner = list_of_vars[0]

        else:
            out_str += "Entropy class : {}\n".format(Entropy(data[class_name]))
            random.shuffle(list_of_vars)
            for var in list_of_vars:
                _gain = Info_gain(data[var], data[class_name])
                out_str += "Info_gain of {}= {}\n".format(var, _gain)
                if _gain > max_gain:
                    max_gain = _gain
                    winner = var

        values_winner = np.unique(data[winner])
        out_str += "Best variable: {}\n".format(winner)
        out_str += "-" * 50 + "\n"

        # instantiate the data, call recursively
        T = Tree()
        id_winner = id
        
        if _root["value"] != "":
            T.create_node("({} | {} = {})".format(winner,_root["name"],_root["value"],),str(id_winner),
                         data= MyNode(value= winner,parent_value=_root["value"]))
        else:
             T.create_node(str(winner), str(id_winner),
                          data= MyNode(value= winner,parent_value=_root["value"]))




        
        id += 1
        for val in values_winner:
            data_inst = data[data[winner] == val]
            del data_inst[winner]
            Sub_Tree, id, out_str = Learn_DTree(data_inst, class_name, out_str, {"name": winner, "value": val}, id)
            T.paste(str(id_winner), Sub_Tree)
    
    return T, id, out_str
 

In [237]:
# Example: Assuming Learn_DTree is properly implemented
T, id, out_str = Learn_DTree(data, class_name, out_str="", _root={"name": "", "value": ""}, id=0)

In [238]:
tree_output = T.show(stdout=False)  # Get the tree as a string
print(tree_output)  # Directly print the string


Recidivism Reporting Year
├── (Age At Release | Recidivism Reporting Year = 2016)
│   ├── (Convicting Offense Type | Age At Release = >45)
│   │   ├── (Main Supervising District | Convicting Offense Type = Other)
│   │   │   ├── (Convicting Offense Classification | Main Supervising District = 5JD)
│   │   │   │   ├── (Part of Target Population | Convicting Offense Classification = D Felony)
│   │   │   │   │   ├── (Release Type | Part of Target Population = Yes)
│   │   │   │   │   │   ├── (Race - Ethnicity | Release Type = Parole)
│   │   │   │   │   │   │   ├── 0| Race - Ethnicity= Black
│   │   │   │   │   │   │   └── 0| Race - Ethnicity= White
│   │   │   │   │   │   └── 1Release Type = Discharged End of Sentence
│   │   │   │   │   └── 1Part of Target Population = No
│   │   │   │   └── (Race - Ethnicity | Convicting Offense Classification = Other Felony)
│   │   │   │       ├── (Release Type | Race - Ethnicity = White)
│   │   │   │       │   ├── 0| Release Type= Discharged End o

In [239]:
print(out_str)


iteration : 0
root: {'name': '', 'value': ''}
Entropy class : 0.9885815658559858
Info_gain of Race - Ethnicity= 5.732673222835771e-05
Info_gain of Convicting Offense Type= 0.00355540535576937
Info_gain of Age At Release= 0.006670108151095477
Info_gain of Release Type= 0.009453934793683172
Info_gain of Main Supervising District= 0.009184459572582071
Info_gain of Part of Target Population= 0.01672136288497783
Info_gain of Convicting Offense Subtype= 0.0010336861301879496
Info_gain of Recidivism Reporting Year= 0.21700574276244844
Info_gain of Convicting Offense Classification= 0.0008671111607267967
Info_gain of Fiscal Year Released= 0.21700574276244844
Best variable: Recidivism Reporting Year
--------------------------------------------------
iteration : 1
root: {'name': 'Recidivism Reporting Year', 'value': 2013}
all the classes are the same, leaf = 1
--------------------------------------------------
iteration : 2
root: {'name': 'Recidivism Reporting Year', 'value': 2016}
Entropy class

In [240]:
def classify_tree(Tree , x):
    nid = Tree.root
    if Tree[nid].is_leaf():
        return Tree[nid].data.value
    else:
        for h in Tree.is_branch(nid):
            if x[Tree[nid].data.value][0] == Tree[h].data.parent_value:
                return classify_tree(Tree.subtree(h), x.drop(Tree[Tree[nid].data.value, 1]))



In [242]:
x = pd.DataFrame({"Fiscal Year Released":["2010"],"Recidivism Reporting Year":["2013"], 
                  "Age At Release": ["<45"], "Convicting Offense Classification":["D Felony"],"Convicting Offense Type":["Violent"],
                  "Convicting Offense Subtype":["Other"],"Main Supervising District":["3JD"],"Release Type":["Parole"],
                  "Part of Target Population":["Yes"],"Recidivism - Return to Prison numeric":["Yes"]})

label = classify_tree(T, x)                  

In [None]:
print(label)

i surrendered here unfortunatly i didnt have enough time to debug this 

In [246]:

from sklearn.tree import DecisionTreeClassifier





In [247]:
X = data.drop(columns=['Recidivism - Return to Prison numeric'])
y = data['Recidivism - Return to Prison numeric']

In [248]:
X_encoded = pd.get_dummies(X)


X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)



In [249]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [250]:
y_pred = clf.predict(X_test)

In [254]:
correct_predictions = sum(y_pred == y_test)  
total_predictions = len(y_test) 

In [252]:
accuracy = correct_predictions / total_predictions  

print(f"Accuracy of the Decision Tree model: {accuracy * 100:.2f}%")


Accuracy of the Decision Tree model: 71.77%


## 2)

In [None]:
TP = sum((y_pred == 1) & (y_test == 1))  # True Positives
TN = sum((y_pred == 0) & (y_test == 0))  # True Negatives
FP = sum((y_pred == 1) & (y_test == 0))  # False Positives
FN = sum((y_pred == 0) & (y_test == 1))  # False Negatives


confusion_matrix = pd.DataFrame({
    "Predicted 0": [TN, FP],
    "Predicted 1": [FN, TP]
}, index=["Actual 0", "Actual 1"])


print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         1005          519
Actual 1          352         1209


## 3)

we can add even more stopping conditions in the tree construction which will lower the iteration number but not significatly in my case .

we can add another base condition : for example :

In [258]:
    elif len(data) < min_samples_split:
        T = Tree()
        prediction = data[class_name].mode()[0]
        T.create_node(str(prediction) + "| {}= {}".format(_root["name"], _root["value"]), str(id))
        out_str += "Too few samples, stopping. Leaf = {}\n".format(prediction)
        out_str += "-" * 50 + "\n"
        return T, id + 1, out_str


SyntaxError: invalid syntax (106535599.py, line 1)

we pass the min_samples to the main function for example 5 . and add code snipit above below the last condition . the iterations will go from 691 to 614 in that case . 

the condition is stoping the tree construnction when we have less than min_samples_split = 5 (for example)