In [1]:
import pandas as pd
import numpy as np
from math import log
import operator

In [2]:
class Tree():
    def __init__(self, X, predict_feature):
        self.X = X
        self.predict_feature = predict_feature
        self.output_classes, self.class_count = np.unique(X[predict_feature], return_counts=True)
        self.features = list(self.X.columns)
        self.features.remove(predict_feature)
        
class Node(Tree):
    def __init__(self, X, predict_feature):
        super().__init__(X, predict_feature)
        self.node_entropy = self.entropy(X)

    def isLeaf(self):
        return len(np.unique(self.X[self.predict_feature])) == 1
        
    def entropy(self, input_data):
        _, value_count = np.unique(input_data[self.predict_feature], return_counts=True)
        prob = value_count/len(input_data)
        entropy = 0
        for i in range(0, len(value_count)):
            entropy += prob[i]*np.log2(prob[i])
        return -entropy
    
    def avgChildEntropy(self, feature):
        split_classes = np.unique(self.X[feature])
        nodes = []
        for i in range(0, len(split_classes)):
            nodes.append(self.X[self.X[feature]==split_classes[i]])
        
        n_parent = len(self.X)
        avg_entropy = 0
        for i in range(0, len(nodes)):
            avg_entropy+= (len(nodes[i])/n_parent)*self.entropy(nodes[i])
        
        return avg_entropy
    
    def infoGain(self, feature):
        return self.node_entropy - self.avgChildEntropy(feature)
    
    def split_info(self, feature):
        split
        split_info = 0
        for i in range(0, len(self))
    
    def indi_gini(self, node): #pass in child data
        node_gini = 1
        for i in range(0, len(self.output_classes)):
            node_gini-=(len(node[node[self.predict_feature]==self.output_classes[i]])/len(node))**2
        return node_gini
    
    def overall_gini(self, feature): #parent
        split_classes = np.unique(self.X[feature])
        overall_gini = 0
        for i in range(0, len(split_classes)):
            overall_gini += (len(self.X[self.X[feature]==split_classes[i]])/len(self.X))*self.indi_gini(self.X[self.X[feature]==split_classes[i]])
        return overall_gini    
  
    def split_select_gini(self): #returns feature with lowest gini index
        lowest_gini = 0
        self.selected_feature = ""
        for feature in self.features:
            if self.overall_gini(feature) < lowest_gini:
                lowest_gini = self.overall_gini(feature)
                self.selected_feature = feature
        return self.selected_feature, lowest_gini
    
    def split_select_infoGain(self):
        highest_gain = 0
        self.selected_feature = ""
        for feature in self.features:
            if self.infoGain(feature) > highest_gain:
                highest_gain = self.infoGain(feature)
                self.selected_feature = feature
        return self.selected_feature, highest_gain  

    def split(self, split_by): 
        split_list= {} #each element is a node after splitting
        split_values = np.unique(self.X[split_by])
        for i in range(0, len(split_values)):
            node = pd.DataFrame(self.X[self.X[split_by]==split_values[i]])
            node = node.drop(columns=[split_by])
            split_list[split_values[i]] = node
            #split_list.append(node)
        return split_list




In [3]:
def leafClassifer(data, labelCol): 
    classes, n_classes = np.unique(data[labelCol], return_counts=True)

    return classes[n_classes.argmax()] #returns most frequent output class


def decision_tree(data, labelCol, min_split):
    node = Node(data,labelCol)
    if node.isLeaf() or len(node.X) < min_split or len(node.X.columns)==1: #default value for recursion
        return leafClassifer(data, labelCol)
    else:
        node.split_select_infoGain()
        split_nodes = node.split(node.selected_feature)
        myTree = {node.selected_feature:{}}
        
        for subnode_name, subnode_data in split_nodes.items():
            myTree[node.selected_feature][subnode_name] = decision_tree(subnode_data,labelCol, min_split)
        return myTree


def predict(input_row, input_tree):
    split_column = list(input_tree.keys())[0]
    for key,value in input_tree[split_column].items():
        if input_row[split_column] == key:
            if type(value).__name__ == 'str':
                return value
            else:
                return predict(input_row, input_tree[split_column][key])


In [4]:
df= pd.read_csv("data.csv", sep=";")
labels = df.iloc[:,-1:]
df

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


In [5]:
df["Grade_bin"] = pd.qcut(df["Previous qualification (grade)"], 4)

In [6]:
df["Grade_bin"].value_counts()

Grade_bin
(125.0, 133.1]     1456
(94.999, 125.0]    1223
(140.0, 190.0]      930
(133.1, 140.0]      815
Name: count, dtype: int64

In [12]:
df_train = df.iloc[0:3200]
df_test = df.iloc[3200:]
df_train

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Grade_bin
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout,"(94.999, 125.0]"
1,1,15,1,9254,1,1,160.0,1,1,3,...,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate,"(140.0, 190.0]"
2,1,1,5,9070,1,1,122.0,1,37,37,...,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout,"(94.999, 125.0]"
3,1,17,2,9773,1,1,122.0,1,38,37,...,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate,"(94.999, 125.0]"
4,2,39,1,8014,0,1,100.0,1,37,38,...,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate,"(94.999, 125.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,1,17,2,9773,1,1,120.0,1,1,1,...,6,0,0,0.000000,0,8.9,1.4,3.51,Dropout,"(94.999, 125.0]"
3196,1,42,1,9003,1,1,140.0,1,1,3,...,6,12,5,12.142857,0,15.5,2.8,-4.06,Enrolled,"(133.1, 140.0]"
3197,1,1,6,9147,1,1,127.0,1,1,19,...,5,8,3,12.666667,0,8.9,1.4,3.51,Graduate,"(125.0, 133.1]"
3198,1,7,1,9254,1,3,110.0,1,19,19,...,6,6,0,0.000000,0,16.2,0.3,-0.92,Dropout,"(94.999, 125.0]"


In [17]:
myTree = decision_tree(df_train, "Target", 10)

In [20]:
df_test

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Grade_bin,predicted
3200,2,39,1,9238,1,42,170.0,1,19,38,...,5,0,0.000000,0,8.9,1.4,3.51,Dropout,"(140.0, 190.0]",
3201,1,1,1,9773,1,1,146.0,1,1,3,...,6,6,12.833333,0,9.4,-0.8,-3.12,Graduate,"(140.0, 190.0]",
3202,1,1,1,9773,1,1,127.0,1,5,3,...,9,3,12.000000,0,12.4,0.5,1.79,Enrolled,"(125.0, 133.1]",
3203,1,17,1,9773,1,1,125.0,1,3,3,...,14,3,11.000000,0,13.9,-0.3,0.79,Dropout,"(94.999, 125.0]",
3204,1,17,5,9773,1,1,139.0,1,38,1,...,12,0,0.000000,0,7.6,2.6,0.32,Dropout,"(133.1, 140.0]",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,8,5,12.666667,0,15.5,2.8,-4.06,Graduate,"(94.999, 125.0]",
4420,1,1,2,9773,1,1,120.0,105,1,1,...,6,2,11.000000,0,11.1,0.6,2.02,Dropout,"(94.999, 125.0]",
4421,1,1,1,9500,1,1,154.0,1,37,37,...,9,1,13.500000,0,13.9,-0.3,0.79,Dropout,"(140.0, 190.0]",
4422,1,1,1,9147,1,1,180.0,1,37,37,...,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate,"(140.0, 190.0]",


In [19]:
df_test["predicted"] = df_test.apply(predict, args=(myTree,), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["predicted"] = df_test.apply(predict, args=(myTree,), axis=1)


In [35]:
#accuracy = count(correct predictions) / count(all classes)
accuracy=len(df_test[df_test["Target"] == df_test["predicted"]])/len(df_test)



0.2540849673202614