In [None]:
!pip install ucimlrepo 

In [None]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import multiprocessing as mp

In [None]:
# Fetch UCI ML Repository dataset id=2 and keep its feature and target tables.
adult = fetch_ucirepo(id=2)
X = adult.data.features  # feature table
y = adult.data.targets  # target table

In [None]:
# Inspect the dtype of every feature column.
X.dtypes


In [None]:
# Preview the first five feature rows.
X.head(5)

In [None]:
# le=LabelEncoder()
# for col,dtype in X.dtypes.items():
#     # print(col)
#     if isinstance(dtype,np.dtypes.ObjectDType):
#         X.loc[:,col]=le.fit_transform(X[col])

In [None]:
# Number of rows in the feature table.
len(X)

In [None]:
# Wider preview: the first 50 feature rows.
X.head(50)

In [None]:
# Preview the first five target rows.
y.head(5)

In [None]:
# y.loc[:,'income']=le.fit_transform(y['income'])
# Drop the "education-num" column (presumably redundant with another education
# column -- TODO confirm).  errors="ignore" keeps this cell re-runnable: because
# `X` is rebound here, a second run would otherwise raise KeyError for the
# already-removed column.
X = X.drop(columns="education-num", errors="ignore")
X.nunique()


In [None]:
# Summary statistics of the feature table (pandas' default `describe` output).
X.describe()

In [None]:
# Hold out 15% of the rows for testing; the fixed random_state makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.15,random_state=42)

In [None]:
# Sanity check: container type and row count of the training features,
# printed one per line.
print(type(X_train), len(X_train), sep="\n")

In [None]:
class Node:
    """Internal node of the tree: one split feature with parallel branch lists.

    ``children[i]`` is the subtree reached through branch value ``values[i]``,
    so both lists must have the same length.
    """

    def __init__(self,splittingFeature:str,children:list,values:list):
        childCount = len(children)
        valueCount = len(values)
        # Every child needs exactly one branch value describing how to reach it.
        assert childCount == valueCount, (
            f'the length of children and values is not same , children: {childCount} values: {valueCount}'
        )

        self.splittingFeature = splittingFeature
        self.children = children
        self.values = values
        
def convertibleInt(val):
    """Return True when ``int(val)`` succeeds, False when it raises ValueError or TypeError.

    Any other exception raised by ``int`` propagates to the caller.
    """
    try:
        int(val)
    except (ValueError, TypeError):
        return False
    else:
        return True
            

In [None]:
class DecisionTree:
    """Decision-tree builder for tables mixing categorical and numeric features.

    A categorical feature produces one branch per distinct value; a numeric
    feature produces a binary split on a threshold.  A feature is removed from
    the table once it has been used, so it appears at most once on any path.

    Internal nodes are ``Node`` objects (the notebook-level class); a feature
    is treated as numeric when the notebook-level ``convertibleInt`` helper
    accepts its first value.  Terminal positions are stored directly as the
    majority class label of the rows that reached them (``None`` when no rows
    reached them).
    """

    def __init__(self, split_measure: str, verbose: bool = False):
        """Select the impurity measure used to score candidate splits.

        Args:
            split_measure: ``'entropy'`` or ``'giniImpurity'``.
            verbose: when True, print progress while the tree is built.

        Raises:
            ValueError: if ``split_measure`` is not a supported measure.
        """
        splitMeasuresDict = {
            'entropy': DecisionTree.getEntropy,
            'giniImpurity': DecisionTree.getGiniImpurity,
        }

        if split_measure not in splitMeasuresDict:
            raise ValueError(f"invalid split measure, the supported measures are :{list(splitMeasuresDict)}")

        self.splitCriteria = splitMeasuresDict[split_measure]
        self.verbose = verbose

    def _log(self, *args):
        """Print ``args`` only when the tree was created with ``verbose=True``."""
        if getattr(self, 'verbose', False):
            print(*args)

    @staticmethod
    def getEntropy(y: pd.Series):
        """Shannon entropy of the labels in ``y``, using the natural logarithm."""
        counts = np.array(list(Counter(y).values()), dtype=float)
        prob = counts / y.shape[0]
        return np.sum(-prob * np.log(prob))

    @staticmethod
    def getGiniImpurity(y: pd.Series):
        """Gini impurity of the labels in ``y``: one minus the sum of squared class frequencies."""
        counts = np.array(list(Counter(y).values()), dtype=float)
        prob = counts / y.shape[0]
        return 1 - np.sum(prob ** 2)

    def getOptimalThreshold(self, x: pd.Series, y: pd.Series):
        """Find the binary cut on a numeric feature with the lowest weighted impurity.

        Args:
            x: feature values.
            y: labels, positionally matched with ``x``.

        Returns:
            ``(minImpurity, bestThre)``.  ``bestThre`` is the midpoint between
            the two neighbouring distinct values that give the best cut.
            ``(inf, None)`` is returned when no cut between two distinct values
            exists (fewer than two rows, or a constant feature).
        """
        order = np.argsort(x.to_numpy())
        xs = x.to_numpy()[order]
        ys = y.to_numpy()[order]
        total = ys.shape[0]

        minImpurity = float('inf')
        bestThre = None
        # i is the size of the left part, so it runs up to and including total-1
        # (the cut that isolates the single largest row).
        for i in range(1, total):
            # A cut between equal values cannot be expressed as "x > threshold".
            if xs[i - 1] == xs[i]:
                continue
            left, right = ys[:i], ys[i:]
            # Each candidate is scored from scratch with the configured measure.
            impurity = (left.shape[0] * self.splitCriteria(left) / total
                        + right.shape[0] * self.splitCriteria(right) / total)
            if impurity < minImpurity:
                bestThre = (xs[i - 1] + xs[i]) / 2
                minImpurity = impurity
        return minImpurity, bestThre

    def getnewImpurity(self, Data):
        """Score a split on one feature.

        Args:
            Data: two-column table/array; column 0 is the feature, column 1 the label.

        Returns:
            ``(impurity, thres)``.  ``thres`` is ``None`` for a categorical
            feature (one branch per value) and the chosen threshold for a
            numeric one.  Empty input yields ``(inf, None)``.
        """
        Data = pd.DataFrame(Data, columns=['feature', 'label'])
        rowCount = Data.shape[0]
        if rowCount == 0:
            return float('inf'), None

        # The first value decides whether the column is handled as numeric.
        if not convertibleInt(Data['feature'].iloc[0]):
            impurity = 0
            thres = None
            # dropna=False keeps rows with a missing category as their own
            # group so the branch weights add up to one.
            for _, group in Data.groupby('feature', dropna=False):
                labels = group['label']
                impurity += labels.shape[0] * self.splitCriteria(labels) / rowCount
        else:
            impurity, thres = self.getOptimalThreshold(Data['feature'], Data['label'])

        self._log("min impurity is :", impurity)
        return impurity, thres

    def getOptimalFeature(self, x: pd.DataFrame, y: pd.DataFrame):
        """Pick the column of ``x`` whose split has the lowest impurity.

        Args:
            x: feature table with at least one column.
            y: single-column label table (or a Series), positionally matched with ``x``.

        Returns:
            ``(bestIndex, thres)`` where ``bestIndex`` is the positional index
            of the winning column and ``thres`` is an object array holding, per
            column, its threshold (numeric) or ``None`` (categorical).

        Raises:
            ValueError: if ``x`` has no columns.
        """
        if x.shape[1] == 0:
            raise ValueError("cannot choose a splitting feature: x has no columns")

        x_np = x.to_numpy()
        y_np = np.repeat(y.to_numpy().reshape(-1, 1), x.shape[1], axis=1)
        # Rearranged to (n_features, n_rows, 2): one (feature, label) table per column.
        data = np.transpose(np.stack([x_np, y_np], axis=0), axes=(2, 1, 0))

        results = [self.getnewImpurity(column) for column in data]
        impurities, thres = zip(*results)

        # dtype=object keeps None entries and thresholds side by side.
        return np.argmin(np.array(impurities)), np.array(thres, dtype=object)

    def constructNode(self, x: pd.DataFrame, y: pd.DataFrame):
        """Recursively build the subtree for the rows in ``x`` / ``y``.

        ``x`` and ``y`` must describe the same rows in the same order.

        Returns:
            * ``None`` when there are no rows;
            * the majority class label when all labels agree or no features
              are left (a leaf);
            * otherwise a ``Node``.  For a categorical split ``values[i]`` is
              the category of ``children[i]``.  For a numeric split the node
              has two children with ``values == [('>', t), ('<=', t)]``; rows
              that do not compare greater than ``t`` go to the second child.
        """
        self._log("new node:", x.shape, y.shape)

        labels = y.to_numpy().ravel()
        if labels.shape[0] == 0:
            self._log('terminated')
            return None

        labelCounts = Counter(labels.tolist())
        if len(labelCounts) == 1 or x.shape[1] == 0:
            self._log('terminated')
            return labelCounts.most_common(1)[0][0]

        bestFeature, thres = self.getOptimalFeature(x, y)
        # Resolve the name before the column is dropped; afterwards the
        # positional index would point at a different column.
        featureName = x.columns[bestFeature]
        threshold = thres[bestFeature]
        self._log("best Feature :", featureName)
        self._log("thres", thres)

        featureValues = x[featureName]
        remaining = x.drop(columns=featureName)

        if threshold is not None:
            above = (featureValues > threshold).to_numpy(dtype=bool)
            # One value per child so the Node length invariant holds.
            values = [('>', threshold), ('<=', threshold)]
            children = [
                self.constructNode(remaining.loc[above], y.loc[above]),
                self.constructNode(remaining.loc[~above], y.loc[~above]),
            ]
        else:
            values = []
            children = []
            # Group row positions so features and labels are split together;
            # dropna=False keeps rows with a missing category in the tree.
            positions = pd.Series(np.arange(x.shape[0]))
            for value, rows in positions.groupby(featureValues.to_numpy(), dropna=False):
                idx = rows.to_numpy()
                values.append(value)
                children.append(self.constructNode(remaining.iloc[idx], y.iloc[idx]))

        return Node(featureName, children, values)

    @staticmethod
    def predict(X: pd.DataFrame):
        """Placeholder: prediction is not implemented yet; currently returns ``None``."""
        pass
        

        

In [None]:
# Impurity measure used to score candidate splits; passed to DecisionTree as split_measure.
split="giniImpurity"
classifier=DecisionTree(split_measure=split)

In [None]:
# Small hand-checkable example for getOptimalThreshold.  Dedicated names are
# used so the dataset feature table bound to `X` is not replaced by a toy Series.
toy_feature = pd.Series(
    [60, 70, 75, 85, 90, 95, 100, 120, 125, 220], name='feature'
)

toy_label = pd.Series(
    ['no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'no'], name='label'
)

toy_result = classifier.getOptimalThreshold(toy_feature, toy_label)
print(toy_result)

# The best cut isolates the trailing run of 'no' labels: midway between 95 and 100.
assert toy_result[1] == 97.5

In [None]:


# Build the tree from the training split and keep its root.
RootNode=classifier.constructNode(X_train,y_train)