In [14]:
import pandas as pd
import numpy as np
import random
from pprint import pprint
import seaborn as sns

In [278]:
# Load the data set from a CSV file into the global DataFrame `Data`.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot be
# re-run on another machine until the location is made configurable.
Data = pd.read_csv("C:\\Users\\Mirzakhan Aliyev\\Desktop\\DT.csv",index_col = None)

# Train and Test Data Separation

In [279]:
def train_test_split (Data, test_size):
    """Randomly partition a DataFrame into a training part and a test part.

    Parameters
    ----------
    Data : pandas.DataFrame
        Rows to partition; rows are picked by their index labels.
    test_size : float or int
        A float is interpreted as the fraction of rows to hold out (rounded
        to a whole number of rows). Any other value is used directly as the
        number of rows to hold out.

    Returns
    -------
    tuple
        ``(train_Data, test_Data)`` - the rows that were not drawn, followed
        by the rows that were drawn (in the order they were drawn).
    """
    n_test_rows = round(test_size * len(Data)) if isinstance(test_size, float) else test_size

    # Sampling goes through the `random` module, so `random.seed` controls the split.
    held_out_labels = random.sample(Data.index.tolist(), n_test_rows)

    held_out_rows = Data.loc[held_out_labels]
    remaining_rows = Data.drop(held_out_labels)
    return remaining_rows, held_out_rows

# Writing the Helper functions for Tree Function

In [280]:
# Seed the `random` module so the split below is reproducible.
random.seed(0)
# An integer test_size of 1 holds out exactly one row as the test set.
train_Data, test_Data = train_test_split(Data, 1)
train_Data.head()

Unnamed: 0,Day,Outlook,Humidity,Wind,Play
0,D1,Sunny,High,Weak,No
1,D2,Sunny,High,Strong,No
2,D3,Overcast,High,Weak,Yes
3,D4,Rain,High,Weak,Yes
4,D5,Rain,Normal,Weak,Yes


In [281]:
# Array form of the training rows; the helper functions below index it as a 2-D array
# whose last column is the class label.
data = train_Data.values


### Checking The Purity of Columns

In [6]:
def check_purity (data):
    """Tell whether every row of ``data`` carries the same class label.

    The label is read from the last column of the 2-D array ``data``.

    Returns
    -------
    bool
        True when exactly one distinct label is present, False otherwise.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1


### Classifier Function

In [7]:
def Classify_Data (data):
    """Return the most frequent class label found in ``data``.

    Labels are taken from the last column of the 2-D array ``data``. When
    several labels share the highest count, the one that comes first in
    ``np.unique``'s sorted order is returned, because ``argmax`` picks the
    first maximum.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    return labels[label_counts.argmax()]


In [8]:
# Majority label among the training rows whose Outlook is 'Rain'.
Classify_Data(train_Data[train_Data['Outlook']=='Rain'].values)

'Yes'

### Potential Splits

In [216]:
def Get_Potential_Splits (data):
    """Collect the candidate split values for every feature column.

    Every column of the 2-D array ``data`` except the last one (the label)
    is treated as a feature. The kind of each feature is looked up in the
    global ``FEATURE_TYPES`` list.

    Returns
    -------
    dict
        Maps column index to its candidates. For a "continuous" column this
        is a list holding the midpoint of each pair of neighbouring distinct
        values; for any other column it is the array of distinct values.
    """
    candidate_splits = {}

    _, n_columns = data.shape
    for col in range(n_columns - 1):
        distinct = np.unique(data[:, col])

        if FEATURE_TYPES[col] == "continuous":
            # Midpoints between consecutive sorted distinct values.
            candidate_splits[col] = [
                (upper + lower) / 2
                for lower, upper in zip(distinct[:-1], distinct[1:])
            ]
        else:
            candidate_splits[col] = distinct

    return candidate_splits

#  This function lists the possible split points within each column; it can be used for both continuous and categorical
#  data

In [217]:
# NOTE(review): Get_Potential_Splits reads the global FEATURE_TYPES, which is only assigned
# inside decision_tree_algorithm (defined further down); run in file order on a fresh
# kernel, this cell fails until that function has been called once.
Get_Potential_Splits(train_Data.values)

{0: array(['D1', 'D10', 'D11', 'D12', 'D13', 'D15', 'D2', 'D3', 'D4', 'D5',
        'D6', 'D7', 'D8', 'D9'], dtype=object),
 1: array(['Overcast', 'Rain', 'Sunny'], dtype=object),
 2: array(['High', 'Normal'], dtype=object),
 3: array(['Strong', 'Weak'], dtype=object)}

### Split Data Function to be used in Entropy identification

In [206]:
def split_data (data, split_column, split_value):
    """Partition the rows of ``data`` on one feature column.

    The kind of the feature is looked up in the global ``FEATURE_TYPES``
    list at position ``split_column``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of rows.
    split_column : int
        Index of the column to split on.
    split_value
        Threshold (continuous feature) or category (any other feature).

    Returns
    -------
    tuple
        ``(data_below, data_above)``. For a "continuous" feature these are
        the rows with value ``<= split_value`` and the rows with value
        ``> split_value``. Otherwise they are the rows equal to
        ``split_value`` and the rows different from it.
    """
    split_column_values = data[:, split_column]

    if FEATURE_TYPES[split_column] == "continuous":
        data_below = data[split_column_values <= split_value]
        # Strictly greater: a row equal to the threshold belongs to the
        # "below" side only, so the two parts never share a row.
        data_above = data[split_column_values > split_value]
    else:
        # A categorical feature can only be split into "is this category" / "is not".
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]

    return data_below, data_above

In [231]:
# Weighted entropy obtained by splitting the training rows on column 3 at 'Weak'.
# NOTE(review): calculate_overall_entropy is defined in a later cell and split_data needs the
# global FEATURE_TYPES; run in file order on a fresh kernel, this cell raises NameError.
data_below, data_above = split_data(data, 3, 'Weak')
calculate_overall_entropy(data_below, data_above)
#data_above

0.8380423950607804

### Lowest Overall Entropy Determination

In [222]:
def calculate_entropy (data):
    """Shannon entropy, in bits, of the class labels in ``data``.

    Labels are read from the last column of the 2-D array ``data``.
    An input without rows yields 0.
    """
    class_counts = np.unique(data[:, -1], return_counts=True)[1]
    class_shares = class_counts / class_counts.sum()

    # The built-in sum is used on purpose: it returns 0 for an empty input.
    return sum(class_shares * (-np.log2(class_shares)))


In [230]:
def calculate_overall_entropy (data_below, data_above):
    """Size-weighted average entropy of the two parts of a split.

    Each part's entropy (from ``calculate_entropy``) is weighted by the
    share of rows that fell into that part.
    """
    total_rows = len(data_below) + len(data_above)

    share_below = len(data_below) / total_rows
    share_above = len(data_above) / total_rows

    weighted_below = share_below * calculate_entropy(data_below)
    weighted_above = share_above * calculate_entropy(data_above)
    return weighted_below + weighted_above

In [251]:
def determine_best_split (data, Potential_Splits, columns_nottobe_splitted):
    """Pick the split with the lowest size-weighted entropy.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of rows; passed on to ``split_data``.
    Potential_Splits : dict
        Maps column index to an iterable of candidate split values.
    columns_nottobe_splitted : int, or list/tuple/set of int
        Column index (or several indices) that must never be chosen.
        Any other value, e.g. ``None``, excludes nothing that is not equal to it.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``. Among candidates with
        equal entropy the one evaluated last wins (``<=`` comparison).

    Raises
    ------
    ValueError
        If no candidate split is available outside the excluded columns.
    """
    if isinstance(columns_nottobe_splitted, (list, tuple, set, frozenset)):
        excluded_columns = set(columns_nottobe_splitted)
    else:
        excluded_columns = {columns_nottobe_splitted}

    lowest_entropy = float("inf")
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in Potential_Splits.items():
        if column_index in excluded_columns:
            continue

        for value in candidate_values:
            data_below, data_above = split_data(data, split_column = column_index, split_value = value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            if current_overall_entropy <= lowest_entropy:
                lowest_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError("no candidate split available outside the excluded columns")

    return best_split_column, best_split_value

# Main Algorithm

### Determine the type of the feature

In [194]:
def determine_typeof_thefeature (Data, unique_value_treshold = 15):
    """Label every column of ``Data`` as "categorical" or "continuous".

    A column is "categorical" when it has at most ``unique_value_treshold``
    distinct values or when its first value is a string; otherwise it is
    "continuous".

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame whose columns are inspected (the label column included).
    unique_value_treshold : int, optional
        Largest number of distinct values for which a non-string column is
        still treated as categorical. Defaults to 15.

    Returns
    -------
    list of str
        One entry per column, in column order.
    """
    feature_types = []

    for column in Data.columns:
        column_values = Data[column]
        n_unique_values = len(column_values.unique())

        # Positional access: the first row need not carry index label 0
        # (e.g. after rows were dropped by a train/test split).
        first_is_text = len(column_values) > 0 and isinstance(column_values.iloc[0], str)

        if n_unique_values <= unique_value_treshold or first_is_text:
            feature_types.append("categorical")
        else:
            feature_types.append("continuous")

    return feature_types

In [189]:
feature_type = determine_typeof_thefeature(Data)
# Show each column name next to the type inferred for it.
for column, inferred_type in zip(Data.columns, feature_type):
    print(column, " - ", inferred_type)

Day  -  categorical
Outlook  -  categorical
Humidity   -  categorical
Wind  -  categorical
Label  -  categorical


### Representation of Tree Model

In [282]:
def decision_tree_algorithm(Data, counter = 0, min_samples = 2, max_depth = 5):
    """Recursively build a decision tree as nested dicts.

    On the first call (``counter == 0``) ``Data`` must be a DataFrame; its
    column names and inferred feature types are stored in the globals
    ``COLUMN_HEADERS`` and ``FEATURE_TYPES``, which the helper functions read.
    Recursive calls receive the 2-D array of the rows that reached the node.
    The last column is the class label, and column 0 is never used for splitting.

    Parameters
    ----------
    Data : pandas.DataFrame or numpy.ndarray
        Training rows (DataFrame on the first call, array afterwards).
    counter : int, optional
        Current depth; leave at 0 when calling from outside.
    min_samples : int, optional
        A node holding fewer rows than this becomes a leaf.
    max_depth : int, optional
        A node at this depth becomes a leaf.

    Returns
    -------
    dict or label
        Either a leaf (the majority class label) or
        ``{question: [yes_answer, no_answer]}``, where ``question`` is
        ``" <feature> <= <value> "`` for continuous features and
        ``" <feature> = <value> "`` otherwise.
    """
    global COLUMN_HEADERS, FEATURE_TYPES

    if counter == 0:
        COLUMN_HEADERS = Data.columns
        FEATURE_TYPES = determine_typeof_thefeature(Data)
        data = Data.values
    else:
        data = Data

    # Base case: pure node, too few rows, or depth limit reached -> majority class.
    if check_purity(data) or len(data) < min_samples or (counter == max_depth):
        return Classify_Data(data)

    counter += 1

    potential_splits = Get_Potential_Splits(data)

    # Without any candidate outside the excluded column 0 there is nothing to
    # split on, so the node becomes a leaf instead of failing in the search.
    has_candidate = any(
        len(values) > 0
        for column_index, values in potential_splits.items()
        if column_index != 0
    )
    if not has_candidate:
        return Classify_Data(data)

    split_column, split_value = determine_best_split(data, potential_splits, 0)
    data_below, data_above = split_data(data, split_column, split_value)

    # A split that sends every row to one side separates nothing; recursing
    # into the empty side would ask for the majority class of zero rows.
    if len(data_below) == 0 or len(data_above) == 0:
        return Classify_Data(data)

    feature_name = COLUMN_HEADERS[split_column]
    if FEATURE_TYPES[split_column] == "continuous":
        question = " {} <= {} ".format(feature_name, split_value)
    else:
        question = " {} = {} ".format(feature_name, split_value)

    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

    # Identical answers on both sides make the question pointless: collapse the node.
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}

In [283]:
# Build the tree and pretty-print the resulting nested dict.
# NOTE(review): this trains on the full `Data`, which still contains the row held out in `test_Data`.
tree = decision_tree_algorithm(Data)
pprint(tree)

{' Outlook = Overcast ': ['Yes',
                          {' Humidity = Normal ': [{' Wind = Weak ': ['Yes',
                                                                      {' Outlook = Sunny ': ['Yes',
                                                                                             'No']}]},
                                                   {' Outlook = Sunny ': ['No',
                                                                          {' Wind = Weak ': ['Yes',
                                                                                             'No']}]}]}]}


### Testing Algorithm

In [287]:
def classify_example (example, tree):
    """Classify one example by walking down a decision tree.

    Parameters
    ----------
    example : mapping
        A single row (e.g. a pandas Series or a dict) indexed by feature
        name. Passing a whole DataFrame is not supported.
    tree : dict or label
        ``{question: [yes_answer, no_answer]}`` with questions of the form
        ``" <feature> <= <number> "`` or ``" <feature> = <value> "``.
        A bare label (a tree that is a single leaf) is returned as is.

    Returns
    -------
    The label stored in the leaf that the example reaches.

    Raises
    ------
    ValueError
        If a question contains neither `` <= `` nor `` = ``.
    """
    node = tree

    while isinstance(node, dict):
        question = list(node.keys())[0]
        yes_answer, no_answer = node[question]

        # Questions are written with exactly one padding space on each side.
        # Only that padding is removed, so feature names and values that
        # themselves contain spaces survive intact.
        if question.startswith(" ") and question.endswith(" "):
            text = question[1:-1]
        else:
            text = question

        feature_name, separator, raw_value = text.rpartition(" <= ")
        threshold = None
        if separator:
            try:
                threshold = float(raw_value)
            except ValueError:
                # Not a numeric threshold, so treat the text as a categorical question.
                separator = ""

        if separator:
            goes_yes = example[feature_name] <= threshold
        else:
            feature_name, separator, raw_value = text.partition(" = ")
            if not separator:
                raise ValueError("cannot parse tree question: {!r}".format(question))
            goes_yes = str(example[feature_name]) == raw_value

        node = yes_answer if goes_yes else no_answer

    return node
    
    

In [293]:
# NOTE(review): `example` is first assigned in the next cell; run in file order on a
# fresh kernel, this cell raises NameError.
example

Unnamed: 0,Day,Outlook,Humidity,Wind,Play
13,D14,Rain,High,Strong,No


In [294]:
# Take the single held-out row as a Series, so that example[feature_name] is a
# scalar. With a one-row DataFrame the string comparison in classify_example
# is never true and every question is answered "no".
example = test_Data.iloc[0]

# Train on the training rows only, so the held-out example is unseen by the tree.
tree = decision_tree_algorithm(train_Data)
classify_example(example, tree)

'No'

### Accuracy

In [291]:
def calculate_accurac (Data, tree, label_column = "Label"):
    """Fraction of rows of ``Data`` that ``tree`` classifies correctly.

    ``Data`` is left untouched: no helper columns are added to it, so the
    same frame can still be passed to the tree builder afterwards.

    Parameters
    ----------
    Data : pandas.DataFrame
        Rows to classify; each row is passed to ``classify_example``.
    tree : dict or label
        Tree in the format expected by ``classify_example``.
    label_column : str, optional
        Name of the column holding the true labels. Defaults to "Label".

    Returns
    -------
    float
        Mean of the per-row correctness flags; NaN when ``Data`` has no rows.
    """
    if len(Data) == 0:
        return float("nan")

    predictions = Data.apply(classify_example, axis = 1, args = (tree,))
    is_correct = predictions == Data[label_column]
    return is_correct.mean()

In [292]:
# Inspect the global feature-type list assigned by the most recent decision_tree_algorithm call.
FEATURE_TYPES

['categorical', 'categorical', 'categorical', 'categorical', 'categorical']