#### Seperate the input columns

In [16]:
import pandas as pd
import random

df = pd.read_csv('ds_all.csv')
X = df.drop(columns=['Loan_Status', 'Loan_ID'], axis=1)
ran = random.randint(0, len(X.columns)-1)

#### Get the numerical value from string value -> client

In [17]:
from sklearn import preprocessing

def convert_string_columns_to_numeric(df):
    # Select all columns with string values
    string_columns = df.select_dtypes(include='object')

    # Create a label encoder for each string column
    label_encoders = {column: preprocessing.LabelEncoder() for column in string_columns}

    # Encode the string values in each column
    df[string_columns.columns] = string_columns.apply(lambda x: label_encoders[x.name].fit_transform(x))

    # Return the modified dataframe
    return df

numerical_column = convert_string_columns_to_numeric(X)
print(numerical_column)


    Education  Self_Employed  ApplicantIncome  LoanAmount  Credit_History  \
0           0              0             8334         160               1   
1           0              0             4583         128               1   
2           0              1             3000          66               1   
3           1              0             2583         120               1   
4           0              0             6000         141               1   
5           0              1             5417         267               1   
6           1              0             2333          95               1   
7           0              0             3036         158               0   
8           0              0             4006         168               1   
9           0              0            12841         349               1   
10          0              0             3200          70               1   
11          0              0             3660         187               1   

#### Get random split value -> Client
#### Get the randomly selected 50% input columns -> Master

In [18]:
import pandas as pd
import numpy as np

def select_random_features(df):
  # Get the number of columns in the dataframe
  n_cols = df.shape[1]

  # Select a random 50% of the columns
  col_indices = np.random.choice(range(n_cols), size=int(n_cols / 2), replace=False)

  # Return a new dataframe with only the selected columns
  return df.iloc[:, col_indices]
                
random_features = select_random_features(numerical_column)
print(random_features)

    Credit_History  Education  Property_Area
0                1          0              1
1                1          0              0
2                1          0              2
3                1          1              2
4                1          0              2
5                1          0              2
6                1          1              2
7                0          0              1
8                1          0              2
9                1          0              1
10               1          0              2
11               1          0              1
12               1          0              2
13               1          0              0
14               1          0              2
15               1          0              2
16               1          1              2
17               0          0              2
18               1          1              0
19               1          0              1
20               0          0              0
21        

#### Get random split value from min and max

In [22]:
import numpy as np

def get_random_value_from_range(column):
  # Get the minimum and maximum values of the column
  min_value = column.min()
  max_value = column.max()

  # Generate a random value between the min and max values
  random_value = np.random.uniform(min_value, max_value)

  return random_value


# Get the name of the first column
first_column_name = numerical_column.columns[1]
print(first_column_name)

# Get the first column using its name
first_column = numerical_column[first_column_name]

random_split_value = get_random_value_from_range(first_column)
print(random_split_value)

Self_Employed
0.6174726510409408


#### Split the sample space based on the split threashold

In [24]:
import pandas as pd

def split_dataframe_on_column_value(df, column_name, split_value):
  # Select rows with values less than or equal to the split value
  df1 = df[df[column_name] <= split_value]

  # Select rows with values greater than the split value
  df2 = df[df[column_name] > split_value]

  # Return the two dataframes as a tuple
  return df1, df2


df1, df2 = split_dataframe_on_column_value(numerical_column, first_column_name, random_split_value)

# Print the resulting dataframes
print(df1)
print(df2)

    Education  Self_Employed  ApplicantIncome  LoanAmount  Credit_History  \
0           0              0             8334         160               1   
1           0              0             4583         128               1   
3           1              0             2583         120               1   
4           0              0             6000         141               1   
6           1              0             2333          95               1   
7           0              0             3036         158               0   
8           0              0             4006         168               1   
9           0              0            12841         349               1   
10          0              0             3200          70               1   
11          0              0             3660         187               1   
12          0              0             3073         200               1   
13          0              0             1853         114               1   

#### Aggregate the label for the sperated datasets -> client run for each sperated dataframe

In [7]:
def get_column_value_counts(df, column_name):
  # Get the value counts of the column
  value_counts = df[column_name].value_counts()

  # Get the unique values and counts as two separate lists
  unique_values = value_counts.index.tolist()
  counts = value_counts.values.tolist()

  # Return the unique values and counts as a tuple
  return unique_values, counts

# Get the value counts of a column
unique_values, counts = get_column_value_counts(df2, 'Loan_Status')

# Print the unique values and counts
print(unique_values)
print(counts)

['Y', 'N']
[22, 14]


#### Sum the splited values - master

In [8]:
def gini_impurity_of_leaf(counts):
  # Get the size of the sample space
  n = sum(counts)

  # Calculate the Gini impurity
  impurity = 1 - sum((count / n) ** 2 for count in counts)

  return impurity

impurity = gini_impurity_of_leaf(counts)
print(impurity)

0.47530864197530853


#### Calculate gini impurity of node

In [9]:
def gini_impurity_of_node(gi1, count1, gi2, count2):
    n = count1 + count2
    
    avg_impurity = (gi1*count1)/n + (gi2*count2)/n
    
    return avg_impurity

avg_impurity = gini_impurity_of_node(impurity, sum(counts), impurity, sum(counts))
print(avg_impurity)

0.47530864197530853


#### Function for stopping condition

In [10]:
def isStoppingCondition(counts,treeheight):
    approvedLoan_count = counts[0]
    declinedLoan_count = counts[1]
    
    if approvedLoan_count >= declinedLoan_count:
        percentage = (approvedLoan_count/sum(counts))
    else:
        percentage = (declinedLoan_count/sum(counts))

    decision = (percentage >= .80) or (treeheight >= 3)
    return decision

treeheight = 2
print(isStoppingCondition(counts, treeheight))

False


#### Build the tree -> master

In [14]:
import random

def random_subset(lst):
    result = []
    for i in range(len(lst)//2):
        index = random.randint(0, len(lst)-1)
        result.append(lst[index])
    return result


class TreeNode:
    def __init__(self, feature=None, split_value=0, tree_height=0, leaf_value=None, left=None, right=None):
        self.feature = feature
        self.split_value = split_value
        self.tree_height = tree_height
        self.leaf_value = leaf_value
        self.left = left
        self.right = right

feature_list = ['Education','Self_Employed','ApplicantIncome','LoanAmount','Credit_History','Property_Area']

random_feature_set = random_subset(feature_list)
print(random_feature_set)




['ApplicantIncome', 'LoanAmount', 'LoanAmount']


In [None]:
import random

def get_random_element(arr):
    # Get a random index from the array
    random_index = random.randint(0, len(arr)-1)
    # Return the element at the random index
    return arr[random_index]


In [15]:
import fet_pb2

def get_global_lebel_count(aggregatedValueList):
    accumulated_count_left = [0,0]
    accumulated_count_right = [0,0]
    
    global_lebel_count: list[fet_pb2.GetAggregatedValuesFromClientResponse] = []
    
    for lebel in global_lebel_count:
        accumulated_count_left[0] += lebel.aggregatedValueLeft.approvedLoan
        accumulated_count_left[1] += lebel.aggregatedValueLeft.declinedLoan
        
        accumulated_count_right[0] += lebel.aggregatedValueRight.approvedLoan
        accumulated_count_right[1] += lebel.aggregatedValueRight.declinedLoan
        
    return accumulated_count_left, accumulated_count_right
            

ModuleNotFoundError: No module named 'google'

In [None]:
class GiniImpurity:
    def __init__(self, gini_impurity=0, feature='', split_value=0):
        self.gini_impurity = gini_impurity
        self.feature = feature
        self.split_value = split_value

In [None]:
def getSplitValueFromClient(client_id, feature_name):
    response = 0
    with grpc.insecure_channel(f'localhost:{client_id}') as channel:
        # Fet service client test
        stub = fet_pb2_grpc.MasterClientCommunicationServiceStub(channel)
        response = stub.GetRandomSplitValueFromClient(fet_pb2.GetRandomSplitValueFromClientRequest(feature=feature_name, clientId=client_id))
        channel.close()
    return response
     
def getAggregatedValueFromClient(client_id, feature_name, split_value, data_set):
    response = None
    with grpc.insecure_channel(f'localhost:{client_id}') as channel:
        stub = fet_pb2_grpc.MasterClientCommunicationServiceStub(channel)
        response = stub.GetAggregatedValuesFromClient(fet_pb2.GetAggregatedValuesFromClientRequest(feature=feature_name, 
                                                                                                   clientId=client_id, 
                                                                                                   splitValue=split_value,
                                                                                                   dataSet = data_set))
        channel.close()
    return response

def broadCastTreeNodeBasedOnBestSplit(feature, split_value, tree_height):
    with grpc.insecure_channel(f'localhost:{client_id}') as channel:
        stub = fet_pb2_grpc.MasterClientCommunicationServiceStub(channel)
        response = stub.BroadcastTreeNodesBasedOnBestSplit(
            fet_pb2.BroadcastTreeNodesBasedOnBestSplitRequest(feature=feature, splitValue=split_value, treeHeight=tree_height))
        channel.close()
    

In [None]:
def get_min_gini_impurity(arr: list[GiniImpurity]):
    # Set the initial minimum value to the first element in the list
    min_value = arr[0]
    # Iterate over the elements in the list
    for element in arr:
        # If the current element is less than the current minimum value, update the minimum value
        if element.gini_impurity < min_value.gini_impurity:
            min_value = element
    # Return the minimum value
    return min_value


In [None]:
# buildTree for master
# Append the tree inside the forest

forest = []


# for first time aggregated_label can be [1,1] or 50/50 
# dataset can be 'Full'
# tree_height = 0
def buildTree(feature_list, aggregated_label, tree_height, dataset):
    if isStoppingCondition(aggregated_label, tree_height):
        if (aggregated_label[0] >= aggregated_label[1]):
            return TreeNode(leaf_value='Y')
        else:
            return TreeNode(leaf_value='N')
        
    
    # Randomly choose feature subset
    random_feature_set = random_subset(feature_list)
    
    client_set = [8282, 8383]
    
    gini_impurity_list: list[GiniImpurity] = []
    
    
    for feature in random_feature_set:
        
        # get the split value for each client
        split_value_set = []
        for client in client_set:
            split_value_set.append(getSplitValueFromClient(feature, client))
        
        split_value = get_random_element(split_value_set)
        
        
        # gather sum_left and sum_right from each client
        aggregatedValueList = []
        for client in client_set:
            aggregatedValueList.append(getAggregatedValueFromClient(feature, client, split_value, dataset))
            
        CL, CR = get_global_lebel_count(aggregatedValueList)
        gini_impurity_value = gini_impurity_of_node(gini_impurity_of_leaf(CL), CL, gini_impurity_of_leaf(CR), CR)
        
        gini_impurity_list.append(GiniImpurity(gini_impurity=gini_impurity_value, feature=feature, split_value=split_value))
    
    currently_best_split_feature = get_min_gini_impurity(gini_impurity_list)
    
    # TODO broadcast best_split_feature
    broadCastTreeNodeBasedOnBestSplit(currently_best_split_feature.feature, currently_best_split_feature.split_value, tree_height)
    
    
    # calculate aggregated label
    bestAggregatedValueList = []
    for client in client_set:
        bestAggregatedValueList.append(
            getAggregatedValueFromClient(currently_best_split_feature.feature, 
                                        client, 
                                        currently_best_split_feature.split_value,
                                        dataset))

    BCL, BCR = get_global_lebel_count(bestAggregatedValueList)
    
    
    node = TreeNode(feature=currently_best_split_feature.feature, 
                    split_value=currently_best_split_feature.split_value, 
                    tree_height=tree_height)
    
    # for key of data we can use f'{tree_height}_left' and f'{tree_height}_right'
    
    new_tree_height = tree_height+1
    node.left = buildTree(feature_list, BCL, new_tree_height, f'{new_tree_height}_left')
    node.right = buildTree(feature_list, BCR, new_tree_height, f'{new_tree_height}_right')
    return node
        
