### TOWARDS DECENTRALIZED PRICING MODELS WITH FEDERATED LEARNING: A STUDY ON CALIFORNIA HOUSING DATASET
Batuhan Avcı - 101629010

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import networkx as nx
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

### Data Preprocessing and Visualization

Read the data.

In [None]:
# Load the housing data; the CSV is expected in the notebook's working directory.
# (Plain string literal: the original f-string had no placeholders.)
houses = pd.read_csv('housing.csv')
houses.head()

In [None]:
# Scatter every data point by its coordinates for a first look at the geography.
plt.figure(figsize=(12, 8))
# No `hue` variable is mapped here, so `palette` and `legend` had no effect
# (seaborn ignores the palette and warns about it); they are left out.
sns.scatterplot(x='longitude', y='latitude', data=houses)
plt.title('Houses in California')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(alpha=0.2)
plt.show()

Check for missing values.

In [None]:
# Per-column count of missing entries.
houses.isna().sum()

There are missing values in the `total_bedrooms` column. Fill them with the column median.

In [None]:
# Impute missing bedroom counts with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that is chained assignment,
# which pandas deprecates and which leaves `houses` unchanged under Copy-on-Write.
houses['total_bedrooms'] = houses['total_bedrooms'].fillna(houses['total_bedrooms'].median())
houses.isnull().sum()

Using the latitude and longitude columns, I perform agglomerative hierarchical clustering to split the data into 9 clusters. Each cluster will be used as a node in the graph. This process adds 3 columns to the data: area_index, central_lat, and central_lon. The area_index column identifies the cluster that each data point belongs to. The central_lat and central_lon columns hold the mean coordinates of that cluster and are used to plot the nodes on the graph.

In [None]:
# Group the data points into geographic areas with agglomerative hierarchical
# clustering on their coordinates.
# The number of areas is defined once so that the clustering and the centroid
# computation cannot drift apart.
n_clusters = 9

# Extract latitude and longitude
lat_lon = houses[['latitude', 'longitude']]

agg = AgglomerativeClustering(n_clusters=n_clusters)
houses['area_index'] = agg.fit_predict(lat_lon)

# Mean latitude/longitude of every area. groupby sorts by the area label, so
# row i of `area_centroids` belongs to area i.
area_centroids = houses.groupby('area_index')[['latitude', 'longitude']].mean()
# [[lat, lon], ...] indexed by area label; used for plotting the centers.
cluster_centers = area_centroids.to_numpy().tolist()

# Attach each row's area centroid as two extra columns.
houses['central_lat'] = houses['area_index'].map(area_centroids['latitude'])
houses['central_lon'] = houses['area_index'].map(area_centroids['longitude'])

# Display the updated houses dataset with the new columns
houses.head()

Let's visualize the data based on the clusters.

In [None]:
# Data points coloured by area, with the area centers overlaid in black.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=houses, x='longitude', y='latitude', hue='area_index',
                palette='coolwarm', legend=True, ax=ax)

# cluster_centers holds [lat, lon] pairs; the x axis is longitude.
center_lats, center_lons = zip(*cluster_centers)
ax.scatter(center_lons, center_lats, color='black', s=100)

ax.set_title('Clustered Areas in California (9 clusters with centers in black)', fontsize=15)
ax.set_xlabel('Longitude', fontsize=16)
ax.set_ylabel('Latitude', fontsize=16)
ax.tick_params(axis='both', labelsize=12)
ax.grid(alpha=0.2)
ax.legend(fontsize=12)
plt.show()

Plot the histogram: the number of houses in each cluster.

In [None]:
# Number of houses in each area.
# `area_index` is a discrete label, so draw exactly one bar per label
# (discrete=True) instead of 30 arbitrary bins, and skip the KDE curve, which
# is only meaningful for a continuous variable.
plt.figure(figsize=(12, 8))
sns.histplot(houses['area_index'], discrete=True)
plt.title('Number of Houses in Cluster')
plt.xlabel('Area Index')
plt.ylabel('Number of Houses')
plt.grid(alpha=0.2)
plt.show()


In [None]:
# How many rows fall into each area (largest area first).
houses.area_index.value_counts()

The column 'ocean_proximity' is a categorical column. I will convert it to a numerical column by using one-hot encoding. But first, let's visualize the data based on the 'ocean_proximity' column.

In [None]:
# Data points coloured by their ocean-proximity category.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=houses, x='longitude', y='latitude', hue='ocean_proximity',
                palette='colorblind', legend=True, ax=ax)
ax.set_title('Ocean Proximity For Each Data Point')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(alpha=0.2)
plt.show()

Perform one-hot encoding.

In [None]:
# Expand 'ocean_proximity' into one indicator column per category.
categorical_columns = ['ocean_proximity']
houses_encoded = pd.get_dummies(houses, columns=categorical_columns)

To decide which columns to use for the model, I will check the correlation matrix.

In [None]:
# Numerical columns (these are the ones min-max scaled per area further below).
numerical_features_encoded = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
]

# Indicator columns produced by the one-hot encoding of 'ocean_proximity'.
ocean_proximity_dummies = [
    'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
    'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN',
]

# Columns added by the clustering step.
area_features = ['area_index', 'central_lat', 'central_lon']

# Every column shown in the correlation matrix; the target comes last.
all_features = (
    numerical_features_encoded
    + ocean_proximity_dummies
    + area_features
    + ['median_house_value']
)

# Pairwise correlations between all selected columns.
correlation_matrix = houses_encoded[all_features].corr()

# Heatmap of the correlation matrix, annotated with two decimals.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()


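Normalize the features with a min-max scaler: the cluster-level columns (area_index, central_lat, central_lon) are scaled over the whole dataset, while the remaining numerical features are scaled separately within each area. The target variable is not min-max scaled; it is only divided by 10,000.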

In [None]:
# Work on a copy so that `houses_encoded` keeps the raw values and re-running
# this cell does not divide the target a second time.
houses_normalized_encoded = houses_encoded.copy()

# Express the target in units of $10,000 (the label is not min-max scaled).
houses_normalized_encoded['median_house_value'] = houses_encoded['median_house_value'] / 10000

# Features to be scaled for the whole dataset
whole_dataset_features = ['area_index', 'central_lat', 'central_lon']

# Apply Min-Max scaling to the specified features for the whole dataset
scaler = MinMaxScaler()
houses_normalized_encoded[whole_dataset_features] = scaler.fit_transform(houses_encoded[whole_dataset_features])


def scale_within_area(group):
    """Return a copy of `group` with the numerical features min-max scaled.

    A fresh MinMaxScaler is fitted on the rows of `group` only, so the columns
    listed in `numerical_features_encoded` are scaled relative to the minimum
    and maximum found inside that group. All other columns are returned
    unchanged, and the input frame itself is not modified.
    """
    scaled = group.copy()
    scaled[numerical_features_encoded] = MinMaxScaler().fit_transform(scaled[numerical_features_encoded])
    return scaled


# Scale every area separately and stack the areas back together in ascending
# order of their (scaled) area label with a fresh 0..n-1 index.
# Iterating over the groups passes the complete rows, including 'area_index',
# to the helper; groupby(...).apply() is avoided because whether it hands the
# grouping column to the function (and in which row order it returns the
# result) depends on the pandas version.
houses_normalized_encoded = pd.concat(
    [scale_within_area(area_rows) for _, area_rows in houses_normalized_encoded.groupby('area_index')],
    ignore_index=True,
)

houses_normalized_encoded.head()

Drop the target variable from the features.

In [None]:
# Feature table: everything except the label column.
X = houses_normalized_encoded.drop(columns=['median_house_value'])

### Helper Functions

In [None]:
def plotGraph(G_houses):
    """Plot the nodes and edges of the area graph on a longitude/latitude plane.

    Parameters
    ----------
    G_houses : graph
        Graph whose nodes carry a 'coord' attribute holding a
        (latitude, longitude) pair. Only `.nodes` and `.edges` are read.

    Coordinates are looked up by node label rather than by array position, so
    the node labels do not have to be the integers 0..n-1. The plot is drawn on
    the current matplotlib figure and shown with plt.show().
    """
    coords = {node: G_houses.nodes[node]['coord'] for node in G_houses.nodes}

    # Draw nodes (x axis is longitude, y axis is latitude).
    for node, (lat, lon) in coords.items():
        plt.scatter(lon, lat, color='blue', s=50, zorder=5)  # zorder ensures nodes are on top of edges
        plt.text(lon+0.016, lat+0.027, str(node), fontsize=8, ha='center', va='center', color='black', fontweight='bold')

    # Draw edges
    for start, end in G_houses.edges:
        (start_lat, start_lon), (end_lat, end_lon) = coords[start], coords[end]
        plt.plot([start_lon, end_lon], [start_lat, end_lat], linestyle='-', color='gray', alpha=0.5)

    plt.xlabel('longitude')
    plt.ylabel('latitude')
    plt.title('Clustered Areas in California')
    plt.grid(alpha=0.2)
    plt.show()
    
    
# The function connects each cluster with
# its nearest neighbours.
def add_edges(graph, numneighbors=2):
    """Add an edge from every node to its `numneighbors` nearest nodes.

    Parameters
    ----------
    graph : graph
        Graph whose nodes carry a 'coord' attribute; the coordinates are read
        from this graph (the original read them from the global `G_houses`,
        ignoring the argument).
    numneighbors : int, default 2
        Number of nearest neighbours each node is connected to.

    Returns
    -------
    The same `graph` object, modified in place. Because the graph is
    undirected in this notebook, a node can end up with more than
    `numneighbors` edges when it is among the nearest neighbours of others.
    """
    nodes = list(graph.nodes)
    # Get the coordinates of the clusters, in node order.
    coords = np.array([graph.nodes[node]['coord'] for node in nodes])

    A = kneighbors_graph(coords, numneighbors, mode='connectivity', include_self=False)

    # Visit only the non-zero entries of the sparse connectivity matrix; the
    # pairs are sorted so edges are inserted in the same (row, column) order
    # as a full nested loop over all index pairs would produce.
    rows, cols = A.nonzero()
    for i, j in sorted(zip(rows.tolist(), cols.tolist())):
        if i != j:
            graph.add_edge(nodes[i], nodes[j])
    return graph


def ExtractFeatureMatrixLabelVector(data):
    """Build the feature matrix and label vector from a housing DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 16 feature columns listed below and the label column
        'median_house_value'. All of them must be convertible to float
        (boolean indicator columns become 0.0 / 1.0).

    Returns
    -------
    X : numpy.ndarray of shape (len(data), 16), dtype float
        One row per data point; the columns follow `feature_columns` below.
    y : numpy.ndarray of shape (len(data), 1), dtype float
        The 'median_house_value' column.

    The columns are converted in one vectorized step instead of a Python loop
    with per-cell `.iloc` lookups. Both arrays are fresh, C-ordered copies.
    """
    # The column order defines the meaning of each weight in the linear models.
    feature_columns = [
        'latitude', 'longitude', 'central_lat', 'central_lon',
        'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
        'median_income', 'households',
        'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
        'area_index',
        'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN',
    ]

    X = np.array(data[feature_columns].to_numpy(dtype=float), order='C')
    # Double brackets keep the label two-dimensional: shape (n, 1).
    y = np.array(data[['median_house_value']].to_numpy(dtype=float), order='C')

    return X, y

In [None]:
# Seed passed to every shuffle/split below.
seed = 11
# Number of distinct area labels; one graph node is created per label.
num_areas = houses_normalized_encoded['area_index'].unique().size
print(f'num_areas={num_areas}')

ExtractFeatureMatrixLabelVector function is used to extract the feature matrix and label vector from the data.

In [None]:
# Feature matrix and label vector of the complete dataset (all areas together).
X, y = ExtractFeatureMatrixLabelVector(data=houses_normalized_encoded)


Create the graph.

In [None]:
from sklearn.utils import shuffle

# Create a networkX graph with one node per area.
G_houses = nx.Graph()
G_houses.add_nodes_from(range(0, num_areas))

# Per-area splits are collected here and stacked once after the loop, so that
# the benchmark model can be trained/evaluated on the pooled data of all areas.
pooled_splits = {name: [] for name in ('X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test')}

for i, area in enumerate(houses_normalized_encoded.area_index.unique()):
    # Rows that belong to the current area.
    area_data = houses_normalized_encoded[houses_normalized_encoded.area_index == area]

    # Extract features and labels of this area.
    X_local, y_local = ExtractFeatureMatrixLabelVector(area_data)

    # Shuffle the local data.
    X_local, y_local = shuffle(X_local, y_local, random_state=seed)

    # First split: 60% training, 40% held out.
    X_train, X_val, y_train, y_val = train_test_split(X_local, y_local, test_size=0.4, random_state=seed)
    print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
    # Second split of the held-out part: 3/8 validation, 5/8 test
    # (i.e. 15% validation and 25% test of the local data).
    X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=5/8, random_state=seed)

    local_splits = {
        'X_train': X_train, 'X_val': X_val, 'X_test': X_test,
        'y_train': y_train, 'y_val': y_val, 'y_test': y_test,
    }
    for name, split in local_splits.items():
        pooled_splits[name].append(split)

    # Store the local dataset and the initial model parameters at node i.
    node = G_houses.nodes[i]
    node['samplesize'] = len(y_local)
    node['coord'] = (area_data.central_lat.unique()[0], area_data.central_lon.unique()[0])
    node['X_train'] = X_train  # training feature matrix of the local dataset
    node['y_train'] = y_train  # training label vector of the local dataset
    node['X_val'] = X_val      # validation feature matrix of the local dataset
    node['y_val'] = y_val      # validation label vector of the local dataset
    node['X_test'] = X_test    # test feature matrix of the local dataset
    node['y_test'] = y_test    # test label vector of the local dataset
    # Zero-initialised weight vector, one entry per feature column.
    node['weights'] = np.zeros((X_train.shape[1], 1))

# Pooled train/validation/test sets over all areas, used to compare the FL
# models with a linear regression on the whole dataset.
X_train_all = np.vstack(pooled_splits['X_train'])
X_val_all = np.vstack(pooled_splits['X_val'])
X_test_all = np.vstack(pooled_splits['X_test'])
y_train_all = np.vstack(pooled_splits['y_train'])
y_val_all = np.vstack(pooled_splits['y_val'])
y_test_all = np.vstack(pooled_splits['y_test'])

### For Benchmark Model
I will use a linear regression model trained on the pooled data of all areas as the benchmark. I want to see how well the FL algorithms perform compared to this model.

In [None]:
# Benchmark: one linear regression fitted on the pooled training data of all areas.
model = LinearRegression()
model.fit(X_train_all, y_train_all)

# Predictions for the pooled training, validation and test sets.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features) for features in (X_train_all, X_val_all, X_test_all)
)

# Mean squared error of each split.
mse_train, mse_val, mse_test = (
    mean_squared_error(truth, prediction)
    for truth, prediction in (
        (y_train_all, y_train_pred),
        (y_val_all, y_val_pred),
        (y_test_all, y_test_pred),
    )
)

print(f'Mean Squared Error on Training Set: {mse_train:.2f}')
print(f'Mean Squared Error on Validation Set: {mse_val:.2f}')
print(f'Mean Squared Error on Test Set: {mse_test:.2f}')

### FedGD Algorithm

In [None]:
def FedGD(G_houses, alpha, learning_rate, max_iter=1000):
    """Run synchronous federated gradient descent on a copy of the graph.

    Parameters
    ----------
    G_houses : graph
        Every node must carry 'X_train', 'y_train', 'X_val', 'y_val' and
        'weights'. Training starts from the 'weights' found on the nodes.
        Node labels are used as indices into the error arrays, so they are
        expected to be the integers 0..n-1.
    alpha : float
        Strength of the term that pulls a node's weights towards the weights
        of its neighbours.
    learning_rate : float
        Step size of each update.
    max_iter : int, default 1000
        Number of synchronous update rounds.

    Returns
    -------
    (avg_train_error, avg_val_error, trained_graph) : the mean over all nodes
    of the local training and validation MSE, and the graph copy holding the
    trained 'weights'.
    """
    G_houses_gd = G_houses.copy()
    node_count = len(G_houses.nodes)
    coupling = 2 * alpha

    for _ in range(max_iter):
        # Phase 1: compute every node's update from the current weights.
        for node in G_houses_gd.nodes:
            attrs = G_houses_gd.nodes[node]
            features = attrs['X_train']
            labels = attrs['y_train']
            w_own = attrs['weights']

            # First term of Equation 5.9: local squared-error term.
            residual = labels - features.dot(w_own)
            local_term = (2 / len(labels)) * features.T.dot(residual)

            # Second term of Equation 5.9: differences to the neighbours' weights.
            neighbour_term = 0
            for other in G_houses_gd.neighbors(node):
                neighbour_term += G_houses_gd.nodes[other]['weights'] - w_own
            neighbour_term *= coupling

            # Equation 5.9. Stored under 'newweights' so that the remaining
            # nodes of this round still see the previous weights.
            attrs['newweights'] = w_own + learning_rate * (local_term + neighbour_term)

        # Phase 2: all nodes switch to their new weights at once.
        for node in G_houses_gd.nodes:
            attrs = G_houses_gd.nodes[node]
            attrs['weights'] = attrs['newweights']

    # Local training and validation MSE of every node.
    train_errors = np.zeros(node_count)
    val_errors = np.zeros(node_count)
    for node in G_houses_gd.nodes:
        attrs = G_houses_gd.nodes[node]
        w_final = attrs['weights']
        train_errors[node] = mean_squared_error(attrs['y_train'], attrs['X_train'].dot(w_final))
        val_errors[node] = mean_squared_error(attrs['y_val'], attrs['X_val'].dot(w_final))

    return np.mean(train_errors), np.mean(val_errors), G_houses_gd

Perform a grid search to find the best hyperparameters.

In [None]:
# Hyperparameter grid for FedGD.
num_neighbours_list = [3, 4, 5]
alpha_list = [0.5, 0.2, 0.1]
learning_rate_list = [0.1, 0.05, 0.01]

# One record per configuration:
# (num_neighbours, alpha, learning_rate, avg_train_error, avg_val_error, trained_graph)
results = []

for num_neighbours in num_neighbours_list:
    # The edges depend only on the neighbour count, so they are built once per count.
    G_houses_c = add_edges(G_houses.copy(), numneighbors=num_neighbours)
    print("\nThe empirical graph is connected:", nx.is_connected(G_houses_c))

    for alpha, learning_rate in [(a, lr) for a in alpha_list for lr in learning_rate_list]:
        avg_train_error, avg_val_error, trained_graph = FedGD(G_houses_c, alpha, learning_rate)

        # Report and record the average training and validation errors.
        print(f"num_neighbours: {num_neighbours}, alpha: {alpha}, learning_rate: {learning_rate}, avg_train_error: {avg_train_error}, avg_val_error: {avg_val_error}")
        results.append((num_neighbours, alpha, learning_rate, avg_train_error, avg_val_error, trained_graph))

# Lowest average validation error first.
results.sort(key=lambda record: record[4])
print("\nBest hyperparameters found:")
best_params_gd = results[0]
print(best_params_gd)

Using the hyperparameters that performed best on the validation set, evaluate the model on the test set.

In [None]:
# Best FedGD configuration: the first three entries are the hyperparameters and
# the last entry is the trained graph. The two error values in between are
# skipped by slicing rather than by assigning them to `_`, because `_` is the
# name IPython uses for the last cell output.
best_num_neighbours_gd, best_alpha_gd, best_learning_rate_gd = best_params_gd[:3]
best_trained_graph_gd = best_params_gd[-1]
plotGraph(best_trained_graph_gd)

In [None]:
# Per-node errors of the best FedGD model on all three splits.
test_errors = np.zeros(num_areas)
train_errors = np.zeros(num_areas)
val_errors = np.zeros(num_areas)

for station in best_trained_graph_gd.nodes:
    # Local data and trained weights of the current node.
    node_data = best_trained_graph_gd.nodes[station]
    X_test, y_test = node_data['X_test'], node_data['y_test']
    X_train, y_train = node_data['X_train'], node_data['y_train']
    X_val, y_val = node_data['X_val'], node_data['y_val']
    w = node_data['weights']

    # Mean squared error of the local linear model on each split.
    test_errors[station] = mean_squared_error(y_test, X_test.dot(w))
    train_errors[station] = mean_squared_error(y_train, X_train.dot(w))
    val_errors[station] = mean_squared_error(y_val, X_val.dot(w))

    print("For node ", station)
    print("-------------------")

    print(f"Train error for node {station}: {train_errors[station]}")
    print(f"Validation error for node {station}: {val_errors[station]}")
    print(f"Test error for node {station}: {test_errors[station]}")

# Average test error over all nodes, next to the errors recorded during the grid search.
avg_test_error = np.mean(test_errors)
print("\nRecall: The average training error:", best_params_gd[3])
print("Recall: The average validation error:", best_params_gd[4])
print("The average test error:", avg_test_error)

# Node-averaged training and validation errors, recomputed here.
print(f'Mean Squared Error on Training Set: {np.mean(train_errors):.2f}')
print(f'Mean Squared Error on Validation Set: {np.mean(val_errors):.2f}')



See the prediction results on the test set for node 3.

In [None]:
# Node whose test-set predictions are inspected.
ind = 3

node_data = best_trained_graph_gd.nodes[ind]
# These are the TEST labels; they get their own name so that the validation
# labels in `y_val` are not overwritten with test data.
y_true = node_data['y_test']
y_pred = node_data['X_test'].dot(node_data['weights'])

# Select a subset of data points for better visualization
subset_size = min(100, len(y_true))  # Use up to 100 points or less if the dataset is smaller
indices = np.arange(subset_size)
y_true_subset = y_true[:subset_size]
y_pred_subset = y_pred[:subset_size]

# Plot true vs predicted values
plt.figure(figsize=(14, 6))
plt.plot(indices, y_true_subset, label='True values', marker='o')
plt.plot(indices, y_pred_subset, label='Predicted values', marker='x')
plt.xlabel('Data points')
plt.ylabel('Price (in $10000)')
plt.title('True and Predicted Values FedGD')
plt.legend()
plt.grid(alpha=0.2)
plt.show()


### FedSGD Algorithm

In [None]:
def FedSGD(G_houses, alpha, learning_rate, batch_size, max_iter=1000):
    """Run synchronous federated stochastic gradient descent on a copy of the graph.

    Parameters
    ----------
    G_houses : graph
        Every node must carry 'X_train', 'y_train', 'X_val' and 'y_val'.
        The weights are reset to zeros before training. Node labels are used
        as indices into the error arrays, so they are expected to be the
        integers 0..n-1.
    alpha : float
        Strength of the term that pulls a node's weights towards the weights
        of its neighbours.
    learning_rate : float
        Step size of each update.
    batch_size : int
        Number of consecutive training rows used per update. Each node walks
        through its training set batch by batch and starts over from the first
        row once the next batch would begin past the end.
    max_iter : int, default 1000
        Number of synchronous update rounds.

    Returns
    -------
    (avg_train_error, avg_val_error, trained_graph) : the mean over all nodes
    of the local training and validation MSE (computed on the full local
    sets), and the graph copy holding the trained 'weights'.
    """
    G_houses_sgd = G_houses.copy()
    num_areas = len(G_houses.nodes)

    # Start every node from zero weights and from the first batch.
    for station in G_houses_sgd.nodes:
        G_houses_sgd.nodes[station]['weights'] = np.zeros((G_houses_sgd.nodes[station]['X_train'].shape[1], 1))
        G_houses_sgd.nodes[station]['curr_batch_start'] = 0

    for i in range(max_iter):
        # Iterate over all nodes.
        for current_node in G_houses_sgd.nodes:
            # Extract the training data from the current node.
            X_train = G_houses_sgd.nodes[current_node]['X_train']
            y_train = G_houses_sgd.nodes[current_node]['y_train']
            w_current = G_houses_sgd.nodes[current_node]['weights']
            training_size = len(y_train)

            # Get the batched features and labels
            curr_batch_start = G_houses_sgd.nodes[current_node]['curr_batch_start']
            X_train_batch = X_train[curr_batch_start:(curr_batch_start+batch_size)]
            y_train_batch = y_train[curr_batch_start:(curr_batch_start+batch_size)]

            # update batch start for the next iteration
            curr_batch_start = curr_batch_start + batch_size
            # if the next batch would start outside the training set, start over from the first datapoint
            if curr_batch_start >= training_size:
                curr_batch_start = 0
            G_houses_sgd.nodes[current_node]['curr_batch_start'] = curr_batch_start

            # Compute the first term of the Equation 5.11.
            # The slice is shorter than `batch_size` for the last batch of a
            # pass (or when batch_size exceeds the training set), so the
            # gradient is averaged over the rows actually in the batch.
            # max(..., 1) only guards the division for an empty batch.
            effective_batch_size = max(len(y_train_batch), 1)
            term_1 = (2 / effective_batch_size) * X_train_batch.T.dot(y_train_batch - X_train_batch.dot(w_current))

            # Compute the second term of the Equation 5.11
            # by receiving neighbors' weight vectors.
            term_2 = 0
            neighbors = list(G_houses_sgd.neighbors(current_node))
            for neighbor in neighbors:
                w_neighbor = G_houses_sgd.nodes[neighbor]['weights']
                term_2 += w_neighbor - w_current
            term_2 *= 2 * alpha
            # Equation 5.11
            w_updated = w_current + learning_rate * (term_1 + term_2)

            # Update the current weight vector but do not overwrite the
            # "weights" attribute as we need to do all updates synchronously, i.e.,
            # using the previous local params
            G_houses_sgd.nodes[current_node]['newweights'] = w_updated

        # After computing the new local params for each node, we now update
        # the node attribute 'weights' for all nodes
        for node_id in G_houses_sgd.nodes:
            G_houses_sgd.nodes[node_id]['weights'] = G_houses_sgd.nodes[node_id]['newweights']

    # Create the storages for the training and validation errors.
    train_errors = np.zeros(num_areas)
    val_errors = np.zeros(num_areas)

    # Iterate over all nodes.
    for station in G_houses_sgd.nodes:
        # Extract the data of the current node.
        X_train = G_houses_sgd.nodes[station]['X_train']
        y_train = G_houses_sgd.nodes[station]['y_train']
        X_val = G_houses_sgd.nodes[station]['X_val']
        y_val = G_houses_sgd.nodes[station]['y_val']
        w = G_houses_sgd.nodes[station]['weights']

        # Compute and store the training and validation errors.
        train_errors[station] = mean_squared_error(y_train, X_train.dot(w))
        val_errors[station] = mean_squared_error(y_val, X_val.dot(w))

    # Output the average training and validation errors.
    avg_train_error = np.mean(train_errors)
    avg_val_error = np.mean(val_errors)

    return avg_train_error, avg_val_error, G_houses_sgd


Perform a grid search to find the best hyperparameters.

In [None]:
# Hyperparameter grid for FedSGD.
num_neighbours_list = [3, 4, 5]
alpha_list = [0.5, 0.2, 0.1]
learning_rate_list = [0.1, 0.05, 0.01]
batch_size_list = [16, 32]

# One record per configuration:
# (num_neighbours, alpha, learning_rate, batch_size, avg_train_error, avg_val_error, trained_graph)
results = []

for num_neighbours in num_neighbours_list:
    # The edges depend only on the neighbour count, so they are built once per count.
    G_houses_c2 = add_edges(G_houses.copy(), numneighbors=num_neighbours)
    print("\nThe empirical graph is connected:", nx.is_connected(G_houses_c2))

    inner_grid = [
        (a, lr, bs)
        for a in alpha_list
        for lr in learning_rate_list
        for bs in batch_size_list
    ]
    for alpha, learning_rate, batch_size in inner_grid:
        avg_train_error, avg_val_error, trained_graph = FedSGD(G_houses_c2, alpha, learning_rate, batch_size)
        print(f"num_neighbours: {num_neighbours}, alpha: {alpha}, learning_rate: {learning_rate}, batch_size: {batch_size}, avg_train_error: {avg_train_error}, avg_val_error: {avg_val_error}")
        results.append((num_neighbours, alpha, learning_rate, batch_size, avg_train_error, avg_val_error, trained_graph))

# Lowest average validation error first.
results.sort(key=lambda record: record[5])
print("\nBest hyperparameters found:")
best_params_sgd = results[0]
print('best_num_neighbours:', best_params_sgd[0], 'best_alpha:', best_params_sgd[1], 'best_learning_rate:', best_params_sgd[2], 'best_batch_size:', best_params_sgd[3], 'avg_train_error:', best_params_sgd[4], 'avg_val_error:', best_params_sgd[5])

Using the hyperparameters that performed best on the validation set, evaluate the model on the test set.

In [None]:
# Best FedSGD configuration: the first four entries are the hyperparameters and
# the last entry is the trained graph. The two error values in between are
# skipped by slicing rather than by assigning them to `_`, because `_` is the
# name IPython uses for the last cell output.
best_num_neighbours_sgd, best_alpha_sgd, best_learning_rate_sgd, best_batch_size_sgd = best_params_sgd[:4]
best_trained_graph_sgd = best_params_sgd[-1]
plotGraph(best_trained_graph_sgd)

In [None]:
# Per-node errors of the best FedSGD model on all three splits.
test_errors = np.zeros(num_areas)
train_errors = np.zeros(num_areas)
val_errors = np.zeros(num_areas)

# Iterate over all nodes.
for station in best_trained_graph_sgd.nodes:
    # Extract the data of the current node. Everything is read from the FedSGD
    # graph; the original took `y_train` from the FedGD graph by mistake, which
    # made this cell depend on the FedGD section having been run.
    node_data = best_trained_graph_sgd.nodes[station]
    X_test = node_data['X_test']
    y_test = node_data['y_test']
    X_train = node_data['X_train']
    y_train = node_data['y_train']
    X_val = node_data['X_val']
    y_val = node_data['y_val']

    w = node_data['weights']

    # Compute and store the errors of the local linear model on each split.
    test_errors[station] = mean_squared_error(y_test, X_test.dot(w))
    train_errors[station] = mean_squared_error(y_train, X_train.dot(w))
    val_errors[station] = mean_squared_error(y_val, X_val.dot(w))
    print("For node ", station)
    print("-------------------")

    print(f"Train error for node {station}: {train_errors[station]}")
    print(f"Validation error for node {station}: {val_errors[station]}")
    print(f"Test error for node {station}: {test_errors[station]}")

# Output the average test error next to the errors recorded during the grid search.
avg_test_error = np.mean(test_errors)
print("\nRecall: The average training error:", best_params_sgd[4])
print("Recall: The average validation error:", best_params_sgd[5])
print("The average test error:", avg_test_error)


In [None]:
# Node whose test-set predictions are inspected.
ind = 3

node_data = best_trained_graph_sgd.nodes[ind]
# These are the TEST labels; they get their own name so that the validation
# labels in `y_val` are not overwritten with test data.
y_true = node_data['y_test']
y_pred = node_data['X_test'].dot(node_data['weights'])

# Select a subset of data points for better visualization
subset_size = min(100, len(y_true))  # Use up to 100 points or less if the dataset is smaller
indices = np.arange(subset_size)
y_true_subset = y_true[:subset_size]
y_pred_subset = y_pred[:subset_size]

# Plot true vs predicted values
plt.figure(figsize=(14, 6))
plt.plot(indices, y_true_subset, label='True values', marker='o')
plt.plot(indices, y_pred_subset, label='Predicted values', marker='x')
plt.xlabel('Data points')
plt.ylabel('Price (in $10000)')
plt.title('True and Predicted Values FedSGD')
plt.legend()
plt.grid(alpha=0.2)
plt.show()



## References

A. Jung, “Lecture Notes for CS-E4740 Federated Learning,”, available at https://github.com/alexjungaalto/FederatedLearning/blob/main/material/FL_LectureNotes.pdf, 2024.

CS-E4740 Federated Learning Course Assignments: https://github.com/alexjungaalto/FederatedLearning/tree/main/material/Assignments