In [None]:
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


In [None]:
# Construct a grid of histograms: one panel per column of `returns`.

n_grid_cols = 3
n_panels = len(returns.columns)
# Ceiling division: enough rows to hold every column (7 columns -> 3 rows).
n_grid_rows = max(1, -(-n_panels // n_grid_cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row,
# so the flattened view below always works.
fig, axs = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(15, 5 * n_grid_rows), squeeze=False
)
panels = axs.ravel()

for ax, col in zip(panels, returns.columns):
    ax.hist(returns[col].dropna(), bins=50, edgecolor='k')
    ax.set_title(f'Distribution of Returns for {col}')
    ax.set_xlabel('Returns')
    ax.set_ylabel('Frequency')

# For neat presentation, turn off whichever panels are left empty. They are
# derived from the column count instead of being hard-coded, so the cell keeps
# working when the dataset does not have exactly 7 columns.
for ax in panels[n_panels:]:
    ax.axis('off')

plt.tight_layout()

In [None]:
# Pooled distribution: every return observation from every column in one histogram.
# The bar colour stays close to the per-column grid so the two figures read as one series.

fig_all, ax_all = plt.subplots(figsize=(10, 6))
ax_all.hist(returns.values.flatten(), bins=50, edgecolor='k', color='steelblue')
ax_all.set_title('Distribution of Returns', fontsize=14)
ax_all.set_xlabel('Returns', fontsize=12)
ax_all.set_ylabel('Frequency', fontsize=12)
ax_all.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Optional additions. Printing the per-column moments next to the graph helps
# with the interpretation.
summary_stats = [
    ("Mean of Returns:\n", returns.mean()),
    ("\nStandard Deviation of Returns:\n", returns.std()),
    ("\nSkewness of Returns:\n", returns.skew()),
    ("\nKurtosis of Returns:\n", returns.kurtosis()),
]
for heading, values in summary_stats:
    print(heading, values)

In [None]:
# HRP uses correlation as the distance metric. First construct a correlation matrix,
# convert it into a distance matrix, then use scipy's linkage() to perform the clustering.
# Here I am using the average linkage method.

# Hierarchical clusters are usually visualised using dendrograms.
# I needed one to add to my research so I embedded it into the functions.

def HRP(returns_data, max_clusters=4, plot_dendrogram=True):
    """Cluster assets hierarchically from the correlation of their returns.

    Parameters
    ----------
    returns_data : pandas.DataFrame or 2-D array-like
        One column per asset, one row per period. Missing values are allowed:
        correlations are computed on pairwise-complete observations.
    max_clusters : int, default 4
        Upper bound on the number of flat clusters (``criterion='maxclust'``).
    plot_dendrogram : bool, default True
        If True, draw the dendrogram of the linkage with matplotlib.

    Returns
    -------
    numpy.ndarray
        Flat cluster label of each column, in column order.

    Raises
    ------
    ValueError
        If a pairwise correlation is undefined (for example a constant column
        or two columns with no overlapping observations).
    """
    frame = pd.DataFrame(returns_data)

    # Pairwise-complete Pearson correlation: a single NaN no longer wipes out a
    # whole row/column of the matrix the way np.corrcoef does.
    correlation_matrix = np.clip(frame.corr().to_numpy(dtype=float), -1.0, 1.0)
    if not np.isfinite(correlation_matrix).all():
        raise ValueError(
            "Correlation matrix contains undefined entries; check for constant "
            "columns or columns without overlapping observations."
        )

    distance_matrix = np.sqrt(0.5 * (1 - correlation_matrix))

    # NOTE(review): scipy's linkage() interprets a 2-D array as observation
    # vectors, so the tree is built on Euclidean distances between the rows of
    # distance_matrix. If clustering on the correlation distance itself is
    # intended, pass scipy.spatial.distance.squareform(distance_matrix) instead.
    # Behaviour is left unchanged here -- confirm which one is wanted.
    linkage_matrix = linkage(distance_matrix, method='average')

    if plot_dendrogram:
        plt.figure(figsize=(10, 7))
        # A plain list of labels works for DataFrame columns and for the default
        # integer column names of array input alike.
        dendrogram(linkage_matrix, labels=list(frame.columns), leaf_rotation=90)
        plt.title('Hierarchical Risk Parity Dendrogram')
        plt.xlabel('Indices')
        plt.ylabel('Distance')
        plt.show()

    clusters = fcluster(linkage_matrix, t=max_clusters, criterion='maxclust')

    return clusters

# Important to store the cluster labels.
clusters = HRP(returns, max_clusters=4)

# Optional. Helps with interpretation. This has to come after the call above:
# `clusters` does not exist before it on a fresh kernel.
print("Cluster Assignments:", clusters)

cluster_mapping = dict(zip(returns.columns, clusters))
print("Cluster Mapping:", cluster_mapping)

In [None]:
# I verified the clusters using a correlation matrix for my columns (i.e. returns data).
# There are many advanced ways to verify them. However, in my case, I picked the one that
# would instantly resonate with my audience and fit in with the overall direction of the research.

# Best presented in a heatmap.

# These statements live at top level (no enclosing block), so they must not be indented.
correlation_matrix = returns.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    fmt=".2f",
    linewidths=0.5,
    cbar_kws={"label": "Correlation"},
    square=True,
)
plt.title("Correlation Between Indexes", fontsize=14)
plt.show()

In [None]:
# I computed each cluster's annualised performance metrics and stored them in a dictionary for later use.


def cluster_performance(returns, cluster_mapping, ppy=12):
    """Summarise each cluster's annualised return and volatility.

    Parameters
    ----------
    returns : pandas.DataFrame
        Per-period returns, indexed by column with the keys of ``cluster_mapping``.
    cluster_mapping : dict
        Maps each column name to its cluster label.
    ppy : int, default 12
        Periods per year used for annualisation.

    Returns
    -------
    dict
        ``{label: {"Annualized Expected Return": ..., "Annualized Risk (Volatility)": ...}}``
        where the return compounds the average of the members' mean returns
        over ``ppy`` periods and the risk is the average of the members'
        standard deviations scaled by ``sqrt(ppy)``.
    """
    # Group the column names by label, keeping their order in the mapping.
    members_by_label = {}
    for asset_name, assigned_label in cluster_mapping.items():
        members_by_label.setdefault(assigned_label, []).append(asset_name)

    volatility_scale = ppy ** 0.5
    summary = {}

    for label in set(cluster_mapping.values()):
        member_returns = returns[members_by_label[label]]

        mean_period_return = member_returns.mean().mean()
        mean_period_volatility = member_returns.std().mean()

        summary[label] = {
            "Annualized Expected Return": (1 + mean_period_return) ** ppy - 1,
            "Annualized Risk (Volatility)": mean_period_volatility * volatility_scale,
        }

    return summary

In [None]:
# Keep the results under their own name: rebinding `cluster_performance` to the
# returned dict would shadow the function and make this cell fail when re-run.
cluster_metrics = cluster_performance(returns, cluster_mapping)
for cluster, performance in cluster_metrics.items():
    print(f"Cluster {cluster}:")
    print(f"  Annualized Expected Return: {performance['Annualized Expected Return']:.4%}")
    print(f"  Annualized Risk (Volatility): {performance['Annualized Risk (Volatility)']:.4%}")

# This approach to portfolio construction is very manual. The main reason is that I want to construct 3 portfolios with different constraints:
# one allocates equal weights, one aims for a maximum Sharpe ratio and one has minimum risk.
# My dataset has 7 columns and 175 rows. When dealing with larger datasets, it is best to use optimisation tools from the PyPortfolioOpt library.

# Expressed as a decimal fraction (2% per year), the same unit as the
# annualised returns above, which are formatted with `%`.
risk_free_rate = 0.02
expected_returns = np.array([c["Annualized Expected Return"] for c in cluster_metrics.values()])
risks = np.array([c["Annualized Risk (Volatility)"] for c in cluster_metrics.values()])

# One weight per cluster. `clusters` holds one label per asset, so its length
# is the number of assets, not the number of clusters.
n_clusters = len(expected_returns)

# Calculate Sharpe Ratios
sharpe_ratios = (expected_returns - risk_free_rate) / risks

# Portfolio risk below is sqrt(sum(w_i^2 * sigma_i^2)): it contains no
# covariance terms, i.e. clusters are treated as uncorrelated.

# Equal weights
equal_weights = np.full(n_clusters, 1 / n_clusters)
equal_return = np.dot(equal_weights, expected_returns)
equal_risk = np.sqrt(np.dot(equal_weights**2, risks**2))
equal_sharpe = (equal_return - risk_free_rate) / equal_risk

# Maximum Sharpe ratio
# Weights are proportional to each cluster's Sharpe ratio.
# NOTE(review): this is a heuristic rather than an optimiser; weights become
# unstable if the Sharpe ratios have mixed signs or sum to ~0 -- confirm the inputs.
max_sharpe_weights = sharpe_ratios / sharpe_ratios.sum()
max_sharpe_return = np.dot(max_sharpe_weights, expected_returns)
max_sharpe_risk = np.sqrt(np.dot(max_sharpe_weights**2, risks**2))
max_sharpe_sharpe = (max_sharpe_return - risk_free_rate) / max_sharpe_risk

# Minimum risk
# Inverse-volatility weights, normalised to sum to 1.
min_risk_weights = 1 / risks
min_risk_weights /= min_risk_weights.sum()
min_risk_return = np.dot(min_risk_weights, expected_returns)
min_risk_risk = np.sqrt(np.dot(min_risk_weights**2, risks**2))
min_risk_sharpe = (min_risk_return - risk_free_rate) / min_risk_risk



# Assemble in a df. Not the ideal approach for neat presentation, but served me well for my research.
# If you need a presentable table, you could format it and save to xlsx.

portfolios = pd.DataFrame({
    "Portfolio": ["Equal Weights", "Max Sharpe Ratio", "Min Risk"],
    "Weights": [equal_weights, max_sharpe_weights, min_risk_weights],
    "Expected Return": [equal_return, max_sharpe_return, min_risk_return],
    "Risk (Volatility)": [equal_risk, max_sharpe_risk, min_risk_risk],
    "Sharpe Ratio": [equal_sharpe, max_sharpe_sharpe, min_risk_sharpe],
})

portfolios