In [1]:
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
import scipy.stats as stats
from sklearn import cluster, metrics
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist
from sklearn.metrics import silhouette_samples, silhouette_score

In [3]:
file_location = 'C:\\fx_2017_19.csv'

In [4]:
f = pd.read_csv(file_location)
df = pd.DataFrame(f)

FileNotFoundError: [Errno 2] File C:\fx_2017_19.csv does not exist: 'C:\\fx_2017_19.csv'

In [None]:
df.info()

In [None]:
df['Date'] = pd.to_datetime(df.Date).dt.to_period('D')

In [None]:
df.set_index('Date', inplace=True)
df.index = df.index.to_timestamp()

In [None]:
df.columns = ['usd', 'msci', 'oil', 'cny', 'twd', 'krw', 'sgd', 'myr', 'idr', 'thb', 'php', 'inr', 'rub', 
                'pln', 'huf', 'ron', 'try', 'brl', 'mxn', 'clp', 'cop', 'zar', 'ils', 'eur', 'gbp',
                'jpy', 'aud', 'nzd', 'cad', 'chf', 'nok', 'sek', 'gold', 'silver']

In [None]:
# Forward-fill gaps with the last observed value.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas; rebinding instead of inplace=True keeps the cell
# free of in-place mutation.
df = df.ffill()

In [None]:
df.head(5)

# Feature Engineering

#### All the exchange rates are rendered in USD/XXX terms, so they are all expressed as 1 US dollar equivalents. 
#### The broad USD index used is the Fed nominal broad trade-weighted US dollar exchange rate, and the crude oil price is the Brent crude price.

In [None]:
# Generate the equivalent 1 US$ values for gold and silver
df['au$'] = 1/df.gold
df['ag$'] = 1/df.silver

In [None]:
df.drop(['gold', 'silver'], axis=1, inplace=True)

In [None]:
df.head(5)

#### Dealing with financial market returns correlations, it is preferable to use the log returns, but the CAPM beta is calculated on the simple percentage returns. 

#### Furthermore, it is preferable to standardise the data for the clustering algorithms.

#### Thus, the order of data transformation will be to obtain the log returns and calculate the correlation coefficients, then get the simple returns and calculate the betas, and finally standardise the data. 

## Calculate cross-asset correlations

#### Correlations are a linear measure, so we use the log returns to reduce the likely distributional skew in FX returns, though FX returns do tend to be less skewed than equity returns. We check this further below.

In [None]:
df_logret = np.log(df/df.shift())

In [None]:
df_logret

In [None]:
# Delete the first row of null values
# (row 0 has no prior observation, so df/df.shift() yields NaN there)
df_logret = df_logret.iloc[1:]

In [None]:
correlations = df_logret.corr()

In [None]:
usd_correlations = correlations['usd'].iloc[3:]

In [None]:
# Array of USD correlations
usd_correlations

In [None]:
equity_correlations = correlations['msci'].iloc[3:]

In [None]:
# Array of World equity correlations
equity_correlations

## Calculate betas

#### The CAPM beta is calculated on simple percentage returns.

In [None]:
# Generate simple percentage returns to calculate the respective betas
df_returns = df.pct_change()

In [None]:
df_returns = df_returns.iloc[1:]

In [None]:
df_returns.head(5)

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False, figsize=(18, 6))
sns.distplot(df_logret['ag$'], bins=20, kde=True, ax=ax1).set_title('USD/Silver log returns histogram')
sns.distplot(df_returns['ag$'], bins=20, kde=True, ax=ax2).set_title('USD/Silver simple returns histogram')
plt.show()

#### Histograms above confirm not much difference between the simple returns distribution compared to the log returns distribution, though this was in a low vol period.

In [None]:
# Regression slope (beta) of every other column against the first column
def beta(df):
    """Return the OLS slope of each column on the first column of ``df``.

    The first column is treated as the market series. Every remaining column
    is regressed on it (with an intercept) in one matrix solve.

    Parameters
    ----------
    df : pandas.DataFrame
        Returns table; column 0 is the regressor, columns 1.. are the
        dependent series.

    Returns
    -------
    pandas.Series
        Slope coefficients named 'beta', indexed by ``df.columns[1:]``.
    """
    values = df.values
    market = values[:, [0]]  # keep 2-D so it can be stacked as a column
    # design matrix: [1, market]
    design = np.hstack([np.ones_like(market), market])
    # normal equations solved with a pseudo-inverse: (X'X)^+ X' Y
    gram_pinv = np.linalg.pinv(design.T @ design)
    coefs = gram_pinv @ design.T @ values[:, 1:]
    # row 0 holds the intercepts, row 1 the slopes
    return pd.Series(coefs[1], df.columns[1:], name='beta')

In [None]:
# Calculate the betas to US dollar
usd_betas = beta(df_returns)

In [None]:
usd_betas

In [None]:
del usd_betas['msci']
del usd_betas['oil']

In [None]:
usd_betas

## Adding implied volatility

#### The values are 25-delta 3-month implied volatility

In [None]:
v = pd.read_csv('C:\\vol_2017_19.csv')

In [None]:
df_v = pd.DataFrame(v)

In [None]:
df_v['Date'] = pd.to_datetime(df_v.Date).dt.to_period('D')

In [None]:
df_v.set_index('Date', inplace=True)
df_v.index = df_v.index.to_timestamp()

In [None]:
df_v.columns = ['cny', 'twd', 'krw', 'sgd', 'myr', 'idr', 'thb', 'php', 'inr', 'rub', 
                'pln', 'huf', 'ron', 'try', 'brl', 'mxn', 'clp', 'cop', 'zar', 'ils', 'eur', 'gbp',
                'jpy', 'aud', 'nzd', 'cad', 'chf', 'nok', 'sek', 'au$', 'ag$']

In [None]:
# Forward-fill all NaN values with the last observed value.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas; rebinding avoids in-place mutation.
df_v = df_v.ffill()
df_v.head(5)

In [None]:
vol_list = pd.DataFrame(df_v.mean(), columns=['vol'])

In [None]:
vol_list

## Consolidating all three features together

In [None]:
X = pd.DataFrame(data=[usd_betas, equity_correlations]).T

In [None]:
X = vol_list.merge(X, left_index=True, right_index=True, how='inner')

In [None]:
X.columns = ["vol", "usd_beta", "equity_corr"]

In [None]:
X = X[["usd_beta", "equity_corr", 'vol']]

# EDA

In [None]:
X

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Compute the feature correlation matrix once and reuse it for mask and plot
feature_corr = X.corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(feature_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(feature_corr, mask=mask, annot=True, cmap='coolwarm', ax=ax)
# Tick positions and limits must follow the plotted matrix (X), not the raw
# price frame `df`, which has a different number of columns.
# NOTE(review): the explicit ticks/ylim look like the usual workaround for
# matplotlib versions that clip heatmap rows - confirm it is still needed.
ax.set_yticks(np.arange(0, X.shape[1])+0.1)
ax.set_ylim([X.shape[1], 0])
plt.show()

In [None]:
# Standardising the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_scaled

In [None]:
# Transformed the arrays of scaled values into a DataFrame
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [None]:
g=sns.pairplot(X)
g.fig.set_size_inches(12,10)
plt.show()

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(projection="3d")
ax.set_title('3D Scatterplot of FX Objects, Jan 2017 through Dec 2019', fontsize=14)

origin = [0,0,0]
# Use each coordinate of `origin` (x, y, z) rather than repeating origin[0],
# so the label stays correct if the origin is ever moved off (0, 0, 0).
ax.text(origin[0],origin[1],origin[2],"origin",size=15, color='navy')

x_points = X_scaled.usd_beta
y_points = X_scaled.equity_corr
z_points = X_scaled.vol
ax.scatter3D(x_points, y_points, z_points, s=150, color='maroon')

ax.set_xlim(-2, 2.5)
ax.set_ylim(-1.5, 3.5)
ax.set_zlim(-1.5, 3)

ax.set_xlabel('USD beta',labelpad=10,fontsize='large')
ax.set_ylabel('Equity correlation',labelpad=10,fontsize='large')
ax.set_zlabel('Implied volatility',labelpad=10,fontsize='large')

plt.show()

#### Refer to the "medium_2019_interactive" notebook for the interactive 3D charts

# Hierarchical Agglomerative Clustering

In [None]:
hier_comp = linkage(X_scaled, method='complete', metric='euclidean')

In [None]:
hier_average = linkage(X_scaled, method='average', metric='euclidean')

In [None]:
hier_ward = linkage(X_scaled, method='ward', metric='euclidean')

In [None]:
# Change the chart style...
plt.style.use('fivethirtyeight')

In [None]:
plt.figure(figsize=(12, 10))
plt.title('Dendrogram of FX Clusters, Jan 2017 through Dec 2019 (Complete)', fontsize=14)
plt.xlabel('Distance', fontsize=10)
plt.ylabel('Currency', fontsize=10)
dendrogram(
    hier_comp,
    orientation='right',
    #     leaf_rotation=90.,
    leaf_font_size=20,
    labels=X.index.values,
    color_threshold=3
)
plt.yticks(fontsize=11)
plt.show()

In [None]:
plt.figure(figsize=(12, 10))
plt.title('Dendrogram of FX Clusters, Jan 2017 through Dec 2019 (Average)', fontsize=14)
plt.xlabel('Distance', fontsize=10)
plt.ylabel('Currency', fontsize=10)
dendrogram(
    hier_average,
    orientation='right',
    #     leaf_rotation=90.,
    leaf_font_size=20,
    labels=X.index.values,
    color_threshold=2.1
)
plt.yticks(fontsize=11)
plt.show()

In [None]:
plt.figure(figsize=(12, 10))
plt.title('Dendrogram of FX Clusters, Jan 2017 through Dec 2019 (Ward)', fontsize=14)
plt.xlabel('Distance', fontsize=10)
plt.ylabel('Currency', fontsize=10)
dendrogram(
    hier_ward,
    orientation='right',
    #     leaf_rotation=90.,
    leaf_font_size=20,
    labels=X.index.values,
    color_threshold=5
)
plt.yticks(fontsize=11)
plt.show()

#### The various linkages - Complete, Average & Ward - all result in the same final 4 clusters, so there is no need to go into inter-cluster comparisons with the cophenetic correlations. In any event, the Complete linkage has the best scores among the three, so let's just use that as the final model.

In [None]:
# Cophenetic correlation for Complete linkage.
# The reference distances must come from the same data the linkage was built
# on (X_scaled); comparing against pdist(X) on the unscaled features would
# measure the tree against a different distance matrix.
hier_comp = linkage(X_scaled, method='complete', metric='euclidean')
c, coph_dists = cophenet(hier_comp, pdist(X_scaled, metric='euclidean'))
c

In [None]:
# Cophenetic correlation for Average linkage.
# The reference distances must come from the same data the linkage was built
# on (X_scaled); comparing against pdist(X) on the unscaled features would
# measure the tree against a different distance matrix.
hier_average = linkage(X_scaled, method='average', metric='euclidean')
c, coph_dists = cophenet(hier_average, pdist(X_scaled, metric='euclidean'))
c

In [None]:
# Cophenetic correlation for Ward linkage.
# The reference distances must come from the same data the linkage was built
# on (X_scaled); comparing against pdist(X) on the unscaled features would
# measure the tree against a different distance matrix.
hier_ward = linkage(X_scaled, method='ward', metric='euclidean')
c, coph_dists = cophenet(hier_ward, pdist(X_scaled, metric='euclidean'))
c

#### The dendrogram (Complete linkage) above shows us how the bottom-up agglomerative algorithm groups various currencies together during calendar years 2017 through 2019.

#### Four clusters:
#### Cluster 0 - NOK, CLP, AUD, SEK, NZD, HUF, PLN, EUR, RON, GBP, COP, MXN, BRL, RUB
#### Cluster 1 - silver, TRY, ZAR
#### Cluster 2 - gold, CHF, JPY
#### Cluster 3 - INR, THB, PHP, MYR, TWD, ILS, CNY, SGD, IDR, KRW, CAD

#### The closest thing to a geographically distinct cluster is Cluster 3, which is composed mainly of Asian currencies.

# Cluster Evaluation

In [None]:
# Looks like k=4 gives the best silhouette score
# The distance metric is left at the estimator's default (euclidean): the
# `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# so omitting it keeps the cell working on both old and new releases.
for k in range(2, 6):
    model = AgglomerativeClustering(n_clusters=k, linkage='complete')
    q = model.fit_predict(X_scaled)
    s = silhouette_score(X_scaled, q, metric='euclidean')
    print('{} number of clusters has Silhouette score of {:0.6f}'.format(k, s))

In [None]:
# Same when we use Average linkage
# The distance metric is left at the estimator's default (euclidean): the
# `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# so omitting it keeps the cell working on both old and new releases.
for k in range(2, 6):
    model = AgglomerativeClustering(n_clusters=k, linkage='average')
    q = model.fit_predict(X_scaled)
    s = silhouette_score(X_scaled, q, metric='euclidean')
    print('{} number of clusters has Silhouette score of {:0.6f}'.format(k, s))

In [None]:
# Slightly different scores for the Ward linkage, due to the slight difference in results
# The distance metric is left at the estimator's default (euclidean): the
# `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# so omitting it keeps the cell working on both old and new releases.
for k in range(2, 6):
    model = AgglomerativeClustering(n_clusters=k, linkage='ward')
    q = model.fit_predict(X_scaled)
    s = silhouette_score(X_scaled, q, metric='euclidean')
    print('{} number of clusters has Silhouette score of {:0.6f}'.format(k, s))

In [None]:
# So we settle on the AH algorithm with Complete linkage and 4 clusters
# The distance metric is left at the estimator's default (euclidean): the
# `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
cluster_comp = AgglomerativeClustering(n_clusters=4, linkage='complete')

In [None]:
X['clusters'] = cluster_comp.fit_predict(X_scaled)
X

In [None]:
g=sns.pairplot(vars=[col for col in X if col!='clusters'], data=X, hue='clusters')
g.fig.set_size_inches(12,9)
plt.show()

#### The charts above make clear the distinguishing features of the mainly Asian FX cluster (Cluster 3 - green), which includes ILS and CAD. It has low USD beta and low implied volatility, but a negative equity correlation.

#### This may be contrasted with the "safe haven cluster" (Cluster 2 - yellow), which has a largely similarly low volatility and low USD beta, but is positively correlated to equities (the only cluster to be thus).

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(projection="3d")
ax.set_title('3D Scatterplot of FX Clusters, Jan 2017 through Dec 2019', fontsize=14)

origin = [0,0,0]
# Use each coordinate of `origin` (x, y, z) rather than repeating origin[0],
# so the label stays correct if the origin is ever moved off (0, 0, 0).
ax.text(origin[0],origin[1],origin[2],"origin",size=15, color='red')

x_points = X_scaled.usd_beta
y_points = X_scaled.equity_corr
z_points = X_scaled.vol
ax.scatter3D(x_points, y_points, z_points, s=150, c=X.clusters, cmap="viridis")

ax.set_xlim(-2, 2.5)
ax.set_ylim(-1.5, 3.5)
ax.set_zlim(-1.5, 3)

ax.set_xlabel('USD beta',labelpad=10,fontsize='large')
ax.set_ylabel('Equity correlation',labelpad=10,fontsize='large')
ax.set_zlabel('Implied volatility',labelpad=10,fontsize='large')

plt.show()

## Distributional characteristics of each cluster

### Let's evaluate the clusters through boxplots and two sample t-tests

In [None]:
X.groupby('clusters').agg(['mean', 'std', 'count'])

In [None]:
# Box-plots for 'usd_beta'
plt.figure(figsize=(12,4))
sns.boxplot(x='usd_beta', y='clusters', data=X, orient='h');

In [None]:
# Box-plots for 'equity_corr'
# Cluster 3 is statistically distinct from the others
plt.figure(figsize=(12,4))
sns.boxplot(x='equity_corr', y='clusters', data=X, orient='h');

In [None]:
# Box-plots for 'vol'
# Cluster 0, cluster 1 and clusters 2-3 are statistically different from each other
plt.figure(figsize=(12,4))
sns.boxplot(x='vol', y='clusters', data=X, orient='h');

In [None]:
# Run unequal-variance t-tests on every feature column of X for two chosen clusters
def cluster_test(i, j):
    """Compare clusters ``i`` and ``j`` feature by feature.

    Reads the notebook-level DataFrame ``X``. For every column other than
    'clusters', the rows labelled ``i`` are tested against the rows labelled
    ``j`` with ``stats.ttest_ind(..., equal_var=False)``.

    Parameters
    ----------
    i, j : cluster labels as stored in ``X.clusters``.

    Returns
    -------
    pandas.DataFrame
        One row per feature, with columns 't-statistic' and 'p-value'.
        A header line naming the two clusters is printed before returning.
    """
    # Select each cluster's rows once; rows containing NaN are discarded
    members_i = X[X.clusters == i].dropna()
    members_j = X[X.clusters == j].dropna()

    outcome = {
        feature: stats.ttest_ind(members_i[feature], members_j[feature], equal_var=False)
        for feature in X.columns
        if feature != 'clusters'
    }

    table = pd.DataFrame.from_dict(outcome, orient='Index')
    table.columns = ['t-statistic','p-value']
    print('Cluster {} versus Cluster {} t-test results:'.format(i, j))
    return table


In [None]:
# Cluster 0 is statistically distinct from Cluster 1 in usd_beta and vol
cluster_test(0, 1)

In [None]:
# Cluster 0 is statistically distinct from Cluster 2 in usd_beta and equity_corr
cluster_test(0, 2)

In [None]:
# Cluster 0 is statistically distinct from Cluster 3 in usd_beta and vol
cluster_test(0, 3)

In [None]:
# Cluster 1 is statistically distinct from Cluster 2 in all three features
cluster_test(1, 2)

In [None]:
# Cluster 1 is statistically distinct from Cluster 3 in usd_beta and vol
cluster_test(1, 3)

In [None]:
# Cluster 2 is statistically distinct from Cluster 3 in all three features
cluster_test(2, 3)

#### The findings indicate that Cluster 1 (TRY, ZAR & silver) is distinguished by extremely high usd_beta and vol values. 
#### Cluster 2 (JPY, CHF & gold) is distinguished by positive equity_corr values. 
#### Cluster 3 (mostly Asian FX) tends to have the lowest usd_beta and vol values, though with some marginal overlap on these attributes with Cluster 2. 
#### Cluster 0 is the "inbetweener" cluster, particularly on usd_beta and vol, with usd_beta values generally higher than 1.0, unlike Clusters 2 and 3.

#### The results show that geography is of little use in segmenting global currencies in terms of their behavior to common financial market factors, except perhaps for non-JPY Asian currencies. The USD beta offered the best differentiating factor across the four clusters, followed by implied volatility, and lastly by equity market correlation. However, one cluster ("safe haven cluster") was distinguished by having the only positive equity correlation values.