# Ranking correlation coefficients

Let's start by defining an initial `DataFrame` with some made up data:

In [95]:
import pandas as pd
from pandas import DataFrame

# Toy data set, written column by column: five observations of four numeric
# features. Each list position corresponds to one row of the resulting frame.
df_initial: DataFrame = pd.DataFrame(
    {
        "price": [1, 2, 3, 4, 5],
        "quantity": [7, 20, 34, 39, 52],
        "rating": [75, 23, 11, 63, 77],
        "weight": [504, 220, 360, 250, 95],
    }
)

Get the matrix of correlation coefficients for each feature-pair:

In [96]:
# Pairwise Pearson correlation (pandas' default method, spelled out here)
# between every pair of numeric columns.
correlations: DataFrame = df_initial.corr(method="pearson", numeric_only=True)

print(correlations)

             price  quantity    rating    weight
price     1.000000  0.991237  0.226527 -0.807906
quantity  0.991237  1.000000  0.117234 -0.800305
rating    0.226527  0.117234  1.000000 -0.067521
weight   -0.807906 -0.800305 -0.067521  1.000000


Notice that the matrix diagonal is all `1`s. Also notice that the values above and below the diagonal mirror each other: the matrix is symmetric, because *Feature A's* correlation with *Feature B* is equal to *Feature B's* correlation with *Feature A*.

Knowing that *Feature A* is perfectly correlated with itself isn't particularly interesting to us, so we want to transform our coefficients matrix so that only the relevant coefficients (without duplicates) have nonzero values.

We do this by using the NumPy function `np.triu` to set all values below the diagonal to zero (note that `np.triu` returns a NumPy `ndarray`, not a Pandas `DataFrame`, so we need to construct a new `DataFrame` that reuses the original row and column labels):

In [97]:
import numpy as np

# np.triu keeps the diagonal and everything above it and zeroes the rest.
# It hands back a bare ndarray, so the row/column labels are re-attached here.
correlations_upper: DataFrame = pd.DataFrame(
    data=np.triu(correlations.to_numpy()),
    index=correlations.index,
    columns=correlations.columns,
)

print(correlations_upper)

          price  quantity    rating    weight
price       1.0  0.991237  0.226527 -0.807906
quantity    0.0  1.000000  0.117234 -0.800305
rating      0.0  0.000000  1.000000 -0.067521
weight      0.0  0.000000  0.000000  1.000000


We can then use another NumPy method to set all diagonal values to zero:

In [98]:
# np.fill_diagonal mutates its argument in place. Writing through
# `DataFrame.values` only works when pandas exposes a writable view of its
# internal data; with Copy-on-Write enabled that array is read-only and the
# call raises. Filling an explicit copy and rebuilding the frame works
# regardless of that setting.
upper_values = correlations_upper.to_numpy(copy=True)
np.fill_diagonal(upper_values, 0)
correlations_upper = pd.DataFrame(
    upper_values,
    index=correlations_upper.index,
    columns=correlations_upper.columns,
)

print(correlations_upper)

          price  quantity    rating    weight
price       0.0  0.991237  0.226527 -0.807906
quantity    0.0  0.000000  0.117234 -0.800305
rating      0.0  0.000000  0.000000 -0.067521
weight      0.0  0.000000  0.000000  0.000000


To rank our coefficients, we care about their magnitude rather than their sign, so we take the absolute value of each element in our `DataFrame`:

In [99]:
# Element-wise absolute value: a strong negative correlation should rank as
# highly as an equally strong positive one.
correlations_positive: DataFrame = correlations_upper.abs()

print(correlations_positive)

          price  quantity    rating    weight
price       0.0  0.991237  0.226527  0.807906
quantity    0.0  0.000000  0.117234  0.800305
rating      0.0  0.000000  0.000000  0.067521
weight      0.0  0.000000  0.000000  0.000000


Now, we can unstack our `DataFrame` so that all the cell values are contained within a single `Series` (and are easily sortable):

In [100]:
from pandas import Series

# unstack() moves the column labels into the index, giving a Series keyed by
# a (column label, row label) MultiIndex with one entry per matrix cell.
correlations_unstacked: Series = correlations_positive.unstack()

print(correlations_unstacked)

price     price       0.000000
          quantity    0.000000
          rating      0.000000
          weight      0.000000
quantity  price       0.991237
          quantity    0.000000
          rating      0.000000
          weight      0.000000
rating    price       0.226527
          quantity    0.117234
          rating      0.000000
          weight      0.000000
weight    price       0.807906
          quantity    0.800305
          rating      0.067521
          weight      0.000000
dtype: float64


We can now sort the feature-pairs by correlation coefficient:

In [101]:
# Descending order puts the strongest relationships first; the zeroed-out
# diagonal and lower-triangle cells end up at the bottom.
correlations_sorted: Series = correlations_unstacked.sort_values(ascending=False)

print(correlations_sorted)

quantity  price       0.991237
weight    price       0.807906
          quantity    0.800305
rating    price       0.226527
          quantity    0.117234
weight    rating      0.067521
price     price       0.000000
          quantity    0.000000
          rating      0.000000
          weight      0.000000
quantity  quantity    0.000000
          rating      0.000000
          weight      0.000000
rating    rating      0.000000
          weight      0.000000
weight    weight      0.000000
dtype: float64


Wrapping these steps in a function that takes the number of pairs to return (`n`) and a minimum absolute coefficient (`threshold`), we have:

In [102]:
import numpy as np


def get_n_correlation_pairs_above_threshold(
    df: DataFrame, n: int, threshold: float
) -> pd.Series:
    """Return up to ``n`` feature pairs whose absolute correlation exceeds ``threshold``.

    Each unordered pair of numeric columns is considered once (self-pairs and
    mirrored duplicates are zeroed out before ranking).

    Args:
        df: Frame whose numeric columns are correlated against each other.
        n: Maximum number of pairs to return.
        threshold: Only pairs whose absolute coefficient is strictly greater
            than this value are returned. A negative threshold also lets the
            zeroed-out diagonal and lower-triangle entries through.

    Returns:
        A Series of absolute correlation coefficients sorted in descending
        order, indexed by a (column label, row label) MultiIndex. It holds
        fewer than ``n`` entries when fewer pairs exceed the threshold.
    """
    corrs = df.corr(numeric_only=True)

    # k=1 keeps only the cells strictly above the diagonal, so the diagonal and
    # the mirrored lower triangle are zeroed in one step. np.triu returns a new
    # array, which avoids writing in place through `DataFrame.values` (that
    # array is read-only when pandas Copy-on-Write is enabled).
    corrs_upper = pd.DataFrame(
        np.triu(corrs.to_numpy(), k=1), index=corrs.index, columns=corrs.columns
    )

    # Rank by magnitude so strong negative correlations count too.
    corrs_sorted = corrs_upper.abs().unstack().sort_values(ascending=False)

    # Boolean indexing drops the entries at or below the threshold.
    # `Series.where` would keep them as NaN rows, padding the result whenever
    # fewer than n pairs qualify.
    return corrs_sorted[corrs_sorted > threshold].iloc[:n]


# Demo: request the two strongest pairs, using 0.75 as the cut-off for the
# absolute correlation coefficient.
print(get_n_correlation_pairs_above_threshold(df=df_initial, n=2, threshold=0.75))

quantity  price    0.991237
weight    price    0.807906
dtype: float64
