In [47]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd

### Computing Spearman's rank correlation from scratch vs. directly with `scipy.stats.spearmanr`

In [45]:
# Draw two samples of size n: data1 from np.random.randn as-is, data2 from a
# second randn draw that is scaled by 0.2371 and shifted by 0.2551.
n = 12
data1 = np.random.randn(n)
data2 = 0.2551 + np.random.randn(n) * 0.2371

# Replace every observation by its rank within its own sample.
data1_rank, data2_rank = (stats.rankdata(sample) for sample in (data1, data2))

# "From scratch": Pearson correlation applied to the ranks.
pcorr = stats.pearsonr(data1_rank, data2_rank)[0]
# "Directly": let SciPy compute the Spearman correlation on the raw values.
scorr = stats.spearmanr(data1, data2)[0]

# The two numbers should agree up to floating-point rounding.
print(pcorr, scorr, sep="\n")

0.020979020979020963
0.02097902097902098


In [67]:
# Anscombe data: one observation per row; columns alternate x, y for
# series 1 through 4.
anscombe = np.array([
     # series 1     series 2      series 3       series 4
    [10,  8.04,    10,  9.14,    10,  7.46,      8,  6.58, ],
    [ 8,  6.95,     8,  8.14,     8,  6.77,      8,  5.76, ],
    [13,  7.58,    13,  8.76,    13, 12.74,      8,  7.71, ],
    [ 9,  8.81,     9,  8.77,     9,  7.11,      8,  8.84, ],
    [11,  8.33,    11,  9.26,    11,  7.81,      8,  8.47, ],
    [14,  9.96,    14,  8.10,    14,  8.84,      8,  7.04, ],
    [ 6,  7.24,     6,  6.13,     6,  6.08,      8,  5.25, ],
    [ 4,  4.26,     4,  3.10,     4,  5.39,      8,  5.56, ],
    [12, 10.84,    12,  9.13,    12,  8.15,      8,  7.91, ],
    [ 7,  4.82,     7,  7.26,     7,  6.42,      8,  6.89, ],
    [ 5,  5.68,     5,  4.74,     5,  5.73,     19, 12.50, ]
    ])

df = pd.DataFrame(anscombe, columns=('S1X','S1Y','S2X','S2Y','S3X','S3Y','S4X','S4Y'))

# Rank the series 2 values (pandas' default rank method averages ties).
df['S2XRank'] = df['S2X'].rank()
df['S2YRank'] = df['S2Y'].rank()

# Work on a copy holding only the two rank columns.
cor_data = df[['S2XRank','S2YRank']].copy()

# Manual route: the (default, Pearson) correlation of the rank columns is the
# Spearman correlation of the underlying values.
print(cor_data)
display('Calculated Spearman Correlation ', cor_data.corr().iloc[0,1])

# Library route: hand SciPy the RAW series 2 values so that it performs its
# own ranking. Feeding it the pre-ranked columns would give the same number
# but would not independently check the ranking done above.
corr_s = stats.spearmanr(df['S2X'], df['S2Y'])[0]

display('Spearman Function ', corr_s)

    S2XRank  S2YRank
0       7.0     10.0
1       5.0      6.0
2      10.0      7.0
3       6.0      8.0
4       8.0     11.0
5      11.0      5.0
6       3.0      3.0
7       1.0      1.0
8       9.0      9.0
9       4.0      4.0
10      2.0      2.0


'Calculated Spearman Correlation '

0.6909090909090908

'Spearman Function '

0.690909090909091

In [155]:
# Anscombe data: one observation per row; columns alternate x, y for
# series 1 through 4.
anscombe = np.array([
     # series 1     series 2      series 3       series 4
    [10,  8.04,    10,  9.14,    10,  7.46,      8,  6.58, ],
    [ 8,  6.95,     8,  8.14,     8,  6.77,      8,  5.76, ],
    [13,  7.58,    13,  8.76,    13, 12.74,      8,  7.71, ],
    [ 9,  8.81,     9,  8.77,     9,  7.11,      8,  8.84, ],
    [11,  8.33,    11,  9.26,    11,  7.81,      8,  8.47, ],
    [14,  9.96,    14,  8.10,    14,  8.84,      8,  7.04, ],
    [ 6,  7.24,     6,  6.13,     6,  6.08,      8,  5.25, ],
    [ 4,  4.26,     4,  3.10,     4,  5.39,      8,  5.56, ],
    [12, 10.84,    12,  9.13,    12,  8.15,      8,  7.91, ],
    [ 7,  4.82,     7,  7.26,     7,  6.42,      8,  6.89, ],
    [ 5,  5.68,     5,  4.74,     5,  5.73,     19, 12.50, ]
    ])

# Each series occupies two adjacent columns, so derive the series count from
# the array shape once instead of hard-coding it in one loop and recomputing
# it in another.
n_series = anscombe.shape[1] // 2

corrs_p = []    # Pearson r per series (SciPy, raw values)
corrs_s = []    # Spearman r per series (SciPy, raw values)
anscorr_s = []  # Spearman r per series, computed by hand from the ranks
print(np.shape(anscombe))
for i in range(n_series):
    x_raw = anscombe[:, 2*i]
    y_raw = anscombe[:, 2*i + 1]

    # Library results on the raw values.
    corrs_p.append(stats.pearsonr(x_raw, y_raw)[0])
    corrs_s.append(stats.spearmanr(x_raw, y_raw)[0])

    # From scratch: rank both variables, then apply the Pearson formula
    # sum(dx*dy) / sqrt(sum(dx^2) * sum(dy^2)) to the centred ranks.
    x = stats.rankdata(x_raw)
    y = stats.rankdata(y_raw)
    x_dev = x - np.mean(x)
    y_dev = y - np.mean(y)
    corrs = np.sum(x_dev * y_dev) / np.sqrt(np.sum(x_dev**2) * np.sum(y_dev**2))
    anscorr_s.append(corrs)

# The hand-computed and SciPy Spearman values should match per series.
print(anscorr_s)
print(corrs_s)

(11, 8)
[0.8181818181818182, 0.6909090909090909, 0.990909090909091, 0.5]
[0.8181818181818182, 0.690909090909091, 0.990909090909091, 0.5]
