## Python Point-Biserial Correlation

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pointbiserialr

In [2]:
# Dichotomous (0/1) variable, one entry per observation.
binary_var = np.array(
    [1, 0, 1, 1, 0, 1, 0, 1, 0, 0]
)

In [3]:
# Continuous measurement, one entry per observation.
continuous_var = np.array(
    [85, 70, 90, 95, 60, 88, 55, 93, 58, 65]
)

In [4]:
# Mean of the continuous values for observations where the binary flag is 1.
mean_1 = continuous_var[binary_var == 1].mean()

In [5]:
# Mean of the continuous values for observations where the binary flag is 0.
mean_0 = continuous_var[binary_var == 0].mean()

In [6]:
# Sample standard deviation (n - 1 divisor) over all continuous values.
std_x = continuous_var.std(ddof=1)

In [10]:
# Number of observations whose binary flag is 1.
n_1 = (binary_var == 1).sum()

In [11]:
# Number of observations whose binary flag is 0.
n_0 = (binary_var == 0).sum()

In [12]:
# Total number of observations (length of the 1-D binary array).
n = binary_var.shape[0]

In [13]:
# Point-biserial correlation computed by hand:
#   r_pb = (M1 - M0) / s * sqrt(n1 * n0 / (n * (n - 1)))
# where s is the sample standard deviation (ddof=1) of the continuous variable.
r_pb = (
    (mean_1 - mean_0) / std_x
    * np.sqrt((n_1 * n_0) / (n * (n - 1)))
)

In [14]:
# Show the manually computed coefficient.
print(r_pb)

0.9535664558521106


In [15]:
# Example 2: SciPy (scipy.stats.pointbiserialr)

In [16]:
# pointbiserialr returns two values, unpacked here as the correlation
# coefficient and its p-value; only the coefficient is printed below.
r_pb_scipy, p_value = pointbiserialr(binary_var, continuous_var)

In [17]:
# Show the coefficient computed by SciPy.
print(r_pb_scipy)

0.9535664558521106


In [18]:
# Example 3: pandas (Series.corr)

In [19]:
# Column-oriented records: a 0/1 flag and a numeric value per row.
data = {
    'completed_race': [
        1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
    ],
    'miles_per_week': [
        85, 70, 90, 95, 60, 88, 55, 93, 58, 65,
    ],
}

In [20]:
# Build a two-column DataFrame from the dict of lists (keys become column names).
df = pd.DataFrame(data=data)

In [21]:
# Display the full frame (10 rows) to confirm the columns line up.
df

Unnamed: 0,completed_race,miles_per_week
0,1,85
1,0,70
2,1,90
3,1,95
4,0,60
5,1,88
6,0,55
7,1,93
8,0,58
9,0,65


In [22]:
# Pearson correlation between the 0/1 column and the numeric column.
# 'pearson' is Series.corr's default method; it is spelled out for clarity.
r_pb_pandas = df['completed_race'].corr(
    df['miles_per_week'],
    method='pearson',
)

In [23]:
# Show the coefficient computed by pandas.
print(r_pb_pandas)

0.9535664558521105
