In [31]:
# Spring 2018 - Adam Ross Nelson - Modified/executed in Jupyter.
# Fall   2017 - Adam Ross Nelson - Modified/executed in python.
# Fall   2015 - Adam Ross Nelson - Originally executed in Stata.
#             - See: https://github.com/adamrossnelson/crossreg/blob/master/corrhandStata.do
# Maintained at: https://github.com/adamrossnelson/crossreg

# This notebook calculates correlation without using -df.corr-
# Can be used to assist when learning how to calculate by hand.
# Or, useful when double checking hand work.

In [32]:
import pandas as pd
import math as math
import requests

# Toy data set: eight paired observations, keyed by variable name (y first).
thedata = dict(
    y=[3, 7, 6, 6, 6, 3, 3, 6],
    x=[2, 10, 9, 5, 8, 2, 4, 8],
)

# One column per key in -thedata-; the bare name on the last line displays it.
df = pd.DataFrame(data=thedata)
df

Unnamed: 0,x,y
0,2,3
1,10,7
2,9,6
3,5,6
4,8,6
5,2,3
6,4,3
7,8,6


In [33]:
# Deviation of every observation from its own column mean:
#   y_minus_ybar = Ysubi minus Ybar
#   x_minus_xbar = Xsubi minus Xbar
# The two columns are appended one after the other (y first, then x).
df = (
    df
    .assign(y_minus_ybar=lambda frame: frame['y'] - frame['y'].mean())
    .assign(x_minus_xbar=lambda frame: frame['x'] - frame['x'].mean())
)

df

Unnamed: 0,x,y,y_minus_ybar,x_minus_xbar
0,2,3,-2.0,-4.0
1,10,7,2.0,4.0
2,9,6,1.0,3.0
3,5,6,1.0,-1.0
4,8,6,1.0,2.0
5,2,3,-2.0,-4.0
6,4,3,-2.0,-2.0
7,8,6,1.0,2.0


In [42]:
# Numerator of the correlation: multiply each row's y-deviation by its
# x-deviation, then total the products.
# NOTE: the total is the raw sum of cross-products; nothing here divides it
# by n or n - 1, so despite the naming it is not a normalised covariance.
df = df.assign(
    cov_of_xy=lambda frame: frame['y_minus_ybar'] * frame['x_minus_xbar']
)
cov_of_xy_sum = df.cov_of_xy.sum()

print("Covariance of x and y is %8.4f" % (cov_of_xy_sum,))

Covariance of x and y is  34.0000


In [43]:
# Square each deviation; one new column per variable (y first, then x).
df = (
    df
    .assign(y_minus_ybar_sq=lambda frame: frame['y_minus_ybar'] * frame['y_minus_ybar'])
    .assign(x_minus_xbar_sq=lambda frame: frame['x_minus_xbar'] * frame['x_minus_xbar'])
)

# Column totals of the squared deviations, kept in -sum_y_sq_err- and
# -sum_x_sq_err- for the denominator of the correlation.
sum_y_sq_err = df.y_minus_ybar_sq.sum()
sum_x_sq_err = df.x_minus_xbar_sq.sum()

print("Sum of y squared error is %8.4f" % (sum_y_sq_err,))
print("Sum of x squared error is %8.4f" % (sum_x_sq_err,))

Sum of y squared error is  20.0000
Sum of x squared error is  70.0000


In [44]:
# Display results: the bare name on the last line of the cell shows the frame
# in its current state, including every column assigned so far.
df

Unnamed: 0,x,y,y_minus_ybar,x_minus_xbar,cov_of_xy,y_minus_ybar_sq,x_minus_xbar_sq
0,2,3,-2.0,-4.0,8.0,4.0,16.0
1,10,7,2.0,4.0,8.0,4.0,16.0
2,9,6,1.0,3.0,3.0,1.0,9.0
3,5,6,1.0,-1.0,-1.0,1.0,1.0
4,8,6,1.0,2.0,2.0,1.0,4.0
5,2,3,-2.0,-4.0,8.0,4.0,16.0
6,4,3,-2.0,-2.0,4.0,4.0,4.0
7,8,6,1.0,2.0,2.0,1.0,4.0


In [45]:
# Correlation assembled from the hand-computed pieces:
#   r = (sum of cross-products)
#       / sqrt(sum of squared y-deviations * sum of squared x-deviations)
# The label embeds double quotes directly. Doubling single quotes ('') does
# not escape a quote in Python; it concatenates adjacent literals and prints
# no quotes at all. The wording mirrors the "python calculated" label below.
print('The "hand calculated" correlation results : ', end='')
print(cov_of_xy_sum / math.sqrt(sum_y_sq_err * sum_x_sq_err), end='\n\n')

The data correlation hand calculated results : 0.9086882225022429



In [47]:
# Compact alternative: reads the column totals straight from -df-, so it is
# not dependent on the intermediate variables computed above.
print(
    df['cov_of_xy'].sum()
    / math.sqrt(df['y_minus_ybar_sq'].sum() * df['x_minus_xbar_sq'].sum()),
    end='\n\n',
)

0.9086882225022429



In [49]:
# Cross-check: let the library's own corr method compute the same statistic
# from the raw y and x columns.
print('The "python calculated" correlation results : ', end='')
print(df.y.corr(df.x), end='\n\n')

The "python calculated" correlation results : 0.9086882225022428



In [50]:
# Provide formula notes for reference.
# The notes are fetched over the network from the project repository.
# - timeout: keeps the cell from hanging forever if the host is unreachable.
# - raise_for_status: stops an HTTP error page from being printed as if it
#   were the formula notes.
# Any request failure is reported in one readable line instead of a traceback.
try:
    formulas = requests.get(
        'https://raw.githubusercontent.com/adamrossnelson/crossreg/master/formulas.txt',
        timeout=10,
    )
    formulas.raise_for_status()
except requests.RequestException as err:
    print('Could not retrieve formula notes: %s' % err)
else:
    print('Provide formula notes for reference:')
    print(formulas.text)

Provide formula notes for reference:
 
                        SUM(X - Xbar)(Y - Ybar)
  Correlation =  ----------------------------------
                  SQRT(SUM(X-Xbar)^2 SUM(Y-Ybar)^2)

                             total covxy
  Correlation =  ---------------------------------------
                  SQRT(total xmxbarsq * total ymybarsq)

                                 cov
  Correlation =  ---------------------------------------
                          SQRT( xmy * ymy )

                               Cov(X,Y)
  Correlation =  ---------------------------------------
                        SQRT(Var(X) * Var(Y))

                               Cov(X,Y)
  Correlation =  ---------------------------------------
                        SQRT(Var(X) * Var(Y))

