In [20]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy import stats

# Data directory, resolved to an absolute path from the current working directory.
# The bare `exists()` expression displays whether that directory was found.
data_dir = Path('../data').resolve()
data_dir.exists()

True

In [2]:
# input variables
input_variables = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd', 'GarageCars']

# Build the analysis frame in one chain so no column is ever assigned on a
# filtered slice (which raises SettingWithCopyWarning) and the cell can be
# re-run without depending on a previously mutated `df`.
df = (
    pd.read_csv(data_dir / 'train.csv', index_col='Id')
    # data section of interest: rows with MSZoning == 'RL' and
    # SaleCondition == 'Normal', restricted to the inputs plus the target
    .loc[
        lambda d: (d['MSZoning'] == 'RL') & (d['SaleCondition'] == 'Normal'),
        input_variables + ['SalePrice'],
    ]
    # target and lot area normalization: natural log, applied vectorized;
    # assigning to existing columns keeps their position in the frame
    .assign(
        SalePrice=lambda d: np.log(d['SalePrice']),
        LotArea=lambda d: np.log(d['LotArea']),
    )
)

df.head()

Unnamed: 0_level_0,LotArea,OverallQual,YearBuilt,TotRmsAbvGrd,GarageCars,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,9.041922,7,2003,8,2,12.247694
2,9.169518,6,1976,6,2,12.109011
3,9.328123,7,2001,6,2,12.317167
5,9.565214,8,2000,9,3,12.429216
6,9.554993,5,1993,5,2,11.8706


In [18]:
# Significance level for the analysis.
# NOTE(review): role assumed from the name — confirm which cells consume it.
alpha = 0.05

# Correlation

Pearson's linear correlation test.

In [30]:
# Pearson correlation of each input variable with the (log-transformed) SalePrice,
# sorted ascending. The target's self-correlation is removed by label: slicing off
# the last row after sorting would discard the wrong entry if any correlation were
# NaN (NaNs sort last) or if another variable tied at 1.0.
corrs = df.corr(method='pearson')['SalePrice'].drop('SalePrice').sort_values()
corrs

LotArea         0.349376
TotRmsAbvGrd    0.558417
YearBuilt       0.576022
GarageCars      0.668399
OverallQual     0.835257
Name: SalePrice, dtype: float64

All variables appear to be moderately correlated with SalePrice, except for OverallQual, which has a strong correlation.

## Confidence intervals

In [34]:
def confidence_interval(r, alpha=0.05, n=None):
    """Confidence interval for a Pearson correlation via the Fisher z-transform.

    Parameters
    ----------
    r : float
        Sample correlation coefficient.
    alpha : float, default 0.05
        Significance level; the normal quantile at ``1 - alpha / 2`` sets the
        half-width of the interval in z-space.
    n : int, optional
        Sample size. When omitted, the row count of the notebook's ``df`` is
        looked up at call time (not frozen when the function is defined).

    Returns
    -------
    numpy.ndarray
        Two-element array ``[low, high]`` on the correlation scale.

    Raises
    ------
    ValueError
        If ``n <= 3``, where the standard error ``1 / sqrt(n - 3)`` is undefined.
    """
    # from https://zhiyzuo.github.io/Pearson-Correlation-CI-in-Python/
    if n is None:
        # Resolved here rather than in the signature so a later re-filtering of
        # `df` is picked up and the function can be defined before `df` exists.
        n = df.shape[0]
    if n <= 3:
        raise ValueError(f"n must be greater than 3 to compute the interval, got {n}")

    z = np.arctanh(r)
    z_alpha = stats.norm.ppf(1 - alpha / 2)

    se = 1 / np.sqrt(n - 3)

    a, b = z - z_alpha * se, z + z_alpha * se

    # Map the z-space bounds back to the correlation scale.
    return np.tanh((a, b))

# One interval per correlation. The significance level and the sample size are
# passed explicitly so the result follows the notebook's `alpha` and the current
# `df`, instead of silently relying on the function's defaults.
cis = corrs.apply(confidence_interval, alpha=alpha, n=df.shape[0])
cis

LotArea          [0.29241939186022864, 0.40386232089516]
TotRmsAbvGrd    [0.5131574554784188, 0.6005788383928516]
YearBuilt       [0.5320255704916634, 0.6169166042783123]
GarageCars       [0.6317481525272429, 0.702067324252386]
OverallQual     [0.8150033966782022, 0.8534715878375436]
Name: SalePrice, dtype: object

In [56]:
# Unpack each (low, high) pair into its own column, keyed by variable name,
# then attach the point estimate alongside the bounds.
df_corr = pd.DataFrame(
    cis.tolist(), index=cis.index, columns=['low', 'high']
).assign(corr=corrs)

# Display with the estimate between its bounds.
df_corr.loc[:, ['low', 'corr', 'high']]

Unnamed: 0,low,corr,high
LotArea,0.292419,0.349376,0.403862
TotRmsAbvGrd,0.513157,0.558417,0.600579
YearBuilt,0.532026,0.576022,0.616917
GarageCars,0.631748,0.668399,0.702067
OverallQual,0.815003,0.835257,0.853472


# Hypothesis tests

In [58]:
# Hard-coded critical value for the t test.
# NOTE(review): 1.962 is presumably the two-sided 5% point of Student's t for this
# sample's degrees of freedom — confirm, and consider deriving it with stats.t.ppf.
t_alpha = 1.962

In [57]:
def t(r, n=None):
    """Return ``r * sqrt(n - 2) / sqrt(1 - r**2)``, the t statistic of a correlation.

    Parameters
    ----------
    r : float
        Sample correlation coefficient.
    n : int, optional
        Sample size. When omitted, the row count of the notebook's ``df`` is
        looked up at call time (not frozen when the function is defined).

    Returns
    -------
    float
        The statistic; its magnitude grows without bound as ``|r|`` approaches 1.
    """
    if n is None:
        # Resolved here rather than in the signature so the value tracks `df`.
        n = df.shape[0]
    return r * np.sqrt(n - 2) / np.sqrt(1 - r * r)

# t statistic for r = 0.349376, the LotArea correlation as displayed in `corrs`.
t(0.349376)

11.510865732852656