In [1]:
import numpy as np
import pandas as pd

In [10]:
# Load the California housing dataset via scikit-learn. The recorded output
# below shows it being downloaded into the local scikit_learn_data directory.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to C:\Users\hp\scikit_learn_data


In [20]:
# Print the description text shipped with the dataset (attributes, source, target).
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [25]:
# Assemble the predictors and the target into one frame. The target (y) is the
# median house value for California districts and is appended as the last column.
df = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
).assign(median_house_value=housing.target)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,median_house_value
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


# t검정과 ANOVA의 조건
1) 표본이 정규분포를 따른다.<br>
2) 등분산<br>
3) 각 표본은 독립

위와 같은 조건들이 필요하지만 아래는 예시를 위한 것이므로 해당 조건을 고려하지 않도록 한다.



# 단일표본 t 검정
$H_0 : \mu = \text{popmean}$

In [41]:
from scipy.stats import ttest_1samp

# One-sample t-test: is the mean of the median house values equal to 2?
house_values = df['median_house_value']
tstats, pvalue = ttest_1samp(house_values, popmean=2)
print('t통계량 : ', tstats, 'p-value : ', pvalue)

t통계량 :  8.535417086565456 p-value :  1.491623832774445e-17


In [34]:
# Repeat the one-sample test against 2.0685, which is very close to the
# sample mean reported by describe(), so the null should not be rejected.
tstats, pvalue = ttest_1samp(
    df['median_house_value'],
    popmean=2.0685,
)
print('t통계량 : ', tstats, 'p-value : ', pvalue)

t통계량 :  0.007241987993753399 p-value :  0.9942218500916473


# 독립표본 t 검정
$H_0 : \text{mean}(\text{data1}) = \text{mean}(\text{data2})$

In [37]:
from scipy.stats import ttest_ind

# Two-sample t-test comparing the means of MedInc and AveOccup.
# NOTE(review): called with SciPy's defaults, which assume equal population
# variances; the notebook states above that test assumptions are ignored here.
income, occupancy = df['MedInc'], df['AveOccup']
tstats, pvalue = ttest_ind(income, occupancy)
print('t통계량 : ', tstats, 'p-value : ', pvalue)

t통계량 :  10.885696560227869 p-value :  1.4702074542030424e-27


In [40]:
# Shift MedInc down by 0.6 so the two sample means are closer, then re-test.
shifted_income = df['MedInc'] - 0.6
tstats, pvalue = ttest_ind(shifted_income, df['AveOccup'])
print('t통계량 : ', tstats, 'p-value : ', pvalue)

t통계량 :  2.721585824867504 p-value :  0.006499675506229937


# ANOVA
(일원배치분산분석)

In [54]:
from scipy.stats import f_oneway

# One-way ANOVA over three groups; MedInc and AveRooms are shifted by a
# constant before the comparison.
anova_groups = (
    df['MedInc'] - 0.8,
    df['AveOccup'],
    df['AveRooms'] - 2,
)
fstats, pvalue = f_oneway(*anova_groups)
print('f통계량 : ', fstats, 'p-value : ', pvalue)

f통계량 :  22.536247540900362 p-value :  1.6451014332533905e-10


# 사후검정
data1, data2, data3의 평균이 같다는 귀무가설은 기각<br>
어디서 차이가 나는지도 알아보자

In [50]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# The same three (shifted) samples used in the ANOVA cell, keyed by group name.
samples = {
    'MedInc': df['MedInc'] - 0.8,
    'AveOccup': df['AveOccup'],
    'AveRooms': df['AveRooms'] - 2,
}

# Stack all observations into one vector and build one group label per observation,
# in the same order as the stacked values.
data = np.concatenate(list(samples.values()))
labels = [name for name, values in samples.items() for _ in range(len(values))]

# Tukey HSD post-hoc comparison at a 5% family-wise error rate.
tukey_result = pairwise_tukeyhsd(data, labels, alpha=0.05)

In [51]:
# Sanity check: shape of the stacked observation vector.
data.shape

(61920,)

In [52]:
# Sanity check: the number of labels should match the number of observations.
len(labels)

61920

In [53]:
# Show the pairwise comparison table (mean difference, adjusted p-value, reject flag).
print(tukey_result)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj  lower   upper  reject
-------------------------------------------------------
AveOccup AveRooms   0.3583 0.001  0.2139  0.5028   True
AveOccup   MedInc      0.0   0.9 -0.1444  0.1445  False
AveRooms   MedInc  -0.3583 0.001 -0.5028 -0.2139   True
-------------------------------------------------------


# 이항검정
범주가 두 개인 경우

In [56]:
# Binomial test for 515 successes out of 1000 trials under a null success
# probability of 0.5 (two-sided by default).
# `scipy.stats.binom_test` was deprecated in SciPy 1.10 and removed in 1.12;
# `binomtest` is its replacement and returns a result object whose `.pvalue`
# attribute holds the p-value.
from scipy.stats import binomtest
pval = binomtest(515, n=1000, p=0.5).pvalue
print(pval)

0.35911734869080425


In [60]:
# Same binomial test with 550 successes out of 1000 trials.
# Uses `binomtest` because `scipy.stats.binom_test` was removed in SciPy 1.12;
# imported here so the cell does not depend on the removed name.
from scipy.stats import binomtest
pval = binomtest(550, n=1000, p=0.5).pvalue
print(pval)

0.0017305360849770046


# 카이제곱 검정

In [63]:
from scipy.stats import chi2_contingency

# 4x2 contingency table of observed counts (one row per group).
data = [
    [30, 10],
    [35, 5],
    [28, 12],
    [20, 20],
]

# Chi-square test of independence; also yields the degrees of freedom and the
# expected counts under independence.
chi2, pval, dof, expected = chi2_contingency(data)
print('chi2: {}, pvalue: {}, dof: {}\n {}'.format(chi2, pval, dof, expected))

chi2: 14.068913575597815, pvalue: 0.002812834559546625, dof: 3
 [[28.25 11.75]
 [28.25 11.75]
 [28.25 11.75]
 [28.25 11.75]]


# 다중공선성(vif) 확인

In [68]:
# Join every column except the last one (the target) with '+' to form the
# right-hand side of a regression formula.
feature_names = df.columns[:-1]
str_of_features = '+'.join(feature_names)
print(str_of_features)

MedInc+HouseAge+AveRooms+AveBedrms+Population+AveOccup+Latitude+Longitude


In [72]:
from patsy import dmatrices
import statsmodels.api as sm

# Build the response (y) and design matrix (X) from a formula string.
# The formula syntax is very similar to R's.
formula = 'median_house_value ~ ' + str_of_features
y, X = dmatrices(formula, data=df, return_type='dataframe')

In [76]:
# Fit an ordinary least squares (OLS) regression of y on X and print the
# full regression summary.
ols_model = sm.OLS(y, X)
result = ols_model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     3970.
Date:                Sun, 09 Aug 2020   Prob (F-statistic):               0.00
Time:                        18:11:29   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.527e+04
Df Residuals:                   20631   BIC:                         4.534e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -36.9419      0.659    -56.067      0.0

다중공선성이 의심된다는 메시지가 출력되었으므로 해당 칼럼을 찾아 제거해야 한다.

In [78]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

# Compute the variance inflation factor for each column of the design matrix,
# in column order.
design = X.values
vif = [VIF(design, col) for col in range(design.shape[1])]

In [80]:
# Pair each VIF value with the name of the design-matrix column it belongs to.
vif_df = pd.DataFrame({'vif': vif, 'name': X.columns})

In [81]:
# Display the VIF table (one row per design-matrix column).
vif_df

Unnamed: 0,vif,name
0,17082.623698,Intercept
1,2.501295,MedInc
2,1.241254,HouseAge
3,8.342786,AveRooms
4,6.994995,AveBedrms
5,1.138125,Population
6,1.008324,AveOccup
7,9.297624,Latitude
8,8.962263,Longitude


보통 VIF가 4 이상이면 다중공선성이 있다고 의심하고, 10 이상이면 다중공선성이 매우 크다고 본다.