In [51]:
import pandas 

In [52]:
# Load the cars dataset; the preview below shows `speed` and `dist` columns.
# NOTE(review): the preview also shows "Unnamed: 0.1" / "Unnamed: 0" columns —
# presumably row indexes that were written back into the CSV; confirm whether
# they should be dropped or read with index_col.
cars = pandas.read_csv('cars.csv')

In [53]:
# Preview the first rows of the cars data.
cars.head()

Unnamed: 0.1,Unnamed: 0,speed,dist
0,1,4,2
1,2,4,10
2,3,7,4
3,4,7,22
4,5,8,16


# 상관계수

In [54]:
from scipy.stats import pearsonr

### (피어슨 상관계수, p값) : p값이 작을수록 "상관이 0이 아니다"라는 증거가 강하다
- p값 < .05 : 95% 신뢰구간에 반대 부호(및 0)가 포함되지 않음
- p값 < .01 : 99% 신뢰구간에 반대 부호(및 0)가 포함되지 않음
- p값 < .001 : 99.9% 신뢰구간에 반대 부호(및 0)가 포함되지 않음

In [55]:
# Pearson correlation between speed and dist; the displayed result is the
# pair (correlation coefficient, p-value).
pearsonr(cars['speed'],cars['dist'])

(0.8068949006892105, 1.4898364962950763e-12)

In [56]:
from sklearn.utils import resample

In [57]:
# Draw one resampled copy of cars using sklearn's resample with its defaults.
# NOTE(review): presumably a bootstrap sample (same size, drawn with
# replacement) — confirm against the sklearn.utils.resample documentation.
df = resample(cars)

In [58]:
# Pearson correlation on the resampled data; res is (coefficient, p-value).
res = pearsonr(df['speed'],df['dist'])

In [59]:
res[0]  # correlation coefficient of the resampled data

0.8204470024616676

In [60]:
# Bootstrap distribution of the Pearson correlation between speed and dist.
# cars is resampled 10,000 times and the correlation coefficient of each
# resample is collected into cors.
# pearsonr's result is indexable: [0] is the correlation coefficient and
# [1] is the p-value, so only [0] is kept.
# The comprehension keeps the per-iteration sample local instead of
# rebinding the notebook-level df / res on every pass.
cors = [
    pearsonr(sample['speed'], sample['dist'])[0]
    for sample in (resample(cars) for _ in range(10000))
]

In [61]:
import numpy as np

In [62]:
# 95% confidence interval of the correlation coefficient: the 2.5% and 97.5%
# quantiles of the bootstrapped correlations.
np.quantile(cors,[.025,.975])

array([0.70177795, 0.88457381])

In [63]:
# 99% confidence interval of the correlation coefficient (0.5% and 99.5%
# quantiles); raising the confidence level makes the interval wider.
np.quantile(cors,[.005,.995])

array([0.65411298, 0.90293636])

# 스피어만과 켄달 상관계수 
- 등수나 서열이 있는 data

In [64]:
# Load the liar dataset; the preview below shows Creativity, Position and
# Novice columns.
liar = pandas.read_csv('liar.csv')

In [65]:
# Preview the first rows of the liar data.
liar.head()

Unnamed: 0,Creativity,Position,Novice
0,53,1,0
1,36,3,1
2,31,4,0
3,43,2,0
4,30,4,1


In [66]:
from scipy.stats import spearmanr, kendalltau

In [67]:
# Spearman rank correlation between Creativity and Position.
# The coefficient is negative: higher Creativity goes with a numerically
# lower Position. The notebook reads this as "the more creative, the better
# the liar" — assumes Position 1 is the best rank; TODO confirm.
spearmanr(liar['Creativity'], liar['Position'])

SpearmanrResult(correlation=-0.37321838128767815, pvalue=0.0017204168895658578)

In [42]:
# Kendall's tau between Creativity and Position.
# Also negative, read the same way as the Spearman result: higher Creativity
# goes with a numerically lower Position (assumes Position 1 is the best
# rank; TODO confirm).
kendalltau(liar['Creativity'], liar['Position'])

KendalltauResult(correlation=-0.3002413080651747, pvalue=0.001258802279346817)

In [44]:
# Pearson correlation on the same two columns, for comparison with the
# rank-based coefficients above; the result is (coefficient, p-value).
pearsonr(liar['Creativity'], liar['Position'])

(-0.30603143483570205, 0.01114802877289378)

- 창의성과 거짓말 등수 사이에 역상관 -> 창의성이 높을수록 거짓말을 잘한다
- p value는 최소한 .05보단 작아야한다
- 확실한 결론: 스피어만과 켄달 둘 다 해봄 
- 그렇지만 보통 스피어만만 사용함 

# 자동차 데이터로 회귀분석

In [70]:
from statsmodels.formula.api import ols

In [75]:
res = ols('dist ~ speed', data=cars ).fit() 
# R-style formula notation: 'dist ~ speed'.
# As in y = ax + b, dist is on the left (y, the dependent variable) and
# speed is on the right (x, the independent variable).
# Read "~" as "by": dist varies by speed.

In [76]:
# Show the regression summary table for dist ~ speed.
res.summary()

0,1,2,3
Dep. Variable:,dist,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.644
Method:,Least Squares,F-statistic:,89.57
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,1.49e-12
Time:,14:00:50,Log-Likelihood:,-206.58
No. Observations:,50,AIC:,417.2
Df Residuals:,48,BIC:,421.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-17.5791,6.758,-2.601,0.012,-31.168,-3.990
speed,3.9324,0.416,9.464,0.000,3.097,4.768

0,1,2,3
Omnibus:,8.975,Durbin-Watson:,1.676
Prob(Omnibus):,0.011,Jarque-Bera (JB):,8.189
Skew:,0.885,Prob(JB):,0.0167
Kurtosis:,3.893,Cond. No.,50.7


### 선형 모형 돌릴 때는 coef(intercept, speed) 값만 나오는데 회귀 모형은 많은 값들이 나옴 
- dist = 3.93 × speed − 17.58 (3.93 = speed의 계수, −17.58 = intercept)
- 근데 회귀 분석 모델은 다 나오게 해줌 (coef표 맨 끝에 [0.025 0.975] 95% 신뢰구간까지 다! )
- P>|t| : 각 계수의 p값. 0.05와 비교해서 해석함 (95% 신뢰구간의 양 끝이 둘 다 + 이거나 둘 다 - 이면, 즉 0을 포함하지 않으면 p값이 0.05보다 작게 나옴) 
- 계수에다가 ± (약 2 × std err) 를 하면 대략 95% 신뢰구간이 나옴 (std err는 신뢰구간을 구하는 과정에서 나오는 수치) 
- t: 계수 ÷ std err. p를 구하는 과정에서 나오는 중간 수치 

# 아동공격성 데이터로 회귀분석2

In [78]:
# Load the child aggression dataset; the preview below shows Aggression,
# Television, Computer_Games, Sibling_Aggression, Diet and Parenting_Style.
child = pandas.read_csv('child.csv')

In [79]:
# Preview the first rows of the child data.
child.head()

Unnamed: 0,Aggression,Television,Computer_Games,Sibling_Aggression,Diet,Parenting_Style
0,0.37416,0.172671,0.141907,-0.328216,-0.110303,-0.279034
1,0.771153,-0.032872,0.709918,0.576837,-0.02299,-1.248167
2,-0.097728,-0.07446,-0.390141,-0.217184,0.280301,-0.328063
3,0.015935,-0.004427,-0.40808,0.046223,-0.263479,-1.005119
4,-0.275385,-0.675239,-0.277778,-0.891045,0.226581,0.489478


In [91]:
# Simple regression: Aggression (dependent) explained by Television.
res = ols('Aggression ~ Television', data=child ).fit() 

In [92]:
# Show the regression summary table for Aggression ~ Television.
res.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,17.11
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,3.98e-05
Time:,15:17:40,Log-Likelihood:,-175.93
No. Observations:,666,AIC:,355.9
Df Residuals:,664,BIC:,364.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0005,0.012,-0.041,0.967,-0.025,0.024
Television,0.1634,0.040,4.137,0.000,0.086,0.241

0,1,2,3
Omnibus:,24.471,Durbin-Watson:,1.931
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58.038
Skew:,0.108,Prob(JB):,2.5e-13
Kurtosis:,4.43,Cond. No.,3.23


### 분석결과 : 두 변수의 관계는 유의미하다 (신뢰구간 부호가 ++ 이며, p값은 0.05보다 낮다) 
### television 점수가 1점 올라가면 Aggression도 약 0.16 올라감 
### 근데 R²(수정 R² 포함)는 낮다..! (Aggression 분산의 2.5% 정도만 television으로 설명된다) -> 통계적으로는 유의미하지만 설명력은 작음 
### Prob (F-statistic)도 0.05보다 작기 때문에 모형 전체도 통계적으로 유의미하다. 
### aic, bic는 355.9, 364.9 (그 자체로는 해석하기 어렵고, 다른 모형과 비교할 때 낮을수록 좋음) 
## 결론: - 공격성이 증가하는 건 맞는데, 작은 비중이다.

In [93]:
# Simple regression: Aggression explained by Computer_Games.
res1 = ols('Aggression~ Computer_Games', data=child ).fit() 

In [94]:
# Show the regression summary table for Aggression ~ Computer_Games.
res1.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.035
Model:,OLS,Adj. R-squared:,0.033
Method:,Least Squares,F-statistic:,23.9
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,1.27e-06
Time:,15:18:19,Log-Likelihood:,-172.63
No. Observations:,666,AIC:,349.3
Df Residuals:,664,BIC:,358.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0068,0.012,-0.560,0.576,-0.031,0.017
Computer_Games,0.1742,0.036,4.889,0.000,0.104,0.244

0,1,2,3
Omnibus:,25.478,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66.334
Skew:,-0.011,Prob(JB):,3.94e-15
Kurtosis:,4.546,Cond. No.,2.93


### 컴퓨터와 텔레비전을 비교해보면 컴퓨터의 R²이 더 높다 (3.5%와 2.5%). 그리고 aic, bic도 더 낮음. 
### 그러니까 컴퓨터 게임이 공격성을 더 많이 설명한다. -> 그치만 3.5%라서 이것도 설명력이 크지는 않음

In [95]:
# Simple regression: Aggression explained by Diet.
res2 = ols('Aggression ~ Diet', data=child ).fit() 

In [96]:
# Show the regression summary table for Aggression ~ Diet.
res2.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.04891
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,0.825
Time:,15:19:05,Log-Likelihood:,-184.38
No. Observations:,666,AIC:,372.8
Df Residuals:,664,BIC:,381.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0049,0.012,-0.397,0.692,-0.029,0.019
Diet,-0.0081,0.037,-0.221,0.825,-0.080,0.064

0,1,2,3
Omnibus:,27.097,Durbin-Watson:,1.928
Prob(Omnibus):,0.0,Jarque-Bera (JB):,73.373
Skew:,-0.023,Prob(JB):,1.17e-16
Kurtosis:,4.625,Cond. No.,2.97


### 신뢰구간 부호가 -+ 이며 p값도 0.05 보다 더 큼 -> 음식은 공격성과 상관이 없다. 

# 독립변수가 2개인 경우 

In [98]:
# Multiple regression: Aggression explained by Television and Computer_Games
# together in one model.
res3 = ols('Aggression ~ Television+Computer_Games', data=child ).fit() 

In [99]:
res3.summary()
# Television and Computer_Games enter the same model as two independent
# variables, each with its own coefficient:
# Aggression = a1*Television + a2*Computer_Games + b  (one equation, not two)

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.051
Model:,OLS,Adj. R-squared:,0.049
Method:,Least Squares,F-statistic:,17.99
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,2.45e-08
Time:,15:21:32,Log-Likelihood:,-166.81
No. Observations:,666,AIC:,339.6
Df Residuals:,663,BIC:,353.1
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0029,0.012,-0.237,0.813,-0.027,0.021
Television,0.1353,0.040,3.420,0.001,0.058,0.213
Computer_Games,0.1539,0.036,4.293,0.000,0.083,0.224

0,1,2,3
Omnibus:,24.166,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57.964
Skew:,0.091,Prob(JB):,2.59e-13
Kurtosis:,4.434,Cond. No.,3.42


# 위 결과를 보면 독립변수가 많아지니까 R²의 값이 증가함. 
# aic, bic 값도 셋 중 제일 낮음
## (수정 R²까지 세 지수가 다 좋아진 걸 보면 -> 공격성을 설명하는 데 변수가 하나인 모형보다 변수가 여러 개인 모형이 낫다는 해석)

# tv, 컴퓨터 -> 공격성 vs tv, 컴퓨터, 형제 -> 공격성 비교

In [116]:
# Simple regression: Aggression explained by Sibling_Aggression alone,
# then show its summary table.
res6 = ols('Aggression ~ Sibling_Aggression',data=child).fit()
res6.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,11.3
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,0.000821
Time:,16:10:01,Log-Likelihood:,-178.79
No. Observations:,666,AIC:,361.6
Df Residuals:,664,BIC:,370.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0061,0.012,-0.493,0.622,-0.030,0.018
Sibling_Aggression,0.1264,0.038,3.361,0.001,0.053,0.200

0,1,2,3
Omnibus:,24.126,Durbin-Watson:,1.903
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60.452
Skew:,-0.025,Prob(JB):,7.47e-14
Kurtosis:,4.475,Cond. No.,3.06


In [114]:
# Two-predictor model (same formula as the earlier res3 fit), refit here as
# res4 so it can be compared with the three-predictor model.
res4 = ols('Aggression ~ Television+Computer_Games',data=child).fit()
res4.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.051
Model:,OLS,Adj. R-squared:,0.049
Method:,Least Squares,F-statistic:,17.99
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,2.45e-08
Time:,16:09:33,Log-Likelihood:,-166.81
No. Observations:,666,AIC:,339.6
Df Residuals:,663,BIC:,353.1
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0029,0.012,-0.237,0.813,-0.027,0.021
Television,0.1353,0.040,3.420,0.001,0.058,0.213
Computer_Games,0.1539,0.036,4.293,0.000,0.083,0.224

0,1,2,3
Omnibus:,24.166,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57.964
Skew:,0.091,Prob(JB):,2.59e-13
Kurtosis:,4.434,Cond. No.,3.42


In [115]:
# Three-predictor model: adds Sibling_Aggression to Television and
# Computer_Games, then shows its summary table.
res5 = ols('Aggression ~ Television+Computer_Games+Sibling_Aggression', data=child ).fit() 
res5.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.056
Model:,OLS,Adj. R-squared:,0.051
Method:,Least Squares,F-statistic:,13.03
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,2.81e-08
Time:,16:09:34,Log-Likelihood:,-165.3
No. Observations:,666,AIC:,338.6
Df Residuals:,662,BIC:,356.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0037,0.012,-0.304,0.761,-0.027,0.020
Television,0.1214,0.040,3.015,0.003,0.042,0.201
Computer_Games,0.1416,0.036,3.879,0.000,0.070,0.213
Sibling_Aggression,0.0669,0.039,1.731,0.084,-0.009,0.143

0,1,2,3
Omnibus:,22.454,Durbin-Watson:,1.923
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.404
Skew:,0.07,Prob(JB):,4.18e-12
Kurtosis:,4.367,Cond. No.,3.6


### 형제들의 공격성만 넣으면 공격성과 유의미한 관계가 있지만 (p = 0.001) 
### 형제들 + TV + 컴퓨터 게임을 함께 넣으면 형제들의 공격성 변수는 관계가 사라짐 (신뢰구간 부호가 -+ 이고, p값도 0.084로 0.05보다 높아짐)
# 결론 : tv와 컴퓨터를 *통계적으로 통제* 했을 때, 형제의 공격성은 아동의 공격성을 유의미하게 설명하지 못한다. 

- 그렇지만 모형은 적합도 지수로 고르기 때문에, '티비, 게임 -> 공격성' 모형보다 '형제(유의하지 않음), tv, 게임 -> 공격성' 모형의 지수가 더 좋으므로 (수정 R² 0.049 → 0.051, AIC 339.6 → 338.6; 단, BIC는 353.1 → 356.6으로 오히려 커짐) 후자 모형을 선택하자! : 형제가 영향을 미친다는 결론은 낼 수 없지만, 적합도 지수로 보면 모형이 더 좋아지기 때문에 빼면 안 된다. 
- - 모형 전체가 잘 돌아가느냐와 개별 변수가 영향을 미치느냐는 다른 문제다. 
- ex) 스마트폰에 있는 기능을 안 쓰는 거랑 아예 없는 거는 다르지 않냐

# 최적의 변수 조합 찾기

In [120]:
# Four-predictor model: Computer_Games, Diet, Sibling_Aggression and
# Parenting_Style, followed by its summary table.
# NOTE(review): this rebinds res4, which earlier held the
# Television+Computer_Games model.
res4 = ols('Aggression ~ Computer_Games+Diet+Sibling_Aggression+Parenting_Style', data=child ).fit() 
res4.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.082
Model:,OLS,Adj. R-squared:,0.076
Method:,Least Squares,F-statistic:,14.74
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,1.54e-11
Time:,16:30:18,Log-Likelihood:,-155.96
No. Observations:,666,AIC:,321.9
Df Residuals:,661,BIC:,344.4
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0059,0.012,-0.497,0.619,-0.029,0.017
Computer_Games,0.1434,0.037,3.891,0.000,0.071,0.216
Diet,-0.1116,0.038,-2.947,0.003,-0.186,-0.037
Sibling_Aggression,0.0863,0.038,2.258,0.024,0.011,0.161
Parenting_Style,0.0619,0.013,4.925,0.000,0.037,0.087

0,1,2,3
Omnibus:,25.206,Durbin-Watson:,1.911
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64.229
Skew:,0.051,Prob(JB):,1.13e-14
Kurtosis:,4.518,Cond. No.,3.48


그렇지만 모형은 적합도 지수로 고르기 때문에, '티비, 게임 -> 공격성' 모형보다 '형제(유의하지 않음), tv, 게임 -> 공격성' 모형의 지수가 더 좋으므로 (수정 R² 0.049 → 0.051, AIC 339.6 → 338.6; 단, BIC는 353.1 → 356.6으로 오히려 커짐) 후자 모형을 선택하자! : 형제가 영향을 미친다는 결론은 낼 수 없지만, 적합도 지수로 보면 모형이 더 좋아지기 때문에 빼면 안 된다.
- -> 이 기준에 따르면, 개별적으로는 관계가 없어 보이는 변수가 들어 있더라도 여러 개의 독립변수를 둔 모형의 적합도 지수가 가장 좋다! 
- -> 모형 설계와 변수 해석은 다르다

# 후방선택

In [119]:
# Backward selection result: the model left after removing Television
# (same formula as the four-predictor res4 fit in the preceding cell).
res4 = ols('Aggression ~ Computer_Games+Diet+Sibling_Aggression+Parenting_Style', data=child ).fit() 
res4.summary()
# Start from Television+Computer_Games+Diet+Sibling_Aggression+Parenting_Style
# and drop one variable at a time (result: dropping Television was best).
# Then try dropping one more (every further removal made the model worse
# -> stop): this is how the best model was found.

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.082
Model:,OLS,Adj. R-squared:,0.076
Method:,Least Squares,F-statistic:,14.74
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,1.54e-11
Time:,16:29:34,Log-Likelihood:,-155.96
No. Observations:,666,AIC:,321.9
Df Residuals:,661,BIC:,344.4
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0059,0.012,-0.497,0.619,-0.029,0.017
Computer_Games,0.1434,0.037,3.891,0.000,0.071,0.216
Diet,-0.1116,0.038,-2.947,0.003,-0.186,-0.037
Sibling_Aggression,0.0863,0.038,2.258,0.024,0.011,0.161
Parenting_Style,0.0619,0.013,4.925,0.000,0.037,0.087

0,1,2,3
Omnibus:,25.206,Durbin-Watson:,1.911
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64.229
Skew:,0.051,Prob(JB):,1.13e-14
Kurtosis:,4.518,Cond. No.,3.48
