# 1. 상관계수

### 1. 상관계수

In [1]:
import pandas as pd

# Five paired height (키) / weight (몸무게) observations.
data = {
    '키': [150, 160, 170, 175, 165],
    '몸무게': [42, 50, 70, 64, 56],
}
df = pd.DataFrame.from_dict(data)

# Pairwise correlation matrix of the two columns (pandas default method).
df.corr()

Unnamed: 0,키,몸무게
키,1.0,0.919509
몸무게,0.919509,1.0


In [2]:
# Pull the single 키-vs-몸무게 coefficient out of the matrix by label.
df.corr().loc['키', '몸무게']

0.9195090879163764

In [3]:
# Series-to-Series correlation, computed in both argument orders; the first
# value is printed and the second is shown as the cell's result.
print(df['키'].corr(df['몸무게']))
df['몸무게'].corr(df['키'])

0.9195090879163765


0.9195090879163765

In [4]:
# Print the default correlation matrix followed by the Spearman one,
# each on its own line block.
print(df.corr(), df.corr(method='spearman'), sep='\n')

# Kendall's tau matrix is left as the cell's displayed value.
df.corr(method='kendall')

            키       몸무게
키    1.000000  0.919509
몸무게  0.919509  1.000000
       키  몸무게
키    1.0  0.9
몸무게  0.9  1.0


Unnamed: 0,키,몸무게
키,1.0,0.8
몸무게,0.8,1.0


### 2. 두 변수의 상관 계수와 t-검정

In [7]:
from scipy import stats
# Correlation coefficient together with its p-value for the same pair of
# columns, under Pearson, Spearman and Kendall in that order.
print(
    stats.pearsonr(df['몸무게'], df['키']),
    stats.spearmanr(df['몸무게'], df['키']),
    stats.kendalltau(df['몸무게'], df['키']),
    sep='\n',
)

PearsonRResult(statistic=0.9195090879163766, pvalue=0.02707945689558947)
SignificanceResult(statistic=0.8999999999999998, pvalue=0.03738607346849874)
SignificanceResult(statistic=0.7999999999999999, pvalue=0.08333333333333333)


# 2. 단순 선형 회귀 분석

In [12]:
import pandas as pd
# Twenty paired height (키) / weight (몸무게) observations used by the
# simple linear regression cells that follow.
data = {
    '키': [
        150, 160, 170, 175, 165,
        155, 172, 168, 174, 158,
        162, 173, 156, 159, 167,
        163, 171, 169, 176, 161,
    ],
    '몸무게': [
        42, 50, 70, 64, 56,
        48, 68, 60, 65, 52,
        54, 67, 49, 51, 58,
        55, 69, 61, 66, 53,
    ],
}
df = pd.DataFrame.from_dict(data)

In [13]:
from statsmodels.formula.api import ols
# Ordinary least squares with 키 as the response and 몸무게 as the single
# predictor; the fitted results object is reused by the cells below.
model = ols(formula='키 ~ 몸무게', data=df).fit()

# Full regression report as the cell's displayed value.
model.summary()

0,1,2,3
Dep. Variable:,키,R-squared:,0.892
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,148.0
Date:,"Fri, 13 Jun 2025",Prob (F-statistic):,4.04e-10
Time:,19:11:07,Log-Likelihood:,-45.761
No. Observations:,20,AIC:,95.52
Df Residuals:,18,BIC:,97.51
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,115.0676,4.158,27.671,0.000,106.331,123.804
몸무게,0.8658,0.071,12.167,0.000,0.716,1.015

0,1,2,3
Omnibus:,0.985,Durbin-Watson:,2.609
Prob(Omnibus):,0.611,Jarque-Bera (JB):,0.336
Skew:,-0.315,Prob(JB):,0.845
Kurtosis:,3.082,Cond. No.,432.0


In [14]:
# R-squared of the fitted model (the value reported in the summary table).
print(model.rsquared)

0.8915914350087261


In [15]:
# Fitted coefficients: the slope on 몸무게 is printed, the intercept is
# shown as the cell's result.
print(model.params['몸무게'])
model.params['Intercept']

0.8658438852380184


115.06763904471866

In [16]:
# p-value of the 몸무게 coefficient: first in the default (scientific)
# float repr, then as fixed-point with ten decimal places.
print(model.pvalues['몸무게'])
print(f"{model.pvalues['몸무게']:.10f}")

4.03793255993052e-10
0.0000000004


In [17]:
import pandas as pd
# Predict 키 for one new observation whose 몸무게 is 67.
new_data = pd.DataFrame({'몸무게': [67]})
result = model.predict(new_data)

# Only one row was supplied, so take the first (and only) prediction.
print(result.iloc[0])

173.0791793556659


In [18]:
# Residual = observed 키 minus the model's fitted value for the same row.
df['잔차'] = df['키'] - model.predict(df)

# Sum of squared residuals. Series.sum() is the vectorised pandas reduction;
# the builtin sum() would iterate the Series one element at a time in Python.
(df['잔차'] ** 2).sum()

113.74226638884456

In [19]:
# Residuals of the fitted model: observed 키 minus predicted 키.
df['잔차'] = df['키'].sub(model.predict(df))

# Mean squared error = average of the squared residuals.
MSE = df['잔차'].pow(2).mean()
MSE

5.687113319442227

In [21]:
from sklearn.metrics import mean_squared_error
# Pass the predictor as a one-column DataFrame (double brackets) so the
# formula model looks 몸무게 up by column name, rather than handing it a bare
# Series whose row/column orientation statsmodels would have to guess.
pred = model.predict(df[['몸무게']])

# Mean squared error between the observed 키 values and the in-sample
# predictions, computed with scikit-learn's helper.
mse = mean_squared_error(df['키'], pred)
mse

5.687113319442227

In [22]:
# Confidence interval for the 몸무게 coefficient at alpha = 0.05 (i.e. 95%);
# row label 0 is the lower bound and 1 the upper bound.
model.conf_int(alpha = 0.05).loc['몸무게']

0    0.716337
1    1.015351
Name: 몸무게, dtype: float64

In [24]:
# Predict 키 for a single new observation with 몸무게 = 50.
new_data = pd.DataFrame({"몸무게" : [50]})
pred = model.get_prediction(new_data)
# summary_frame returns one row per input with the predicted mean, its
# standard error, and two intervals at alpha = 0.05: mean_ci_* for the
# predicted mean and the wider obs_ci_* (per the statsmodels docs, the
# interval for an individual new observation).
result = pred.summary_frame(alpha = 0.05)
result

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,158.359833,0.794986,156.68963,160.030037,152.820798,163.898869


# 3. 다중 선형 회귀 분석

In [25]:
import pandas as pd
# Twenty observations of sales (매출액), advertising spend (광고비) and
# head count (직원수) for the multiple regression cells that follow.
data = {
    '매출액': [
        300, 320, 250, 360, 315,
        328, 310, 335, 326, 280,
        290, 300, 315, 328, 310,
        335, 300, 400, 500, 600,
    ],
    '광고비': [
        70, 75, 30, 80, 72,
        77, 70, 82, 70, 80,
        68, 90, 72, 77, 70,
        82, 40, 20, 75, 80,
    ],
    '직원수': [
        15, 16, 14, 20, 19,
        17, 16, 19, 15, 20,
        14, 5, 16, 17, 16,
        14, 30, 40, 10, 50,
    ],
}
df = pd.DataFrame.from_dict(data)

# Preview the first three rows.
df.head(3)

Unnamed: 0,매출액,광고비,직원수
0,300,70,15
1,320,75,16
2,250,30,14


In [26]:
from statsmodels.formula.api import ols
# Ordinary least squares with 매출액 as the response and two predictors,
# 광고비 and 직원수; the fitted results object is reused by the cells below.
model = ols(formula='매출액 ~ 광고비 + 직원수', data=df).fit()

# Full regression report as the cell's displayed value.
model.summary()

0,1,2,3
Dep. Variable:,매출액,R-squared:,0.512
Model:,OLS,Adj. R-squared:,0.454
Method:,Least Squares,F-statistic:,8.907
Date:,"Fri, 13 Jun 2025",Prob (F-statistic):,0.00226
Time:,21:00:39,Log-Likelihood:,-108.22
No. Observations:,20,AIC:,222.4
Df Residuals:,17,BIC:,225.4
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,101.0239,71.716,1.409,0.177,-50.284,252.331
광고비,1.8194,0.807,2.255,0.038,0.117,3.522
직원수,5.9288,1.430,4.147,0.001,2.912,8.945

0,1,2,3
Omnibus:,30.534,Durbin-Watson:,1.354
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64.655
Skew:,2.444,Prob(JB):,9.13e-15
Kurtosis:,10.327,Cond. No.,401.0


In [27]:
# Pairwise correlation between 광고비 and 매출액 on their own (no other
# predictor taken into account).
df['광고비'].corr(df['매출액'])

0.13316981737040343

In [28]:
from scipy import stats
# Same correlation via scipy, which also reports the p-value of the test.
stats.pearsonr(df['광고비'], df['매출액'])

PearsonRResult(statistic=0.13316981737040345, pvalue=0.5756778801904271)

In [29]:
# R-squared of the fitted multiple regression model.
model.rsquared

0.5116964327009041

In [30]:
# All fitted coefficients (intercept plus one per predictor) as a Series.
model.params

Intercept    101.023872
광고비            1.819427
직원수            5.928756
dtype: float64

In [33]:
# p-value of the 광고비 coefficient in the multiple regression.
model.pvalues['광고비']

0.03764350647696033

In [34]:
# Predict 매출액 for one new observation: 광고비 = 50 and 직원수 = 20.
# The column names must match the predictor names used in the formula.
new_data = pd.DataFrame({'광고비' : [50], '직원수' : [20]})
pred = model.predict(new_data)
pred

0    310.57033
dtype: float64

In [35]:
# Residual = observed 매출액 minus the model's fitted value for the same row.
df['잔차'] = df['매출액'] - model.predict(df)

# Sum of squared residuals. Series.sum() is the vectorised pandas reduction;
# the builtin sum() would iterate the Series one element at a time in Python.
(df['잔차'] ** 2).sum()

58686.17827156107

In [38]:
# Residuals of the fitted model: observed 매출액 minus predicted 매출액.
df['잔차'] = df['매출액'].sub(model.predict(df))

# Mean squared error = average of the squared residuals.
df['잔차'].pow(2).mean()

2934.3089135780533

In [39]:
# Confidence intervals for every coefficient at alpha = 0.05 (i.e. 95%);
# column 0 is the lower bound and column 1 the upper bound.
model.conf_int(alpha = 0.05)

Unnamed: 0,0,1
Intercept,-50.283684,252.331429
광고비,0.116785,3.522069
직원수,2.912406,8.945105


In [43]:
# Predict 매출액 for one new observation: 광고비 = 45 and 직원수 = 22.
new_data = pd.DataFrame({'광고비' : [45], '직원수' : [22]})
pred = model.get_prediction(new_data)
# summary_frame returns the predicted mean, its standard error, and two
# intervals at alpha = 0.05: mean_ci_* for the predicted mean and the wider
# obs_ci_* (per the statsmodels docs, the interval for an individual new
# observation).
result = pred.summary_frame(alpha =0.05)
result

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,313.330707,22.502058,265.855514,360.8059,180.58875,446.072663


# 4. 범주형 변수

In [44]:
import pandas as pd
# Load the study dataset from a CSV file in the current working directory
# and preview the first five rows. The displayed columns are study_hours,
# material_type (text labels) and score.
df = pd.read_csv('study.csv')
df.head()

Unnamed: 0,study_hours,material_type,score
0,71,강의,95
1,34,독학,63
2,91,도서,95
3,80,독학,80
4,40,강의,79


In [45]:
# material_type holds text labels, so the formula interface dummy-codes it:
# the summary lists 도서 and 독학 as contrasts, leaving 강의 as the baseline
# level absorbed into the intercept.
model = ols(formula='score ~ study_hours + material_type', data=df).fit()

# Full regression report as the cell's displayed value.
model.summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.969
Model:,OLS,Adj. R-squared:,0.968
Method:,Least Squares,F-statistic:,991.9
Date:,"Fri, 13 Jun 2025",Prob (F-statistic):,4.42e-72
Time:,21:34:47,Log-Likelihood:,-238.89
No. Observations:,100,AIC:,485.8
Df Residuals:,96,BIC:,496.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,59.2111,0.799,74.147,0.000,57.626,60.796
material_type[T.도서],-8.6696,0.678,-12.778,0.000,-10.016,-7.323
material_type[T.독학],-17.6129,0.634,-27.790,0.000,-18.871,-16.355
study_hours,0.4839,0.011,43.810,0.000,0.462,0.506

0,1,2,3
Omnibus:,1.754,Durbin-Watson:,2.173
Prob(Omnibus):,0.416,Jarque-Bera (JB):,1.216
Skew:,0.231,Prob(JB):,0.544
Kurtosis:,3.28,Cond. No.,228.0
