# 1. 단일 표본 검정

In [1]:
import pandas as pd

# 25 caffeine measurements (mg) used for the one-sample tests below.
# A stray non-ASCII character after the closing bracket made this cell a
# SyntaxError; it has been removed.
df = pd.DataFrame({
    'Caffeine(mg)' : [
        94.2, 93.7, 95.5, 93.9, 94.0, 95.2, 94.7, 93.5, 92.8, 94.4,
        93.8, 94.6, 93.3, 95.1, 94.3, 94.9, 93.9, 94.8, 95.0, 94.2,
        93.7, 94.4, 95.1, 94.0, 93.6
    ]
})

In [3]:
# Column-wise sample mean of the measurements.
sample_mean = df.mean()
sample_mean

Caffeine(mg)    94.264
dtype: float64

In [10]:
from scipy import stats

# Shapiro-Wilk normality check on the caffeine column before running the t-test.
caffeine = df['Caffeine(mg)']
stats.shapiro(caffeine)

ShapiroResult(statistic=0.9826574921607971, pvalue=0.9321980476379395)

In [13]:
from scipy import stats

# One-sample t-test against a population mean of 95.
# alternative='less' tests whether the sample mean is below that value.
stats.ttest_1samp(
    df['Caffeine(mg)'],
    95,
    alternative='less',
)

TtestResult(statistic=-5.501737036221897, pvalue=5.8686553916715e-06, df=24)

In [14]:
# Repeat the one-sided test, unpack (statistic, p-value), and print the
# p-value in fixed-point notation instead of scientific notation.
one_sample_result = stats.ttest_1samp(df['Caffeine(mg)'], 95, alternative='less')
stat, pvalue = one_sample_result
print("{:.10f}".format(pvalue))

0.0000058687


# 2. 독립 표본 검정

In [15]:
import pandas as pd

# Two independent groups of 10 charging-time measurements each:
# the first 10 rows are labelled 'New', the last 10 'Old'.
charger_labels = ['New'] * 10 + ['Old'] * 10
charging_times = [
    1.5, 1.6, 1.4, 1.7, 1.5, 1.6, 1.7, 1.4, 1.6, 1.5,
    1.7, 1.8, 1.7, 1.9, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6,
]
df = pd.DataFrame({'충전기': charger_labels, '충전시간': charging_times})
df.head()

Unnamed: 0,충전기,충전시간
0,New,1.5
1,New,1.6
2,New,1.4
3,New,1.7
4,New,1.5


In [19]:
from scipy import stats

# Boolean row masks selecting each charger group.
df_new = df['충전기'] == 'New'
df_old = df['충전기'] == 'Old'

# Independent two-sample t-test (pooled variance) with the one-sided
# alternative that the 'New' group mean is less than the 'Old' group mean.
new_times = df.loc[df_new, '충전시간']
old_times = df.loc[df_old, '충전시간']
stats.ttest_ind(new_times, old_times, alternative='less', equal_var=True)

Ttest_indResult(statistic=-4.582575694955849, pvalue=0.00011546547787696304)

# 3. 대응 표본 검정

In [1]:
import pandas as pd

# Paired measurements for users 1..10: one value per user under each method.
user_ids = list(range(1, 11))
old_method = [60.4, 60.7, 60.5, 60.3, 60.8, 60.6, 60.2, 60.5, 60.7, 60.4]
new_method = [59.8, 60.2, 60.1, 59.9, 59.7, 58.4, 57.0, 60.3, 59.6, 59.8]
df = pd.DataFrame({
    'User': user_ids,
    '기존방법': old_method,
    '새로운방법': new_method,
})
df.head(2)

Unnamed: 0,User,기존방법,새로운방법
0,1,60.4,59.8
1,2,60.7,60.2


In [2]:
# Per-user difference (new method minus existing method), stored as a new
# column, followed by its mean.
df['diff'] = df['새로운방법'].sub(df['기존방법'])
df['diff'].mean()

-1.0300000000000005

In [3]:
from scipy import stats

# Paired t-test on the two related samples; alternative='less' tests whether
# the new-method values are lower than the existing-method values.
new_values = df['새로운방법']
old_values = df['기존방법']
stats.ttest_rel(new_values, old_values, alternative='less')

TtestResult(statistic=-3.407973078114844, pvalue=0.0038872633380070652, df=9)

# 4. 일원 분산 분석

In [4]:
import pandas as pd

# Load the scores data from the working directory and preview the first rows.
df = pd.read_csv('math.csv')
df.head(5)

Unnamed: 0,groups,scores
0,group_A,85
1,group_A,88
2,group_A,90
3,group_A,82
4,group_A,87


In [6]:
from scipy import stats

# Boolean row masks, one per group; kept as named variables because the
# Levene cell further down reuses them.
df_a = df['groups'] == 'group_A'
df_b = df['groups'] == 'group_B'
df_c = df['groups'] == 'group_C'
df_d = df['groups'] == 'group_D'

# Shapiro-Wilk normality check for each group's scores, printed in A-D order.
for group_mask in (df_a, df_b, df_c, df_d):
    print(stats.shapiro(df.loc[group_mask, 'scores']))

ShapiroResult(statistic=0.9715898036956787, pvalue=0.9051811695098877)
ShapiroResult(statistic=0.9499422907829285, pvalue=0.6678178906440735)
ShapiroResult(statistic=0.9299424290657043, pvalue=0.44732627272605896)
ShapiroResult(statistic=0.9065684080123901, pvalue=0.2582412362098694)


In [8]:
from scipy import stats

# Levene's test for equal variances across the four groups.
scores_by_group = [df.loc[mask, 'scores'] for mask in (df_a, df_b, df_c, df_d)]
stats.levene(*scores_by_group)

LeveneResult(statistic=1.757685352622062, pvalue=0.17270284963232105)

In [11]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# One-way ANOVA: regress scores on the group factor, then show the ANOVA table.
one_way_spec = ols('scores~ groups', data=df)
model = one_way_spec.fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
groups,3.0,411.8,137.266667,34.174274,1.240642e-10
Residual,36.0,144.6,4.016667,,


# 5. 이원 분산 분석

In [12]:
import pandas as pd

# Load the yield data from the working directory and preview the first rows.
df = pd.read_csv('tomato2.csv')
df.head(5)

Unnamed: 0,비료유형,물주기,수확량
0,A,1,514
1,A,1,480
2,A,1,507
3,A,2,452
4,A,2,526


In [15]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Two-way ANOVA. In the formula, `*` requests both main effects plus their
# interaction, and C(...) makes the integer-coded column a categorical factor.
two_way_formula = '수확량 ~ 비료유형 * C(물주기)'
model = ols(two_way_formula, data=df).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
비료유형,2.0,5251.722222,2625.861111,3.184685,0.059334
C(물주기),3.0,9057.0,3019.0,3.66149,0.02646
비료유형:C(물주기),6.0,4271.833333,711.972222,0.863491,0.535426
Residual,24.0,19788.666667,824.527778,,


# 6. 적합도 검정

In [20]:
# Share represented by a count of 30 out of a total of 1000.
30 / 1000

0.03

In [19]:
from scipy.stats import chisquare

# Chi-square goodness-of-fit test: observed counts vs. expected counts.
# Both lists sum to 1000, as chisquare requires matching totals.
observed_counts = [550, 250, 100, 70, 30]
expected_counts = [600, 250, 80, 50, 20]
chisquare(observed_counts, expected_counts)

Power_divergenceResult(statistic=22.166666666666668, pvalue=0.00018567620386641427)

# 7. 독립성 검정

In [26]:
from scipy.stats import chi2_contingency

# 2x2 contingency table given directly as nested lists (rows x columns).
df = [
    [50, 30],
    [60, 40],
]
chi2_contingency(df)

Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))

In [27]:
import pandas as pd

# Long-format version of the same 2x2 table: one row per person, with the
# camp label and whether they registered.
camp_labels = ['빅분기'] * 80 + ['정처기'] * 100
registration = (
    ['등록'] * 50 + ['등록안함'] * 30
    + ['등록'] * 60 + ['등록안함'] * 40
)
df = pd.DataFrame({'캠프': camp_labels, '등록여부': registration})

In [29]:
from scipy.stats import chi2_contingency

# Build the contingency table under its own name instead of overwriting `df`.
# Reassigning `df` to the crosstab made this cell fail with a KeyError when
# re-run, because the '캠프' / '등록여부' columns no longer existed.
contingency = pd.crosstab(df['캠프'], df['등록여부'])
chi2_contingency(contingency)

Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))

# 8. 다중 선형 회귀

In [34]:
import pandas as pd

# 30 observations with three predictors and the response (주문량).
discount_rate = [
    28, 24, 13, 0, 27, 30, 10, 16, 6, 5, 7, 11, 11, 30, 25,
    4, 7, 24, 19, 21, 6, 10, 26, 13, 15, 6, 12, 6, 20, 2,
]
temperature = [
    15, 34, 15, 22, 29, 30, 14, 17, 28, 29, 19, 19, 34, 10,
    29, 28, 12, 25, 32, 28, 22, 16, 30, 11, 16, 18, 16, 33, 12, 22,
]
ad_spend = [
    342, 666, 224, 764, 148, 499, 711, 596, 797, 484, 986, 347, 146, 362, 642,
    591, 846, 260, 560, 941, 469, 309, 730, 305, 892, 147, 887, 526, 525, 884,
]
orders = [
    635, 958, 525, 25, 607, 872, 858, 732, 1082, 863, 904, 686, 699, 615, 893,
    830, 856, 679, 918, 951, 789, 583, 988, 631, 866, 549, 910, 946, 647, 943,
]
df = pd.DataFrame({
    '할인율': discount_rate,
    '온도': temperature,
    '광고비': ad_spend,
    '주문량': orders,
})

# Pearson correlation between the two predictors, rounded to 2 decimals.
round(df['할인율'].corr(df['온도']), 2)

0.09

In [38]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Multiple linear regression of 주문량 on the three predictors;
# the cell displays R-squared rounded to 2 decimals.
regression_formula = '주문량 ~ 할인율 + 온도 + 광고비'
model = ols(regression_formula, data=df).fit()
round(model.rsquared, 2)

0.4

In [43]:
# Fitted coefficients (intercept and slopes), rounded to 4 decimals.
model.params.round(4)

Intercept    267.6609
할인율            4.2068
온도             9.4798
광고비            0.4148
dtype: float64

In [44]:
# Intercept of the fitted regression, rounded to 4 decimals.
intercept = model.params['Intercept']
round(intercept, 4)

267.6609

In [45]:
# p-value of the 온도 coefficient, rounded to 4 decimals.
temperature_pvalue = model.pvalues['온도']
round(temperature_pvalue, 4)

0.0289

In [48]:
# Predict 주문량 for one new observation and truncate it to an integer.
new_data = pd.DataFrame({'할인율' : [10], '온도' : [20], '광고비' : [500]})
# predict() returns a one-element Series. Calling int() on the Series itself
# is deprecated in pandas and emits a warning, so extract the scalar first.
int(model.predict(new_data).iloc[0])

  int(model.predict(new_data))


706

In [49]:
# Residuals (actual minus fitted) stored as a column, then the sum of squared
# errors rounded to 2 decimals. Series.sum() is the vectorised equivalent of
# the builtin sum(), which iterates the Series element by element in Python.
df['diff'] = df['주문량'] - model.predict(df)
round((df['diff'] ** 2).sum(), 2)

732197.9

In [51]:
# Recompute the residuals (actual minus fitted) and take the mean of their
# squares, i.e. the in-sample mean squared error.
residuals = df['주문량'] - model.predict(df)
df['diff'] = residuals
(residuals ** 2).mean()

24406.596627140243

In [55]:
# Coefficient confidence intervals; alpha=0.1 gives 90% intervals.
coef_intervals = model.conf_int(alpha=0.1)
coef_intervals

Unnamed: 0,0,1
Intercept,45.95572,489.366084
할인율,-1.847229,10.260887
온도,2.490702,16.468984
광고비,0.201064,0.628589


In [56]:
# Prediction for one new observation, with a summary frame holding the mean
# and the confidence / prediction bounds at alpha=0.1 (90%).
new_data = pd.DataFrame([{'할인율': 15, '온도': 25, '광고비': 300}])
pred = model.get_prediction(new_data)
result = pred.summary_frame(alpha=0.1)
result

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,692.207386,45.555397,614.507283,769.907488,395.622293,988.792478


In [57]:
# Unrounded p-value of the 광고비 coefficient.
ad_spend_pvalue = model.pvalues['광고비']
ad_spend_pvalue

0.0027398053085787655

# 9. 로지스틱 회귀

In [18]:
import pandas as pd

# Load the data and split it by row position into a first half (a) and a
# second half (b); the model cells below fit on `a` and score on `b`.
df = pd.read_csv('customer_travel.csv')
mid_point = len(df) // 2
a, b = df.iloc[:mid_point], df.iloc[mid_point:]
a.shape, b.shape

((400, 5), (400, 5))

In [19]:
# `logit` was never imported in this notebook, so this cell raised NameError
# on a fresh kernel; import it here where it is first used.
from statsmodels.formula.api import logit

# Logistic regression of target on all four predictors, fitted on the first
# half of the data (`a`).
model = logit('target ~ age + service + social + booked', data = a).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.527521
         Iterations 6


0,1,2,3
Dep. Variable:,target,No. Observations:,400.0
Model:,Logit,Df Residuals:,395.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 15 Jun 2025",Pseudo R-squ.:,0.05254
Time:,12:39:02,Log-Likelihood:,-211.01
converged:,True,LL-Null:,-222.71
Covariance Type:,nonrobust,LLR p-value:,0.0001052

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.3314,1.204,1.937,0.053,-0.028,4.691
age,-0.1043,0.038,-2.781,0.005,-0.178,-0.031
service,0.0452,0.079,0.572,0.567,-0.110,0.200
social,0.1920,0.247,0.779,0.436,-0.291,0.675
booked,-0.9542,0.272,-3.512,0.000,-1.487,-0.422


In [20]:
# `logit` is not imported anywhere else in this notebook; import it here so
# the cell does not depend on leftover kernel state.
from statsmodels.formula.api import logit

# Reduced logistic regression keeping only age and booked, fitted on the
# first half of the data (`a`).
model = logit('target ~ age + booked', data = a).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.528581
         Iterations 6


0,1,2,3
Dep. Variable:,target,No. Observations:,400.0
Model:,Logit,Df Residuals:,397.0
Method:,MLE,Df Model:,2.0
Date:,"Sun, 15 Jun 2025",Pseudo R-squ.:,0.05064
Time:,12:39:02,Log-Likelihood:,-211.43
converged:,True,LL-Null:,-222.71
Covariance Type:,nonrobust,LLR p-value:,1.265e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.4581,1.184,2.076,0.038,0.137,4.779
age,-0.1025,0.037,-2.752,0.006,-0.176,-0.030
booked,-0.9461,0.267,-3.542,0.000,-1.470,-0.423


In [21]:
# -2 times the fitted model's log-likelihood.
log_likelihood = model.llf
-2 * log_likelihood

422.8647650289116

In [22]:
import numpy as np

# Odds multiplier for a 3-unit increase in `booked`: exp(3 * coefficient).
booked_coef = model.params['booked']
np.exp(booked_coef * 3)

0.05853312291771145

In [23]:
# Sum of the coefficients whose p-value is below 0.05. The mask is taken over
# all fitted parameters, so the intercept is included when it qualifies.
is_significant = model.pvalues < 0.05
model.params[is_significant].sum()

1.409468270586194

In [28]:
from sklearn.metrics import accuracy_score

# Score the second half (`b`): predicted probabilities above 0.5 become class 1,
# everything else class 0, then compare against the true labels.
predicted_prob = model.predict(b)
pred = (predicted_prob > 0.5).astype(int)
accuracy_score(b['target'], pred)

0.765

In [29]:
# Error rate = 1 - accuracy. Computed from the predictions made in the
# accuracy cell rather than from a hand-copied 0.765, so it stays correct if
# the data or model changes.
1 - accuracy_score(b['target'], pred)

0.235