In [None]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic dataset: four standard-normal features and a
# binary label driven by the first two features plus Gaussian noise.
np.random.seed(40)
n_samples = 210
feature_names = ['weight', 'height', 'age', 'income']

# Draw order matters for reproducibility: the feature matrix is sampled first,
# then the label noise.
X = np.random.randn(n_samples, len(feature_names))
label_noise = np.random.randn(n_samples) * 0.5
latent_score = X[:, 0] + X[:, 1] * 0.5 + label_noise
y = (latent_score > 0).astype(int)

# Assemble features and label into a single frame.
df = pd.DataFrame(X, columns=feature_names).assign(gender=y)

# Preview the first rows.
print(df.head())

     weight    height       age    income  gender
0 -0.607548 -0.126136 -0.684606  0.928715       0
1 -1.844401 -0.467002  2.292490  0.488810       0
2  0.710267  1.055534  0.054073  0.257953       0
3  0.588282  0.885244 -1.017007 -0.133693       1
4 -0.438186  0.493443 -0.199009 -1.274984       0


In [None]:
#1
import statsmodels.api as sm

# Logistic regression of gender on weight alone, with an intercept column
# prepended by add_constant.
y = df['gender']
X_weight = sm.add_constant(df['weight'])
logit_model_weight = sm.Logit(y, X_weight).fit()

print(logit_model_weight.summary())

# The odds ratio for weight is the exponential of its fitted coefficient.
weight_coef = logit_model_weight.params['weight']
odds_ratio_weight = np.exp(weight_coef)
print(f'weight의 오즈비: {odds_ratio_weight}')

Optimization terminated successfully.
         Current function value: 0.383761
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 gender   No. Observations:                  210
Model:                          Logit   Df Residuals:                      208
Method:                           MLE   Df Model:                            1
Date:                Mon, 09 Jun 2025   Pseudo R-squ.:                  0.4460
Time:                        12:47:09   Log-Likelihood:                -80.590
converged:                       True   LL-Null:                       -145.48
Covariance Type:            nonrobust   LLR p-value:                 4.599e-30
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0674      0.197      0.342      0.733      -0.319       0.454
weight         2.7193      0.

In [None]:
#2

# Logistic regression of gender on all four features, with an intercept column.
predictor_columns = ['weight', 'height', 'age', 'income']
X_all = sm.add_constant(df[predictor_columns])
logit_model_all = sm.Logit(y, X_all).fit()

# Residual deviance computed as -2 times the fitted log-likelihood (llf).
fitted_log_likelihood = logit_model_all.llf
residual_deviance = -2 * fitted_log_likelihood
print(f'Residual deviance : {residual_deviance.round(3)}')

Optimization terminated successfully.
         Current function value: 0.300367
         Iterations 8
Residual deviance : 126.154


In [None]:
#3

# 데이터 분할을 위한 패키지 불러오기
from sklearn.model_selection import train_test_split

# Split the frame into train and test partitions; test_size=90 is passed as an
# integer and random_state pins the shuffle.
df_train, df_test = train_test_split(df, test_size=90, random_state=42)

# Design matrices (intercept + weight) and responses (gender) for each split.
X_train, X_test = (sm.add_constant(part['weight']) for part in (df_train, df_test))
y_train, y_test = df_train['gender'], df_test['gender']

# Report the shape of each design matrix, train first.
for design in (X_train, X_test):
    print(design.shape)

# Fit the logistic model on the training split only.
logit_model_train = sm.Logit(y_train, X_train).fit()

from sklearn.metrics import accuracy_score

# Classify a test row as positive when its predicted probability exceeds 0.5.
test_probabilities = logit_model_train.predict(X_test)
y_pred = test_probabilities > 0.5

# Misclassification rate is the complement of the accuracy score.
test_accuracy = accuracy_score(y_test, y_pred)
error_Rate = 1 - test_accuracy
print(f'오분류율 : {round(error_Rate, 4)}')

(120, 2)
(90, 2)
Optimization terminated successfully.
         Current function value: 0.414800
         Iterations 7
오분류율 : 0.1444


In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset in pandas form and preview the
# combined frame.
diabetes = load_diabetes(as_frame=True)
df = diabetes['frame']
print(df.head())

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [6]:
#4

import statsmodels.api as sm

# Predictors: the first four columns of the frame, plus an intercept column
# prepended by add_constant.
X = df.iloc[:, 0:4]
X = sm.add_constant(X)

# Binarise the target: 1 when strictly above the median, otherwise 0.
y = (df['target'] > df['target'].median()).astype(int)

# Fit the logistic regression and print its summary table.
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())

# p_values keeps every fitted term, intercept included, so it can still be
# filtered freely afterwards.
p_values = logit_model.pvalues

# Count only real predictors: the intercept ('const') is not an explanatory
# variable, so it is excluded before applying the 0.05 threshold.
# errors='ignore' keeps this safe if no 'const' term is present.
predictor_p_values = p_values.drop('const', errors='ignore')
non_significant_vars = predictor_p_values[predictor_p_values >= 0.05]
num_non_significant_vars = len(non_significant_vars)

print(f'유의하지 않은 변수의 수 : {num_non_significant_vars}')

Optimization terminated successfully.
         Current function value: 0.543957
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  442
Model:                          Logit   Df Residuals:                      437
Method:                           MLE   Df Model:                            4
Date:                Tue, 10 Jun 2025   Pseudo R-squ.:                  0.2152
Time:                        08:38:28   Log-Likelihood:                -240.43
converged:                       True   LL-Null:                       -306.37
Covariance Type:            nonrobust   LLR p-value:                 1.540e-27
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0462      0.112      0.413      0.680      -0.173       0.266
age            1.1309      2.

In [11]:
#5
# Keep the terms whose p-value is below 0.05, and leave the intercept out of
# the name list (add_constant re-adds it below).
significant_vars = p_values.loc[p_values < 0.05]
significant_var_names = significant_vars.index.drop('const', errors='ignore')

# Refit using only those predictors plus an intercept column.
X_significant = sm.add_constant(X[significant_var_names])
logit_model_significant = sm.Logit(y, X_significant).fit()

# Mean over all fitted parameters of the reduced model.
# NOTE(review): params contains the 'const' term as well, so the intercept is
# part of this average — confirm that is the intended quantity.
refit_params = logit_model_significant.params
significant_coef_mean = refit_params.mean()
print(f'유의한 변수들만 사용 시 회귀계수들의 평균: {significant_coef_mean}')

Optimization terminated successfully.
         Current function value: 0.548382
         Iterations 6
유의한 변수들만 사용 시 회귀계수들의 평균: 11.09340765301607


In [12]:
#6

# Coefficient of the 'age' predictor from the fitted logistic model.
# (Named coef_age: the value read is params['age'], not the bmi coefficient.)
coef_age = logit_model.params['age']

# Odds ratio for a delta_x increase in age: exp(coefficient * delta_x).
delta_x = 1
odds_ratio = np.exp(coef_age * delta_x)
print(f'age  변수가 1단위 증가할 때 오즈비: {odds_ratio}')

age  변수가 1단위 증가할 때 오즈비: 3.098369923209826
