In [None]:
import torch
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import matplotlib.font_manager as fm
from sklearn.preprocessing import StandardScaler


# Work around the minus sign being rendered with a broken glyph in plots.
import matplotlib as mpl
mpl.rcParams.update({'axes.unicode_minus': False})

# GPU setup: Runtime > Change runtime type > GPU
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Load the "MedGPA" dataset of the "Stat2Data" package through statsmodels'
# R-dataset helper and keep its data frame.
data_med = sm.datasets.get_rdataset("MedGPA", package="Stat2Data")
df_gpa = data_med.data

# Inspect the distinct values of every column: one printed line per column
# (positional index, unique values). A plain loop is used because print() is
# called purely for its side effect; a list comprehension would only build a
# throwaway list of None.
for i in range(len(df_gpa.columns)):
    print(i, df_gpa.iloc[:, i].unique())

# Preview the first rows; being the last expression of the cell, it is rendered.
df_gpa.head()

In [None]:
## Plot: jittered horizontal strip plot of GPA for each Acceptance value (1 listed first)
strip_ax = sns.stripplot(
    data=df_gpa,
    x="GPA",
    y="Acceptance",
    order=[1, 0],
    orient="h",
    jitter=True,
)
strip_ax.grid(True)
plt.show()

In [None]:
## Logistic regression analysis
y = "Acceptance"
# Regressors: every column of df_gpa except Accept, Acceptance, MCAT and Prediction.
x = df_gpa.columns.difference(['Accept', 'Acceptance', 'MCAT', "Prediction"])
gpa_model = sm.Logit.from_formula(f"Acceptance ~ {'+'.join(x)}", data=df_gpa)
# disp=0 silences the optimizer's convergence printout.
gpa_result = gpa_model.fit(disp=0)
print(gpa_result.summary())


In [None]:
## Logistic regression analysis after re-selecting variables: GPA is the only regressor
gpa_model2 = sm.Logit.from_formula(formula="Acceptance ~ GPA", data=df_gpa)
gpa_result2 = gpa_model2.fit(disp=0)  # disp=0: no convergence printout
print(gpa_result2.summary())


In [None]:
# Distribution of the predictions of the GPA-only model
# Store the predictions on the frame; the box-plot cell reads this column.
df_gpa["Prediction"] = gpa_result2.predict(df_gpa)

# sns.distplot is deprecated. With hist=False it only drew the KDE curve, so
# sns.kdeplot is the direct replacement. All three curves share one Axes.
fig, ax = plt.subplots()
sns.kdeplot(df_gpa["Prediction"], color="b", label="all", ax=ax)
sns.kdeplot(
    df_gpa.loc[df_gpa["Acceptance"] == 1, "Prediction"],
    color="r",
    label="Acceptance = 1",
    ax=ax,
)
sns.kdeplot(
    df_gpa.loc[df_gpa["Acceptance"] == 0, "Prediction"],
    color="m",
    label="Acceptance = 0",
    ax=ax,
)
ax.set_title("Prediction Distribution")
ax.set_xlabel("Prediction")
# The curves differ only by colour, so a legend is needed to tell them apart.
ax.legend()
plt.show()

In [None]:
# Box plot of the stored Prediction column, grouped by the Acceptance value.
sns.boxplot(
    data=df_gpa,
    y="Prediction",
    x="Acceptance",
)
plt.show()

In [None]:
# Observed outcome, kept as a one-column DataFrame (the metric cells below rely
# on that shape). Selected by label instead of by position so that a change in
# column order cannot silently pick a different column.
y = df_gpa[["Acceptance"]]

fig, ax = plt.subplots()
ax.scatter(df_gpa["GPA"], y, lw=2, label="data")
# NOTE(review): for a statsmodels Logit result, fittedvalues is the linear
# predictor rather than a probability; the 0.1 factor presumably just rescales
# it to share the axis with the 0/1 outcomes -- confirm the intent.
ax.plot(df_gpa["GPA"], gpa_result2.fittedvalues * 0.1, label="func value")
ax.legend()
plt.show()

In [None]:
### Performance of the logistic regression model
from sklearn.metrics import log_loss

# Log loss of the fitted model; normalize=False returns the sum over observations.
y_hat = gpa_result2.predict(df_gpa["GPA"])
print("log_loss : ", log_loss(y, y_hat, normalize=False))

# Null model: a single constant equal to the mean of the outcome column.
# y.sum() reduces column-wise and returns a one-element Series; np.sum(y) is
# avoided because its axis=None reduction on a DataFrame is deprecated in pandas.
mu_null = y.sum() / len(y)
print("mu_null : ", mu_null.values)

# Log loss obtained with the null model.
# mu_null is indexed by column label, so its single value is read with .iloc[0];
# mu_null[0] would rely on deprecated positional fallback.
y_null = np.full(y.shape, mu_null.iloc[0])
print("log_loss with mu_null: ", log_loss(y, y_null, normalize=False))

# McFadden pseudo R-squared: 1 - (model log loss / null-model log loss).
# The ratio alone is not the pseudo R-squared; it shrinks as the fit improves.
McFadden_R = 1 - (log_loss(y, y_hat) / log_loss(y, y_null))
print("McFadden_R : ", McFadden_R)

In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the GPA-only model: its predictions are scored against the observed outcome y.
gpa_scores = gpa_result2.predict(df_gpa["GPA"])
fpr, tpr, thresholds = roc_curve(y_true=y, y_score=gpa_scores)

# False positive rate on the x axis, true positive rate on the y axis.
plt.plot(fpr, tpr)
plt.show()
