In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
df = pd.read_csv("./data/logistic_regression_01.csv")

In [3]:
df.head(2)

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3


In [4]:
df["rank"].unique()

array([3, 1, 4, 2], dtype=int64)

In [7]:
dummy_ranks = pd.get_dummies(df["rank"], prefix = "rank")
dummy_ranks

Unnamed: 0,rank_1,rank_2,rank_3,rank_4
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
395,0,1,0,0
396,0,0,1,0
397,0,1,0,0
398,0,1,0,0


In [9]:
df_bind = df.drop("rank", axis = 1)
df_bind.head(2)

Unnamed: 0,admit,gre,gpa
0,0,380,3.61
1,1,660,3.67


In [10]:
df_bind = df.drop("rank", axis = 1).join(dummy_ranks.loc[:, "rank_2"])
df_bind

Unnamed: 0,admit,gre,gpa,rank_2
0,0,380,3.61,0
1,1,660,3.67,0
2,1,800,4.00,0
3,1,640,3.19,0
4,0,520,2.93,0
...,...,...,...,...
395,0,620,4.00,1
396,0,560,3.04,0
397,0,460,2.63,1
398,0,700,3.65,1


In [12]:
train_cols = df_bind.columns[1:]
train_cols

Index(['gre', 'gpa', 'rank_2'], dtype='object')

In [15]:
df_bind = pd.get_dummies(df, columns=["rank"],
                        drop_first=True)
df_bind.head(2)

Unnamed: 0,admit,gre,gpa,rank_2,rank_3,rank_4
0,0,380,3.61,0,1,0
1,1,660,3.67,0,1,0


In [20]:
model = sm.Logit(df_bind["admit"], # 종속변수
                df_bind.drop(["admit","gpa"], axis = 1)) # 독립변수
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.589897
         Iterations 5


0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,Logit,Df Residuals:,396.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 12 Oct 2021",Pseudo R-squ.:,0.05612
Time:,15:27:19,Log-Likelihood:,-235.96
converged:,True,LL-Null:,-249.99
Covariance Type:,nonrobust,LLR p-value:,3.531e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gre,0.0007,0.000,1.836,0.066,-4.88e-05,0.001
rank_2,-1.0183,0.291,-3.501,0.000,-1.588,-0.448
rank_3,-1.6189,0.315,-5.146,0.000,-2.236,-1.002
rank_4,-1.9377,0.392,-4.949,0.000,-2.705,-1.170


In [21]:
pred_prob = result.predict(df_bind.drop(["admit","gpa"], axis = 1))
pred_prob.head()

0    0.206916
1    0.242177
2    0.640957
3    0.186325
4    0.173504
dtype: float64

In [22]:
pred = (pred_prob > 0.5) + 0
pred.head()

0    0
1    0
2    1
3    0
4    0
dtype: int32

In [23]:
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

In [24]:
fpr, tpr, thresholds = roc_curve(df_bind["admit"], pred_prob) # pred 가 아님
auc(fpr, tpr)

0.6698970321017566

In [25]:
pd.crosstab(df["admit"], pred)

col_0,0,1
admit,Unnamed: 1_level_1,Unnamed: 2_level_1
0,245,28
1,94,33


In [26]:
pred = (pred_prob > 0.2) + 0
pd.crosstab(df["admit"], pred)

col_0,0,1
admit,Unnamed: 1_level_1,Unnamed: 2_level_1
0,54,219
1,11,116
