In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

PROC_DIR = Path("../data/processed")

# Processed feature matrices and "status" targets; in every CSV the first
# column is read back as the row index.
X_train = pd.read_csv(PROC_DIR / "X_train.csv", index_col=0)
X_test  = pd.read_csv(PROC_DIR / "X_test.csv", index_col=0)
y_train = pd.read_csv(PROC_DIR / "y_train.csv", index_col=0)["status"]
y_test  = pd.read_csv(PROC_DIR / "y_test.csv", index_col=0)["status"]

# Fail early with a readable message if the processed files are out of sync.
# Later cells multiply the coefficient vector with raw X_test rows, so the
# test columns must be in exactly the same order as the training columns.
if len(X_train) != len(y_train) or len(X_test) != len(y_test):
    raise ValueError(
        "Feature/target row counts differ: "
        f"train {len(X_train)} vs {len(y_train)}, "
        f"test {len(X_test)} vs {len(y_test)}"
    )
if list(X_train.columns) != list(X_test.columns):
    raise ValueError("X_train and X_test do not have identical columns in the same order")

# No fitted model is loaded from disk in this notebook, so the estimator is
# always refit here (presumably with the same settings as Note 02 -- TODO
# confirm). random_state is fixed for reproducibility.
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)


In [2]:
# Pull the fitted parameters out of the model: a flat weight vector `w`
# (one entry per feature column) and the scalar intercept `b`.
w = np.ravel(logreg.coef_)
b = float(logreg.intercept_[0])

# Quick look at the parameters: bias, vector length, and the first few weights.
print("Intercept (b):", b)
print("Vector w có", w.size, "phần tử")
print("5 hệ số đầu:", w[:5])


Intercept (b): 1.6319408686545231
Vector w có 21 phần tử
5 hệ số đầu: [-0.50073478  0.50103346 -0.34395424  0.34425292  0.12939979]


In [3]:
# Column names after one-hot encoding (per the original note, saved by
# notebook 01), stored one name per line.
feat_file = PROC_DIR / "feature_names.txt"
if feat_file.exists():
    # The context manager closes the file handle deterministically; blank
    # lines are skipped so they cannot turn into empty feature names.
    with feat_file.open("r", encoding="utf-8") as fh:
        feature_names = [ln.strip() for ln in fh if ln.strip()]
else:
    # No names file available: fall back to positional placeholders f0, f1, ...
    feature_names = [f"f{i}" for i in range(len(w))]

# A stale names file would otherwise surface as an opaque pandas length error
# (or, worse, mislabel coefficients if the counts happened to match by luck).
if len(feature_names) != len(w):
    raise ValueError(
        f"{feat_file} provides {len(feature_names)} feature names, "
        f"but the model has {len(w)} coefficients"
    )

# One row per feature: raw weight and exp(weight), i.e. the multiplicative
# change in the odds for a one-unit increase of that feature.
coef_df = pd.DataFrame({
    "feature": feature_names,
    "w": w,
    "odds_ratio": np.exp(w)
}).sort_values("w", ascending=False)

display(coef_df.head(10))   # strongest odds-increasing features
display(coef_df.tail(10))   # strongest odds-decreasing features


Unnamed: 0,feature,w,odds_ratio
16,ssc_p,1.852987,6.378843
18,degree_p,1.095345,2.990214
17,hsc_p,0.904111,2.469735
13,workex_Yes,0.846709,2.33196
9,degree_t_Comm&Mgmt,0.644774,1.905557
1,gender_M,0.501033,1.650426
3,ssc_b_Others,0.344253,1.410935
6,hsc_s_Arts,0.309199,1.362334
4,hsc_b_Central,0.1294,1.138145
14,specialisation_Mkt&Fin,0.05736,1.059037


Unnamed: 0,feature,w,odds_ratio
15,specialisation_Mkt&HR,-0.057062,0.944536
10,degree_t_Others,-0.088653,0.915163
5,hsc_b_Others,-0.129101,0.878885
8,hsc_s_Science,-0.146146,0.864031
7,hsc_s_Commerce,-0.162755,0.8498
2,ssc_b_Central,-0.343954,0.708961
0,gender_F,-0.500735,0.606085
11,degree_t_Sci&Tech,-0.555823,0.5736
12,workex_No,-0.84641,0.428952
20,mba_p,-0.9611,0.382472


In [18]:
# Take one arbitrary student from X_test and reproduce the model's forward
# pass by hand: linear score first, then the logistic (sigmoid) function.
sample_pos = 5
x0 = X_test.iloc[sample_pos].to_numpy()
z0 = b + np.dot(w, x0)            # log-odds: w . x + b
p0 = 1.0 / (1.0 + np.exp(-z0))    # sigmoid maps log-odds to a probability

print("Log-odds (z):", z0)
print("Xác suất Placed:", p0)
print("Label thật:", y_test.iloc[sample_pos])


Log-odds (z): 0.3181390009593794
Xác suất Placed: 0.5788706455901212
Label thật: 0


In [5]:
# Predicted probability of the second class (column 1 of predict_proba)
# for every row of each split.
proba_train = logreg.predict_proba(X_train)
proba_test = logreg.predict_proba(X_test)
y_score_train = proba_train[:, 1]
y_score_test = proba_test[:, 1]

# Cross-entropy of those probabilities against the true labels, per split.
loss_train = log_loss(y_train, y_score_train)
loss_test = log_loss(y_test, y_score_test)
print("Log loss (train):", loss_train)
print("Log loss (test) :", loss_test)


Log loss (train): 0.22860266486709327
Log loss (test) : 0.2988325164837325
