Mục đích: nạp thư viện, đặt seed, khai báo đường dẫn dữ liệu (tạo thư mục processed nếu chưa có) và cấu hình hiển thị

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Reproducibility: a single seed, applied to NumPy's global RNG here and
# reused wherever a random_state is needed.
SEED = 42
np.random.seed(SEED)

# Input CSV and output folder, both relative to the notebook's working directory.
RAW_PATH = Path("../data/raw/Placement_Data_Full_Class.csv")
PROC_DIR = Path("../data/processed")
# Create the output folder together with any missing parents; do nothing if it exists.
PROC_DIR.mkdir(exist_ok=True, parents=True)

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None


Mục đích: đọc CSV gốc, tạo bản sao để xử lý

In [2]:
# Read the source CSV once and keep that frame untouched as a reference.
df_raw = pd.read_csv(RAW_PATH)
# Every later step works on an independent (deep) copy.
df = df_raw.copy(deep=True)

print("Kích thước dữ liệu:", df.shape)
df.head(5)


Kích thước dữ liệu: (215, 15)


Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


Mục đích: map status → nhị phân (Placed = 1, Not Placed = 0) để tạo y, và loại các cột không đưa vào X (sl_no, salary, status)

Lý do: làm classification Placed/Not Placed; salary chỉ có cho Placed nên loại khỏi X để tránh rò rỉ thông tin.

In [3]:
# The target column must exist before anything else is attempted.
assert "status" in df.columns, "Thiếu cột target 'status' trong dữ liệu."

# Encode the target as 1 (Placed) / 0 (Not Placed); any other value maps to NaN
# and is caught by the assertion below.
status_to_label = {"Placed": 1, "Not Placed": 0}
y = df["status"].map(status_to_label)
assert not y.isnull().any(), "Target có giá trị không map được. Kiểm tra lại giá trị 'status'."

# Collect the columns to exclude from X, skipping any that are absent from df.
drop_cols = []
for col in ("sl_no", "salary", "status"):
    if col in df.columns:
        drop_cols.append(col)
X = df.drop(drop_cols, axis=1)

print("X shape:", X.shape, "| y shape:", y.shape)
X.head(5)


X shape: (215, 12) | y shape: (215,)


Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5


Mục đích: tách nhóm biến để xây dựng pipeline phù hợp

In [4]:
# Split the feature columns of X into two groups for the preprocessing pipelines.
#
# Columns are classified by "is the dtype numeric?" rather than by
# `dtype == "object"`: text columns may be stored as pandas `string`/`str` or
# `category` dtypes, none of which compare equal to "object". Such columns
# would otherwise be sent to the numeric pipeline (median imputation +
# scaling), which cannot process them.
# Column order within each group follows the order of X.columns.
numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
categorical_cols = [c for c in X.columns if c not in numeric_cols]

print("Categorical:", categorical_cols)
print("Numeric:", numeric_cols)


Categorical: ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
Numeric: ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']


Mục đích: tách dữ liệu với tỉ lệ lớp được giữ nguyên

In [5]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions of both parts close to those of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

print("Train size:", X_train.shape, "| Test size:", X_test.shape)
# Report the class proportions of each part, rounded to three decimals.
for split_title, split_target in (
    ("Tỉ lệ lớp train:\n", y_train),
    ("Tỉ lệ lớp test:\n", y_test),
):
    print(split_title, split_target.value_counts(normalize=True).round(3))


Train size: (172, 12) | Test size: (43, 12)
Tỉ lệ lớp train:
 status
1    0.686
0    0.314
Name: proportion, dtype: float64
Tỉ lệ lớp test:
 status
1    0.698
0    0.302
Name: proportion, dtype: float64


Mục đích: impute + encode (categorical) & impute + scale (numeric) trong một ColumnTransformer

In [6]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array, keeping every category (drop=None).
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(drop=None, handle_unknown="ignore", sparse_output=False)),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Numeric branch: fill missing values with the median, then standardise.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Route each column group to its branch; columns in neither group are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, categorical_cols),
        ("num", num_pipeline, numeric_cols),
    ],
    remainder="drop",
)


Mục đích: áp dụng pipeline, thu được ma trận đã xử lý và tên cột sau OHE

In [7]:
# Learn the preprocessing parameters on the training part only, then apply
# the fitted transformer to both parts.
X_train_proc = preprocess.fit_transform(X_train)
X_test_proc = preprocess.transform(X_test)

# Rebuild the output column names: one-hot columns first (named
# <col>_<category>), followed by the numeric columns, matching the order of
# the transformers inside `preprocess`.
if categorical_cols:
    ohe = preprocess.named_transformers_["cat"].named_steps["ohe"]
    cat_feature_names = ohe.get_feature_names_out(categorical_cols).tolist()
else:
    cat_feature_names = []

num_feature_names = numeric_cols
feature_names = [*cat_feature_names, *num_feature_names]

# Wrap the arrays in DataFrames (keeping the original row index) so they are
# easy to inspect and save.
X_train_df = pd.DataFrame(data=X_train_proc, index=X_train.index, columns=feature_names)
X_test_df = pd.DataFrame(data=X_test_proc, index=X_test.index, columns=feature_names)

print("Processed shapes:", X_train_df.shape, X_test_df.shape)
X_train_df.head(5)


Processed shapes: (172, 21) (43, 21)


Unnamed: 0,gender_F,gender_M,ssc_b_Central,ssc_b_Others,hsc_b_Central,hsc_b_Others,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,ssc_p,hsc_p,degree_p,etest_p,mba_p
147,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.260729,0.69412,-0.166768,0.778647,0.000789
161,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.065759,-1.405431,-1.192126,-1.123471,0.074478
169,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,-0.664128,-2.212388,-0.67808,-1.359642,0.536711
131,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.712104,-0.857722,-0.576911,0.553721,-0.250425
27,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.384091,0.055126,-0.030053,-0.345979,-0.767924


Mục đích: lưu bộ train/test đã tiền xử lý để notebook 02 dùng ngay

In [8]:
# Output locations inside the processed-data folder.
X_train_path = PROC_DIR / "X_train.csv"
X_test_path = PROC_DIR / "X_test.csv"
y_train_path = PROC_DIR / "y_train.csv"
y_test_path = PROC_DIR / "y_test.csv"
featnames_path = PROC_DIR / "feature_names.txt"

# Feature matrices: written with their row index.
for features_frame, features_path in (
    (X_train_df, X_train_path),
    (X_test_df, X_test_path),
):
    features_frame.to_csv(features_path, index=True)

# Targets: written with their row index under the column header "status".
for target_series, target_path in (
    (y_train, y_train_path),
    (y_test, y_test_path),
):
    target_series.to_csv(target_path, index=True, header=["status"])

# Feature names: one per line, in the same order as the matrix columns.
with open(featnames_path, "w", encoding="utf-8") as f:
    f.writelines(name + "\n" for name in feature_names)

print("Đã lưu processed vào:", PROC_DIR.resolve())


Đã lưu processed vào: C:\Users\nguye\OneDrive\Máy tính\ML_Study_hihi\Tuan_5\Graduate Career Outcome\data\processed


Mục đích: kiểm tra lại số chiều, NaN sau xử lý, và ví dụ vài cột OHE

In [9]:
# Neither processed matrix may contain NaN.
for checked_frame, nan_message in (
    (X_train_df, "X_train có NaN sau xử lý!"),
    (X_test_df, "X_test có NaN sau xử lý!"),
):
    assert not np.isnan(checked_frame.to_numpy()).any(), nan_message

print("Số cột cuối cùng:", len(feature_names))
print("Ví dụ 10 cột đầu:", feature_names[:10])

# With a 0/1 target, the mean is the share of the positive class (Placed).
train_balance = y_train.mean().round(3)
test_balance = y_test.mean().round(3)
print("Train class balance:", train_balance)
print("Test class balance:", test_balance)


Số cột cuối cùng: 21
Ví dụ 10 cột đầu: ['gender_F', 'gender_M', 'ssc_b_Central', 'ssc_b_Others', 'hsc_b_Central', 'hsc_b_Others', 'hsc_s_Arts', 'hsc_s_Commerce', 'hsc_s_Science', 'degree_t_Comm&Mgmt']
Train class balance: 0.686
Test class balance: 0.698
