In [5]:
import pandas as pd

df = pd.read_csv("glass-data.csv", sep="\t")
df.columns = df.columns.str.strip()  

# ƒê·ªïi t√™n c·ªôt ƒë·ªÉ d·ªÖ th·ª±c hi·ªán
df.columns = [
    "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Type"
]

# a. K√≠ch th∆∞·ªõc d·ªØ li·ªáu
print("a. K√≠ch th∆∞·ªõc d·ªØ li·ªáu:", df.shape)
print("   S·ªë h√†ng (m·∫´u):", df.shape[0])
print("   S·ªë c·ªôt (thu·ªôc t√≠nh):", df.shape[1])

# b. Ki·ªÉu d·ªØ li·ªáu
print("\nb. Ki·ªÉu d·ªØ li·ªáu c·ªßa c√°c thu·ªôc t√≠nh:")
print(df.dtypes)

# c. S·ªë l∆∞·ª£ng t·ª´ng gi√° tr·ªã nh√£n (c·ªôt Type)
print("\nc. S·ªë l∆∞·ª£ng th·ª±c th·ªÉ c·ªßa t·ª´ng gi√° tr·ªã nh√£n:")
print(df["Type"].value_counts())

# d. Gi√° tr·ªã min, max, mean
print("\nd. Gi√° tr·ªã th·ªëng k√™ c·ªßa c√°c c·ªôt s·ªë th·ª±c:")
print(df.describe().loc[["min", "max", "mean"]])



a. K√≠ch th∆∞·ªõc d·ªØ li·ªáu: (214, 11)
   S·ªë h√†ng (m·∫´u): 214
   S·ªë c·ªôt (thu·ªôc t√≠nh): 11

b. Ki·ªÉu d·ªØ li·ªáu c·ªßa c√°c thu·ªôc t√≠nh:
Id        int64
RI      float64
Na      float64
Mg      float64
Al      float64
Si      float64
K       float64
Ca      float64
Ba      float64
Fe      float64
Type      int64
dtype: object

c. S·ªë l∆∞·ª£ng th·ª±c th·ªÉ c·ªßa t·ª´ng gi√° tr·ªã nh√£n:
Type
2    76
1    70
7    29
3    17
5    13
6     9
Name: count, dtype: int64

d. Gi√° tr·ªã th·ªëng k√™ c·ªßa c√°c c·ªôt s·ªë th·ª±c:
         Id        RI        Na        Mg        Al         Si         K  \
min     1.0  1.511150  10.73000  0.000000  0.290000  69.810000  0.000000   
max   214.0  1.533930  17.38000  4.490000  3.500000  75.410000  6.210000   
mean  107.5  1.518365  13.40785  2.684533  1.444907  72.650935  0.497056   

             Ca        Ba        Fe      Type  
min    5.430000  0.000000  0.000000  1.000000  
max   16.190000  3.150000  0.510000  7.000000  
mean   8.956

In [5]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# === 1. Load data ===
df = pd.read_csv("glass-data.csv", sep="\t")

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# === 2. Define CV & scorer ===
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
f1 = make_scorer(f1_score, average='macro')

# === 3. Models & params ===
models = {
    "KNN": (
        Pipeline([("scaler", StandardScaler()), ("knn", KNeighborsClassifier())]),
        {"knn__n_neighbors": [3, 5, 7, 9],
         "knn__weights": ["uniform", "distance"]}
    ),
    
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {"n_estimators": [100, 200, 300],
         "max_depth": [None, 5, 10, 20],
         "min_samples_split": [2, 5]}
    ),
    
    "SVM (RBF)": (
        Pipeline([("scaler", StandardScaler()), ("svm", SVC())]),
        {"svm__C": [0.1, 1, 10],
         "svm__gamma": ["scale", "auto"],
         "svm__kernel": ["rbf"]}
    )
}

# === 4. Training & saving results ===
rows = []

for name, (model, params) in models.items():
    grid = GridSearchCV(model, params, cv=kfold, scoring=f1, n_jobs=-1)
    grid.fit(X, y)

    rows.append([
        name,
        round(grid.best_score_, 4),
        grid.best_params_
    ])

results = pd.DataFrame(rows, columns=["Model", "F1 Score (Macro)", "Best Hyperparameters"])

# === 5. Display better formatted output ===
print("\n K·∫æT QU·∫¢ SO S√ÅNH M√î H√åNH\n")
for idx, row in results.iterrows():
    print(f"üîπ Model: {row['Model']}")
    print(f"   F1 Score (Macro): {row['F1 Score (Macro)']}")
    print(f"   Best Params:")
    for k, v in row["Best Hyperparameters"].items():
        print(f"      {k}: {v}")
    print("-" * 50)



 K·∫æT QU·∫¢ SO S√ÅNH M√î H√åNH

üîπ Model: KNN
   F1 Score (Macro): 0.7873
   Best Params:
      knn__n_neighbors: 3
      knn__weights: distance
--------------------------------------------------
üîπ Model: Random Forest
   F1 Score (Macro): 0.9765
   Best Params:
      max_depth: None
      min_samples_split: 2
      n_estimators: 200
--------------------------------------------------
üîπ Model: SVM (RBF)
   F1 Score (Macro): 0.8221
   Best Params:
      svm__C: 10
      svm__gamma: scale
      svm__kernel: rbf
--------------------------------------------------
