In [None]:
# 1. Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
import joblib

# 2. Load Data
df = pd.read_csv('/content/glass.csv')

X = df.drop('Type', axis=1)
y = df['Type']

print(f"Before SMOTE: {np.bincount(y)}")

# 3. Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print(f"After SMOTE: {np.bincount(y_resampled)}")

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.25, random_state=42, stratify=y_resampled
)

# 5. Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Final Optimized Gradient Boosting Classifier
best_gbm = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    max_features='sqrt',
    subsample=0.8,
    min_samples_split=4,
    min_samples_leaf=2,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42
)

# 7. Train with early stopping
best_gbm.fit(X_train_scaled, y_train)

# 8. Evaluate on test set
y_pred = best_gbm.predict(X_test_scaled)

print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%\n")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 9. Cross-validated training accuracy
cv_scores = cross_val_score(best_gbm, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-validated Training Accuracy: {cv_scores.mean()*100:.2f}% (+/- {cv_scores.std()*100:.2f}%)")

print("Quick Accuracy Check:")
print(f"Training Accuracy: {best_gbm.score(X_train_scaled, y_train):.4f}")
print(f"Testing Accuracy: {best_gbm.score(X_test_scaled, y_test):.4f}")

# 10. Save model and scaler
joblib.dump(best_gbm, 'gradient_boosting_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model and scaler saved successfully! ✅")


Before SMOTE: [ 0 70 76 17  0 13  9 29]
After SMOTE: [ 0 76 76 76  0 76 76 76]

Test Accuracy: 89.47%

Classification Report:
               precision    recall  f1-score   support

           1       0.74      0.89      0.81        19
           2       0.87      0.68      0.76        19
           3       0.84      0.84      0.84        19
           5       0.95      1.00      0.97        19
           6       1.00      1.00      1.00        19
           7       1.00      0.95      0.97        19

    accuracy                           0.89       114
   macro avg       0.90      0.89      0.89       114
weighted avg       0.90      0.89      0.89       114

Confusion Matrix:
 [[17  1  1  0  0  0]
 [ 3 13  2  1  0  0]
 [ 3  0 16  0  0  0]
 [ 0  0  0 19  0  0]
 [ 0  0  0  0 19  0]
 [ 0  1  0  0  0 18]]
Cross-validated Training Accuracy: 88.32% (+/- 2.87%)
Quick Accuracy Check:
Training Accuracy: 0.9854
Testing Accuracy: 0.8947
Model and scaler saved successfully! ✅
