In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

data = pd.read_csv("Dry_Bean_Dataset.csv")

# Assuming the last column is the label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Convert labels to integer values
classes = np.unique(y)
y_int = np.array([np.where(classes == label)[0][0] for label in y])

X_train, X_test, y_train, y_test = train_test_split(X, y_int, test_size=0.3, random_state=42)

# Create DMatrix, the internal data structure that XGBoost uses
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters for classification
params = {
    'objective': 'multi:softmax',  # for multiclass classification
    'num_class': len(classes),
    'eta': 0.1,
    'max_depth': 4
}
num_round = 20  # number of boosting rounds

# Train the model
bst = xgb.train(params, dtrain, num_round)

# Make predictions
predictions = bst.predict(dtest)
# Make predictions on the training data
train_predictions = bst.predict(dtrain)
print("Train accuracy:", accuracy_score(y_train, train_predictions))
print("Predictions:", predictions)
print("test accuracy:", accuracy_score(y_test, predictions))
print("test data report:", classification_report(y_test, predictions))

ModuleNotFoundError: No module named 'xgboost'

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load data
data = pd.read_csv("Dry_Bean_Dataset.csv")
# Assuming the last column is the label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# Convert labels to integer values
classes = np.unique(y)
y_int = np.array([np.where(classes == label)[0][0] for label in y])

# Split data into train+val and test (70/30 split)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y_int, test_size=0.3, random_state=42)
# Further split train+val into training and validation (80/20 split)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)

# Create DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Hyperparameter grid
etas = [0.1, 0.3, 0.5]
max_depths = [4, 5, 6]
num_boost_round = 200      # maximum rounds to allow early stopping to decide the best round
early_stopping_rounds = 10

best_val_accuracy = 0.0
best_params = None
best_model = None
best_iteration = None

# Grid search over eta and max_depth using early stopping on the validation set
evals = [(dtrain, 'train'), (dval, 'eval')]
for eta in etas:
    for max_depth in max_depths:
        params = {
            'objective': 'multi:softmax',
            'num_class': len(classes),
            'eta': eta,
            'max_depth': max_depth
        }
        print(f"Training with eta={eta}, max_depth={max_depth}...")
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )
        # Evaluate on the validation set
        val_pred = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        val_acc = accuracy_score(y_val, val_pred)
        print(f"Validation accuracy: {val_acc:.4f} (best iteration: {bst.best_iteration})")
        
        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            best_params = params.copy()
            best_params['best_iteration'] = bst.best_iteration
            best_model = bst
            best_iteration = bst.best_iteration

print("\nBest hyperparameters found:")
print(best_params)
print("Best validation accuracy:", best_val_accuracy)

# Evaluate the best model on the test set
test_pred = best_model.predict(dtest, iteration_range=(0, best_iteration))
print("\nTest Accuracy:", accuracy_score(y_test, test_pred))
print("Test Classification Report:\n", classification_report(y_test, test_pred))

Training with eta=0.1, max_depth=4...
Validation accuracy: 0.9334 (best iteration: 97)
Training with eta=0.1, max_depth=5...
Validation accuracy: 0.9302 (best iteration: 111)
Training with eta=0.1, max_depth=6...
Validation accuracy: 0.9323 (best iteration: 85)
Training with eta=0.3, max_depth=4...
Validation accuracy: 0.9292 (best iteration: 39)
Training with eta=0.3, max_depth=5...
Validation accuracy: 0.9339 (best iteration: 27)
Training with eta=0.3, max_depth=6...
Validation accuracy: 0.9276 (best iteration: 25)
Training with eta=0.5, max_depth=4...
Validation accuracy: 0.9328 (best iteration: 21)
Training with eta=0.5, max_depth=5...
Validation accuracy: 0.9276 (best iteration: 16)
Training with eta=0.5, max_depth=6...
Validation accuracy: 0.9292 (best iteration: 13)

Best hyperparameters found:
{'objective': 'multi:softmax', 'num_class': 7, 'eta': 0.3, 'max_depth': 5, 'best_iteration': 27}
Best validation accuracy: 0.9338929695697796

Test Accuracy: 0.9258080313418218
Test Class

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Load data
data = pd.read_csv("Dry_Bean_Dataset.csv")
# Assuming the last column is the label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Convert labels to integer values
classes = np.unique(y)
y_int = np.array([np.where(classes == label)[0][0] for label in y])

# Split data into train+val and test (70/30 split)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y_int, test_size=0.3, random_state=42)
# Further split train+val into training and validation (80/20 split)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)

# Create an XGBClassifier that integrates with scikit-learn.
# Note: use_label_encoder=False and an eval_metric are recommended for newer XGBoost versions.
clf = XGBClassifier(objective='multi:softmax',
                    num_class=len(classes),
                    use_label_encoder=False,
                    eval_metric='mlogloss')

# Define hyperparameter grid. Here we include 'eta' (learning_rate), 'max_depth', and 'n_estimators'
param_grid = {
    'eta': [0.1, 0.3, 0.5],
    'max_depth': [4, 5, 6],
    'n_estimators': [100, 200]  # equivalent to number of boosting rounds
}

# Setup GridSearchCV.
# Extra fit arguments are passed via 'fit_params'
grid = GridSearchCV(estimator=clf,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=3,
                    n_jobs=-1,
                    verbose=1)

# Fit GridSearchCV. Use the validation set for early stopping.
# fit_params = {
#     "eval_set": [(X_val, y_val)],
#     "early_stopping_rounds": 10,
#     "verbose": False
# }

grid.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

# Evaluate the best model on the test set
best_model = grid.best_estimator_
y_test_pred = best_model.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

Fitting 3 folds for each of 18 candidates, totalling 54 fits


Parameters: { "use_label_encoder" } are not used.



Best hyperparameters found:
{'eta': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best cross-validation score: 0.9255996409539717

Test Accuracy: 0.9258080313418218
Test Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.93       395
           1       1.00      1.00      1.00       161
           2       0.95      0.95      0.95       479
           3       0.91      0.91      0.91      1043
           4       0.97      0.96      0.96       588
           5       0.95      0.94      0.95       619
           6       0.86      0.88      0.87       799

    accuracy                           0.93      4084
   macro avg       0.94      0.94      0.94      4084
weighted avg       0.93      0.93      0.93      4084



In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, classification_report

data = pd.read_csv("Dry_Bean_Dataset.csv")

# Assuming the last column is the label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Convert labels to integer values
classes = np.unique(y)
y_int = np.array([np.where(classes == label)[0][0] for label in y])

X_train, X_test, y_train, y_test = train_test_split(X, y_int, test_size=0.3, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',  # multiclass classification
    'num_class': len(classes),
    'eta': 0.1,           # try reducing the learning rate
    'max_depth': 4,       # adjust by lowering to reduce overfitting
    # You can experiment further by adding 'gamma', 'subsample', etc.
}
num_round = 50  # Increase boosting rounds to see the effect with early stopping

evals = [(dtrain, "train"), (dtest, "eval")]
bst = xgb.train(params, dtrain, num_boost_round=num_round, evals=evals, early_stopping_rounds=5)

train_predictions = bst.predict(dtrain)
print("Train accuracy:", accuracy_score(y_train, train_predictions))

predictions = bst.predict(dtest)
print("Test accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

[0]	train-mlogloss:1.66896	eval-mlogloss:1.67187
[1]	train-mlogloss:1.46646	eval-mlogloss:1.47224
[2]	train-mlogloss:1.30796	eval-mlogloss:1.31615
[3]	train-mlogloss:1.17852	eval-mlogloss:1.18869
[4]	train-mlogloss:1.06917	eval-mlogloss:1.08118
[5]	train-mlogloss:0.97606	eval-mlogloss:0.98974
[6]	train-mlogloss:0.89505	eval-mlogloss:0.91018
[7]	train-mlogloss:0.82346	eval-mlogloss:0.84023
[8]	train-mlogloss:0.76029	eval-mlogloss:0.77846
[9]	train-mlogloss:0.70459	eval-mlogloss:0.72420
[10]	train-mlogloss:0.65455	eval-mlogloss:0.67531
[11]	train-mlogloss:0.61048	eval-mlogloss:0.63209
[12]	train-mlogloss:0.57058	eval-mlogloss:0.59305
[13]	train-mlogloss:0.53454	eval-mlogloss:0.55827
[14]	train-mlogloss:0.50213	eval-mlogloss:0.52679
[15]	train-mlogloss:0.47270	eval-mlogloss:0.49821
[16]	train-mlogloss:0.44611	eval-mlogloss:0.47270
[17]	train-mlogloss:0.42170	eval-mlogloss:0.44917
[18]	train-mlogloss:0.39975	eval-mlogloss:0.42818
[19]	train-mlogloss:0.37975	eval-mlogloss:0.40920
[20]	train

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, classification_report

data = pd.read_csv("Dry_Bean_Dataset.csv")

# Assuming the last column is the label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Convert labels to integer values
classes = np.unique(y)
y_int = np.array([np.where(classes == label)[0][0] for label in y])

X_train, X_test, y_train, y_test = train_test_split(X, y_int, test_size=0.3, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',  # multiclass classification
    'num_class': len(classes),
    'eta': 0.1,           # try reducing the learning rate
    'max_depth': 4,       # adjust by lowering to reduce overfitting
    # You can experiment further by adding 'gamma', 'subsample', etc.
}
num_round = 50  # Increase boosting rounds to see the effect with early stopping

