In [1]:
# Record the installed XGBoost version next to the results so they can be
# tied to a specific library release.
import xgboost

xgboost_version = xgboost.__version__
print(xgboost_version)

3.0.2


In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBClassifier
import numpy as np

In [3]:
# Two newsgroups only, which makes this a binary classification task.
# Headers, footers and quoted replies are removed from every post.
categories = ['rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# X holds the text documents, y the integer labels (0 or 1).
X, y = data.data, data.target

# Hold out 20% of the documents; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Baseline 1: TF-IDF features feeding a single decision tree.
dt_vectorizer = TfidfVectorizer()  # step 1: text -> TF-IDF features
dt_classifier = DecisionTreeClassifier(random_state=42)  # step 2: classifier
dt_pipeline = Pipeline(
    steps=[
        ('tfidf', dt_vectorizer),
        ('clf', dt_classifier),
    ]
)


In [5]:
# Baseline 2: TF-IDF features feeding a 100-tree random forest.
rf_vectorizer = TfidfVectorizer()
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pipeline = Pipeline(
    steps=[
        ('tfidf', rf_vectorizer),
        ('clf', rf_classifier),
    ]
)


In [6]:
# Score both baselines with the same 5-fold, accuracy-based cross-validation
# on the training split only.
cv_options = dict(cv=5, scoring='accuracy')

dt_scores = cross_val_score(dt_pipeline, X_train, y_train, **cv_options)
rf_scores = cross_val_score(rf_pipeline, X_train, y_train, **cv_options)

# Report the mean accuracy across the folds for each model.
dt_mean_accuracy = dt_scores.mean()
rf_mean_accuracy = rf_scores.mean()
print("Decision Tree Accuracy:", dt_mean_accuracy)
print("Random Forest Accuracy:", rf_mean_accuracy)


Decision Tree Accuracy: 0.7626674014880133
Random Forest Accuracy: 0.8802810691650592


In [8]:
# Gradient-boosted model on the same TF-IDF features as the baselines.
xgb_classifier = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', xgb_classifier),
    ]
)

# Hyperparameters to tune; the `clf__` prefix routes each setting to the
# pipeline step named 'clf'. 2 x 2 x 2 = 8 candidate combinations.
param_grid = dict(
    clf__n_estimators=[50, 100],
    clf__max_depth=[3, 5],
    clf__learning_rate=[0.1, 0.3],
)

# Exhaustive search with 5-fold cross-validation, selecting on accuracy.
grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validated Accuracy:", grid.best_score_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'clf__learning_rate': 0.3, 'clf__max_depth': 5, 'clf__n_estimators': 50}
Best Cross-Validated Accuracy: 0.8760925874896666
