In [1]:
# Import necessary libraries
import pandas as pd
import autosklearn.classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
data = pd.read_csv('data.csv')
# Drop the first column by position; every remaining column except 'diagnosis'
# is used as a feature below.
# NOTE(review): presumably column 0 is a row identifier — confirm against data.csv.
# NOTE(review): if the CSV carries an empty trailing column it stays in X — confirm.
data = data.iloc[:, 1:]
print(data.head())

# Separate features (X) and target variable (y)
X = data.drop('diagnosis', axis=1)  # Everything except the target column
y = data['diagnosis']  # Target variable

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Auto-sklearn model
model = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,  # Maximum time for the whole search (in seconds)
    per_run_time_limit=30,       # Maximum time for each individual model fit (in seconds)
)

# Fit the model to the training data
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)

# Calculate accuracy
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Print detailed classification report (precision, recall, F1-score)
print(classification_report(y_test, y_pred))

# Show the models used by Auto-sklearn.
# Recent auto-sklearn releases return a dict keyed by model id from show_models(),
# while older releases return a plain string description. Handle both so the
# cell does not fail with AttributeError on `.items()`.
models = model.show_models()
print("Models and their performance:")
if isinstance(models, dict):
    for model_name, performance in models.items():
        print(f"Model: {model_name}, Performance: {performance}")
else:
    print(models)

# Extract the strongest ensemble member.
# get_models_with_weights() yields (weight, pipeline) pairs, so `best_model`
# is the whole list; pick the heaviest pair explicitly rather than relying on
# list order, and report the pipeline and its weight separately.
best_model = model.get_models_with_weights()
if best_model:
    best_weight, best_pipeline = max(best_model, key=lambda pair: pair[0])
    print(f"The best model is: {best_pipeline} (ensemble weight: {best_weight})")
else:
    print("The best model is: unavailable (the ensemble contains no models)")

# Save the trained model to a file for later use
MODEL_PATH = 'auto_sklearn_cancer_model.pkl'
joblib.dump(model, MODEL_PATH)

# Load the saved model (if needed).
# joblib.load unpickles the file, so only load files from a trusted source.
loaded_model = joblib.load(MODEL_PATH)

# Use the loaded model to make predictions (just to verify it works)
y_pred_loaded = loaded_model.predict(X_test)
print(f'Accuracy of loaded model: {accuracy_score(y_test, y_pred_loaded)}')

  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0         M        17.99         10.38          122.80     1001.0   
1         M        20.57         17.77          132.90     1326.0   
2         M        19.69         21.25          130.00     1203.0   
3         M        11.42         20.38           77.58      386.1   
4         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2419  ...         25.38          17.33 

In [27]:
# xgboost performance
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder

# XGBoost needs integer class ids, so map the string labels to integers.
# LabelEncoder numbers classes in sorted order: 'B' -> 0, 'M' -> 1.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Re-split with the encoded target. This rebinds X_train/X_test/y_train/y_test
# using the same test_size and random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters for the gradient-boosted tree classifier, gathered in one
# place so they are easy to tune.
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases —
# confirm against the installed version.
xgb_params = {
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'learning_rate': 0.05,
    'max_depth': 5,
    'n_estimators': 500,
    'subsample': 0.85,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'scale_pos_weight': 1,
}
xgboost_model = xgb.XGBClassifier(**xgb_params)

# Two-stage pipeline: standardize the features, then classify with XGBoost.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('xgboost', xgboost_model),
    ]
)

# Train on the training rows and predict the held-out rows
# (Pipeline.fit returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Overall accuracy on the held-out rows
xgb_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {xgb_accuracy}')

# Per-class precision, recall and F1-score (classes appear as their encoded ids)
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

Accuracy: 0.9649122807017544
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

