In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the breast-cancer dataset; the CSV is expected in the current
# working directory.
data = pd.read_csv('breast-cancer-wisconsin.csv')

# Missing values are encoded as the literal string '?'. Drop every row that
# contains one, and take an explicit copy so that the column assignment
# below writes to an independent frame rather than to a slice of the
# original (this avoids pandas' SettingWithCopyWarning).
has_missing = data.isin(['?']).any(axis=1)
data = data.loc[~has_missing].copy()

# Predictor columns. The variable name is kept for compatibility with any
# other cell that refers to it.
categorical_features = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9']

# A column that contained '?' is read by pandas as strings, so coerce every
# predictor to a numeric dtype explicitly. scikit-learn's random forest
# converts X to float32 internally, so a pandas 'category' dtype would not
# make the model treat these columns as categorical anyway.
data[categorical_features] = data[categorical_features].apply(pd.to_numeric)

# Split data into features and target
X = data[categorical_features]
y = data['Class']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (seeded for reproducibility).
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

# Predict the class of each held-out row.
predictions = random_forest_model.predict(X_test)

# Evaluate the model on the held-out rows.
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

# Feature importance: pair each importance score with its column name and
# list them from most to least important.
importances = random_forest_model.feature_importances_
feature_importance = sorted(zip(importances, categorical_features), reverse=True)
print("Feature Importance:")
for importance, feature in feature_importance:
    print(f"{feature}: {importance:.4f}")


Accuracy: 0.948905109489051
Classification Report:
               precision    recall  f1-score   support

           2       0.93      0.99      0.96        79
           4       0.98      0.90      0.94        58

    accuracy                           0.95       137
   macro avg       0.95      0.94      0.95       137
weighted avg       0.95      0.95      0.95       137

Feature Importance:
F2: 0.3555
F3: 0.2231
F6: 0.1247
F7: 0.1002
F8: 0.0895
F1: 0.0378
F5: 0.0366
F4: 0.0233
F9: 0.0093
