In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score

# Read the training and test CSV files into DataFrames.
train_data = pd.read_csv("data/train_set_basic.csv")
test_data = pd.read_csv("data/test_set_constant.csv")

# Print each frame's column index so the names used below can be compared
# against what the files actually contain.
print("Train Data Columns:", train_data.columns)
print("Test Data Columns:", test_data.columns)

# Column holding the class label to predict.
target_column = 'label'

# Columns used as model inputs. These are looked up by exact name, so the
# spelling must match the CSV headers (e.g. the dataset spells it
# 'cholestrol'); adjust the list if the files change.
feature_columns = [
    'hemoglobin',
    'ferritin',
    'ret_count',
    'segmented_neutrophils',
    'tibc',
    'mcv',
    'serum_iron',
    'rbc',
    'gender',
    'creatinine',
    'cholestrol',
    'copper',
    'ethanol',
    'folate',
    'glucose',
    'hematocrit',
    'tsat',
]

# Ensure every feature column AND the target column exist in BOTH datasets.
# Checking only the training features would let a column missing from the
# test set (or a missing target column) surface later as a KeyError while
# building X_test / y_train / y_test.
required_columns = [*feature_columns, target_column]
missing_columns = [col for col in required_columns if col not in train_data.columns]
missing_test_columns = [col for col in required_columns if col not in test_data.columns]

if missing_columns or missing_test_columns:
    # Report each affected dataset and skip training/evaluation entirely.
    if missing_columns:
        print(f"Missing columns in train data: {missing_columns}")
    if missing_test_columns:
        print(f"Missing columns in test data: {missing_test_columns}")
else:
    # Split train and test data into feature matrices and label vectors
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]

    X_test = test_data[feature_columns]
    y_test = test_data[target_column]

    # Initialize Random Forest Classifier (fixed seed for reproducible runs)
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    rf_model.fit(X_train, y_train)

    # Hard class predictions and per-class probabilities for the test set
    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # The probability columns follow rf_model.classes_. Passing that ordering
    # explicitly stops roc_auc_score from inferring the classes from y_test,
    # which would silently pair columns with the wrong labels whenever the
    # test labels differ from the classes seen during training.
    roc_auc = roc_auc_score(
        y_test,
        y_pred_proba,
        multi_class='ovr',
        labels=rf_model.classes_,
    )

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("ROC AUC Score:", roc_auc)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Train Data Columns: Index(['hemoglobin', 'ferritin', 'ret_count', 'segmented_neutrophils', 'tibc',
       'mcv', 'serum_iron', 'rbc', 'gender', 'creatinine', 'cholestrol',
       'copper', 'ethanol', 'folate', 'glucose', 'hematocrit', 'tsat',
       'label'],
      dtype='object')
Test Data Columns: Index(['hemoglobin', 'ferritin', 'ret_count', 'segmented_neutrophils', 'tibc',
       'mcv', 'serum_iron', 'rbc', 'gender', 'creatinine', 'cholestrol',
       'copper', 'ethanol', 'folate', 'glucose', 'hematocrit', 'tsat',
       'label'],
      dtype='object')
Accuracy: 0.9990714285714286
F1 Score: 0.9990709087091866
ROC AUC Score: 0.999999634637839

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      2000
           1       1.00      1.00      1.00      1801
           2       1.00      1.00      1.00      1793
           3       1.00      1.00      1.00      1772
           4       1.00      1.00      1.00      1