In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Load datasets
train_data = pd.read_csv("data/train_set_basic.csv")
test_data = pd.read_csv("data/test_set_constant.csv")

# Inspect the columns in the dataset
print("Train Data Columns:", train_data.columns)
print("Test Data Columns:", test_data.columns)

# Adjust feature columns based on the actual columns in your dataset
feature_columns = [
    'hemoglobin', 'ferritin', 'ret_count', 'segmented_neutrophils', 'tibc',
    'mcv', 'serum_iron', 'rbc', 'gender', 'creatinine', 'cholestrol',
    'copper', 'ethanol', 'folate', 'glucose', 'hematocrit', 'tsat'
]
target_column = 'label'

# Every column indexed below: the features plus the target. Both frames are
# indexed with all of them, so both must be validated -- otherwise a missing
# column surfaces later as a bare KeyError instead of a readable message.
required_columns = feature_columns + [target_column]

# Ensure all required columns exist in both datasets
missing_columns = [col for col in required_columns if col not in train_data.columns]
missing_test_columns = [col for col in required_columns if col not in test_data.columns]

if missing_columns or missing_test_columns:
    # Report every problem at once and skip training/evaluation entirely.
    if missing_columns:
        print(f"Missing columns in train data: {missing_columns}")
    if missing_test_columns:
        print(f"Missing columns in test data: {missing_test_columns}")
else:
    # Split train and test data into feature matrices and label vectors
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]

    X_test = test_data[feature_columns]
    y_test = test_data[target_column]

    # Standardize the data. The scaler is fitted on the training features only
    # and then applied unchanged to the test features, so no test-set
    # statistics leak into the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize SVM Classifier. probability=True is required for
    # predict_proba below; random_state pins the internal shuffling used to
    # fit the probability estimates so runs are repeatable.
    svm_model = SVC(probability=True, random_state=42)

    # Train the model
    svm_model.fit(X_train, y_train)

    # Predictions: hard labels for accuracy/F1/report, per-class
    # probabilities for ROC AUC.
    # NOTE(review): scikit-learn documents that SVC's predict_proba can
    # disagree with predict on some samples -- keep that in mind when
    # comparing the AUC with the label-based metrics.
    y_pred = svm_model.predict(X_test)
    y_pred_proba = svm_model.predict_proba(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    # F1 is averaged across classes weighted by each class's support.
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Multiclass ROC AUC computed one-vs-rest from the probability matrix.
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("ROC AUC Score:", roc_auc)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Train Data Columns: Index(['hemoglobin', 'ferritin', 'ret_count', 'segmented_neutrophils', 'tibc',
       'mcv', 'serum_iron', 'rbc', 'gender', 'creatinine', 'cholestrol',
       'copper', 'ethanol', 'folate', 'glucose', 'hematocrit', 'tsat',
       'label'],
      dtype='object')
Test Data Columns: Index(['hemoglobin', 'ferritin', 'ret_count', 'segmented_neutrophils', 'tibc',
       'mcv', 'serum_iron', 'rbc', 'gender', 'creatinine', 'cholestrol',
       'copper', 'ethanol', 'folate', 'glucose', 'hematocrit', 'tsat',
       'label'],
      dtype='object')
Accuracy: 0.8845
F1 Score: 0.8805435723097026
ROC AUC Score: 0.9922426493811956

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      2000
           1       0.89      0.93      0.91      1801
           2       0.85      1.00      0.92      1793
           3       0.88      0.94      0.91      1772
           4       0.75      0.89      0.82      1679
       