In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score,
                             recall_score, precision_score, balanced_accuracy_score, matthews_corrcoef,roc_curve, auc, plot_confusion_matrix)

# Load data from CSV file
data = pd.read_csv("/Users/alyssajames/BMEN415/Classification Models/breast-cancer.csv")

# Fill missing values with the mean of the respective column
data = data.fillna(data.mean())

# Define the features and target variables
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Encoding the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create the penalized SVM classifier with L2 penalty
classifier = make_pipeline(StandardScaler(), LinearSVC(penalty='l2', dual=False, random_state=42))

# Add k-fold cross-validation
k = 10
cv_scores = cross_val_score(classifier, X, y, cv=k)

# Train the classifier using the training data
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Since LinearSVC does not have the 'predict_proba' method, we cannot calculate AUC-ROC directly
# We can use the decision function to get the distance from the decision boundary
y_decision_function = classifier.decision_function(X_test)
auc_roc = roc_auc_score(y_test, y_decision_function)

f1 = f1_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
matthews_corr = matthews_corrcoef(y_test, y_pred)

# Print the results
print("\nCross-Validation Scores:", cv_scores)
print("\nAverage Cross-Validation Score:", cv_scores.mean())
print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nAUC-ROC:", auc_roc)
print("\nF1 Score:", f1)
print("\nRecall:", recall)
print("\nPrecision:", precision)
print("\nBalanced Accuracy:", balanced_accuracy)
print("\nMatthews Correlation Coefficient:", matthews_corr)

print("Confusion matrix:")
plot_confusion_matrix(classifier, X_test_scaled, y_test, display_labels=['Benign', 'Malignant'], cmap=plt.cm.Blues)
plt.show()

# Plot ROC curve
plot_roc_curve(classifier, X_test_scaled, y_test)
plt.show()

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/Users/alyssajames/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/__init__.py)