In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Loading the dataset
data = pd.read_csv('breast_cancer.csv')

# Data preprocessing
# Removing 'SN' and 'Year' columns
data = data.drop(['Unnamed: 0', 'SN', 'Year'], axis=1)

# Converting categorical variables to dummy variables
data = pd.get_dummies(data, columns=['Breast', 'BreastQuadrant'])

# Splitting the data into features and target variable
X = data.drop('DiagnosisResult', axis=1)
Y = data['DiagnosisResult']

# Splitting the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Function to train and evaluate models
def train_evaluate_model(model, X_train, Y_train, X_test, Y_test, model_name):
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    print(f"{model_name} Accuracy:", accuracy)
    print(f"{model_name} Classification Report:\n", classification_report(Y_test, predictions))

# Original Models
print("Original Models:\n")
train_evaluate_model(LogisticRegression(max_iter=1000), X_train, Y_train, X_test, Y_test, "Original Logistic Regression")
train_evaluate_model(DecisionTreeClassifier(), X_train, Y_train, X_test, Y_test, "Original Decision Tree")
train_evaluate_model(SVC(), X_train, Y_train, X_test, Y_test, "Original SVM")

# Oversampled Models
print("\nOversampled Models:\n")
ros = RandomOverSampler(random_state=42)
X_ros, Y_ros = ros.fit_resample(X_train, Y_train)
train_evaluate_model(LogisticRegression(max_iter=1000), X_ros, Y_ros, X_test, Y_test, "Oversampled Logistic Regression")
train_evaluate_model(DecisionTreeClassifier(), X_ros, Y_ros, X_test, Y_test, "Oversampled Decision Tree")
train_evaluate_model(SVC(), X_ros, Y_ros, X_test, Y_test, "Oversampled SVM")

# Undersampled Models
print("\nUndersampled Models:\n")
rus = RandomUnderSampler(random_state=42)
X_rus, Y_rus = rus.fit_resample(X_train, Y_train)
train_evaluate_model(LogisticRegression(max_iter=1000), X_rus, Y_rus, X_test, Y_test, "Undersampled Logistic Regression")
train_evaluate_model(DecisionTreeClassifier(), X_rus, Y_rus, X_test, Y_test, "Undersampled Decision Tree")
train_evaluate_model(SVC(), X_rus, Y_rus, X_test, Y_test, "Undersampled SVM")

Original Models:

Original Logistic Regression Accuracy: 0.890625
Original Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91        36
           1       1.00      0.75      0.86        28

    accuracy                           0.89        64
   macro avg       0.92      0.88      0.88        64
weighted avg       0.91      0.89      0.89        64

Original Decision Tree Accuracy: 0.875
Original Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89        36
           1       0.86      0.86      0.86        28

    accuracy                           0.88        64
   macro avg       0.87      0.87      0.87        64
weighted avg       0.88      0.88      0.88        64

Original SVM Accuracy: 0.8125
Original SVM Classification Report:
               precision    recall  f1-score   support

           0       