<a href="https://colab.research.google.com/github/ashw6/stml-b40/blob/main/Assignment_11%5BSML%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Load data
data = pd.read_csv('/content/breast_cancer_survival.csv')

# Drop rows where the target variable 'Patient_Status' is missing
data = data.dropna(subset=['Patient_Status'])

# Encoding categorical features
categorical_columns = ['Gender', 'Tumour_Stage', 'Histology', 'ER status', 'PR status', 'HER2 status', 'Surgery_type']
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Encode target variable
target_encoder = LabelEncoder()
data['Patient_Status'] = target_encoder.fit_transform(data['Patient_Status'])

# Select features and target
X = data.drop(columns=['Patient_Status', 'Date_of_Surgery', 'Date_of_Last_Visit'])
y = data['Patient_Status']

# Impute missing values if any remain after dropping missing target rows
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train and evaluate SVM
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_accuracy = accuracy_score(y_test, svm.predict(X_test))

# Train and evaluate KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_accuracy = accuracy_score(y_test, knn.predict(X_test))

# Train and evaluate Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=200)
log_reg.fit(X_train, y_train)
log_reg_accuracy = accuracy_score(y_test, log_reg.predict(X_test))

print("Accuracies without PCA:")
print("SVM:", svm_accuracy)
print("KNN:", knn_accuracy)
print("Logistic Regression:", log_reg_accuracy)

# Apply PCA to reduce dimensions
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train and evaluate SVM with PCA
svm.fit(X_train_pca, y_train)
svm_pca_accuracy = accuracy_score(y_test, svm.predict(X_test_pca))

# Train and evaluate KNN with PCA
knn.fit(X_train_pca, y_train)
knn_pca_accuracy = accuracy_score(y_test, knn.predict(X_test_pca))

# Train and evaluate Logistic Regression with PCA
log_reg.fit(X_train_pca, y_train)
log_reg_pca_accuracy = accuracy_score(y_test, log_reg.predict(X_test_pca))

print("\nAccuracies with PCA:")
print("SVM with PCA:", svm_pca_accuracy)
print("KNN with PCA:", knn_pca_accuracy)
print("Logistic Regression with PCA:", log_reg_pca_accuracy)


Accuracies without PCA:
SVM: 0.7692307692307693
KNN: 0.7384615384615385
Logistic Regression: 0.7692307692307693

Accuracies with PCA:
SVM with PCA: 0.7692307692307693
KNN with PCA: 0.7538461538461538
Logistic Regression with PCA: 0.7692307692307693
