<a href="https://colab.research.google.com/github/Zash2000/Breast-Cancer-Classifier/blob/main/breast_cancer_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Breast Cancer Classification

## Introduction
##### In this analysis, a logistic regression model will be built to predict whether a breast tumour is benign or malignant.

### Installing required packages

In [None]:
suppressMessages({
  install.packages("gt")
  install.packages("gtExtras")
  install.packages("repr")
  library(repr)
  library(ggplot2)
  library(caret)
  library(readr)
  library(tidymodels)
  library(car)
  library(naniar)
  library(dplyr)
  library(GGally)
  library(tidyr)
  library(mice)
  library(dlookr)
  library(lattice)
  library(glmnet)
  library(pROC)
  library(gt)
  library(gtExtras)
})

### Loading and preparing the data

In [None]:
# Read the raw cancer dataset from the CSV file
data <- read_csv("Cancer dataset.csv")

# Drop the first two columns, which are not needed for the analysis
# (they include the participant ID column)
data1 <- data[, -(1:2)]

# diagnosis_code essentially conveys the same information as the diagnosis
# column, so it is removed as well to leave a single outcome variable.
cancer_data <- select(data1, -diagnosis_code)

## Exploratory Data Analysis

In [None]:
# Per-column summary statistics for the cleaned cancer dataset
cancer_data %>%
  summary()

In [None]:
# Number of rows per diagnosis value (malignant vs benign) in cancer_data
count(cancer_data, diagnosis)

In [None]:
# Distribution of radius_mean within each diagnosis group.
# Rows with a missing radius_mean are dropped before plotting.
cancer_data %>%
  filter(!is.na(radius_mean)) %>%
  ggplot(aes(x = diagnosis, y = radius_mean, fill = diagnosis)) +
  geom_boxplot() +
  ggtitle("Box plot Distribution of Radius Mean by Diagnosis") +
  xlab("Diagnosis") +
  ylab("Radius Mean") +
  theme_minimal()


In [None]:
# Distribution of texture_mean within each diagnosis group.
# Rows with a missing texture_mean are dropped before plotting.
cancer_data %>%
  filter(!is.na(texture_mean)) %>%
  ggplot(aes(x = diagnosis, y = texture_mean, fill = diagnosis)) +
  geom_boxplot() +
  ggtitle("Box plot Distribution of Texture Mean by Diagnosis") +
  xlab("Diagnosis") +
  ylab("Texture Mean") +
  theme_minimal()

In [None]:
# Distribution of area_mean within each diagnosis group.
# Rows with a missing area_mean are dropped before plotting.
cancer_data %>%
  filter(!is.na(area_mean)) %>%
  ggplot(aes(x = diagnosis, y = area_mean, fill = diagnosis)) +
  geom_boxplot() +
  ggtitle("Box plot Distribution of Area Mean by Diagnosis") +
  xlab("Diagnosis") +
  ylab("Area Mean") +
  theme_minimal()

In [None]:
# Reshape four of the mean features into long format (one row per
# observation/feature pair) and discard the missing measurements, so that
# all four can be drawn as faceted box plots split by diagnosis.
cancer_data_long <- pivot_longer(
  cancer_data,
  cols = c(radius_mean, texture_mean, perimeter_mean, area_mean),
  names_to = "Feature",
  values_to = "Value"
) %>%
  drop_na(Value)

# One panel per feature; each panel gets its own axis scales
cancer_data_long %>%
  ggplot(aes(x = diagnosis, y = Value, fill = diagnosis)) +
  geom_boxplot() +
  facet_wrap(~ Feature, scales = "free") +
  ggtitle("Boxplots of Features by Diagnosis") +
  theme_minimal()

In [None]:
# Radius Mean against Texture Mean, coloured by diagnosis.
# Only rows where both measurements are finite are plotted.
cancer_data %>%
  filter(is.finite(radius_mean) & is.finite(texture_mean)) %>%
  ggplot(aes(x = radius_mean, y = texture_mean, color = diagnosis)) +
  geom_point(alpha = 0.7) +
  ggtitle("Scatter Plot of Radius Mean vs Texture Mean") +
  xlab("Radius Mean") +
  ylab("Texture Mean") +
  theme_minimal()


In [None]:
# Correlation matrix of the numeric columns, drawn as a heatmap.
# Correlations are computed on complete rows only.
numeric_variables <- cancer_data %>%
  select(where(is.numeric))
correlation_matrix <- numeric_variables %>%
  cor(use = "complete.obs")
heatmap(x = correlation_matrix, main = "Correlation Matrix - Numeric Variables")

## Data-preprocessing: Handling of missing data

In [None]:
# Total number of missing cells across the whole dataset
cancer_data %>%
  is.na() %>%
  sum()

In [None]:
# Missing-value count for each column
cancer_data %>%
  is.na() %>%
  colSums()

In [None]:
# ----- Understanding the missing data -----

# Enlarge the notebook plot area before drawing the chart
options(repr.plot.width = 15)
options(repr.plot.height = 10)

# Pareto chart demonstrating the proportion of missing data and potential
# handling based on the amount of data that is missing
plot_na_pareto(cancer_data)

In [None]:
# ----- Handling the missing data -----

# Total values in dataset: 569 * 30 (excluding diagnosis variables) = 17,070
# With only a small percentage of the total data missing (~5%), complete case
# analysis could be justified; however, multiple imputation is a method that
# retains the mean and variance of the data.

# The missing data is handled with Multivariate Imputation by Chained
# Equations (mice), using predictive mean matching ("pmm"): 5 imputed
# datasets, 50 iterations each, with a fixed seed for reproducibility.
imputed_data <- mice(cancer_data, m = 5, maxit = 50, method = "pmm",
                     seed = 500, printFlag = FALSE)

# Fit the same linear model on every imputed dataset and pool the estimates.
# `expr` is spelled out in full instead of relying on partial matching of
# `exp`.
fit <- with(data = imputed_data,
            expr = lm(radius_mean ~ texture_mean + perimeter_mean))
combine <- pool(fit)
summary(combine)

# Extraction of the imputed dataset.
# Only the 2nd of the 5 imputed datasets is carried forward, so everything
# downstream works on a single completed dataset.
completed_data <- complete(imputed_data, 2)

# Same dataset under the other name this cell used to define; it is extracted
# once and aliased rather than extracted twice.
completeData <- completed_data

# The original dataset is preserved and the imputed dataset is stored in a
# new variable
imputed_cancer_data <- completed_data

In [None]:
# Per-column summary statistics of the imputed cancer dataset
imputed_cancer_data %>%
  summary()

# Remaining missing values per column (all zeros means every gap was imputed)
imputed_cancer_data %>%
  is.na() %>%
  colSums()

#### Validation of Imputation

In [None]:
# Validation of imputation
#
# For a selection of variables, overlay the density of the original data
# (observed values only) with the density of the imputed dataset. If the
# imputation is reasonable the two curves should look very similar.

# Build the original-vs-imputed density overlay for one column.
#   variable: column name (string) present in both cancer_data and
#             imputed_cancer_data
#   title:    plot title
# Returns a ggplot object.
plot_imputation_density <- function(
    variable,
    title = paste("Plot of the original data VS imputed data for", variable)) {
  ggplot() +
    # fill is mapped to a constant label in each layer so that
    # scale_fill_manual() below actually colours the curves and draws a legend
    geom_density(data = cancer_data,
                 aes(x = .data[[variable]], fill = "Original"),
                 alpha = 0.5,
                 na.rm = TRUE) +  # original data still contains NAs
    geom_density(data = imputed_cancer_data,
                 aes(x = .data[[variable]], fill = "Imputed"),
                 alpha = 0.5) +
    labs(title = title,
         x = variable,
         y = "Density") +
    theme_minimal() +
    # Named values tie each colour to its label explicitly; with unnamed
    # values the colours would be assigned in alphabetical order of the labels
    scale_fill_manual(name = "Data Type",
                      values = c(Original = "red", Imputed = "blue"),
                      breaks = c("Original", "Imputed"))
}

# Radius_mean plot
print(plot_imputation_density("radius_mean"))

# Radius_se plot
print(plot_imputation_density("radius_se"))

# Radius_worst plot
print(plot_imputation_density("radius_worst"))

# Texture_mean plot
print(plot_imputation_density("texture_mean"))

# Texture_se plot
print(plot_imputation_density(
  "texture_se",
  title = "Comparison of Original vs Imputed Data for texture_se"
))

# Texture_worst plot
print(plot_imputation_density("texture_worst"))

## Further Data-preprocessing: Splitting of data

In [None]:
# ------ Splitting the data into training and test sets: ------

# Fix the random seed so the partition is reproducible
set.seed(123)

# Stratified 70% / 30% split on the diagnosis column, so that the class
# distribution is maintained in both the training and the test set.
split <- createDataPartition(
  y = imputed_cancer_data$diagnosis,
  p = 0.7,
  list = FALSE
)
train_set <- imputed_cancer_data[split, ]
test_set <- imputed_cancer_data[-split, ]

# Number of missing values left in the training set
train_set %>%
  is.na() %>%
  sum()

# Class distribution check
table(train_set[["diagnosis"]])  # Training set class distribution
table(test_set[["diagnosis"]])   # Test set class distribution

## Logistic Regression Model

In [None]:
# ------- Logistic regression model -------

# Encode the 'diagnosis' outcome as binary, where 1 = Malignant ("M") and
# 0 = Benign.
# The recoding only runs while the column still holds the original labels:
# re-running this cell after the conversion would otherwise compare 0/1
# against "M" and silently turn every diagnosis into 0.
if (!is.numeric(train_set$diagnosis)) {
  train_set$diagnosis <- as.numeric(train_set$diagnosis == "M")
}

# Check the conversion has taken place (only 0 and 1 should appear)
table(train_set$diagnosis)

# Fit a logistic regression model on all predictors.
# maxit is raised to 100 to give the fitting algorithm more iterations to
# converge.
cancer_logistic_model <- glm(diagnosis ~ ., data = train_set,
                             family = "binomial",
                             control = glm.control(maxit = 100))
summary(cancer_logistic_model)

In [None]:
# Check for multicollinearity using the Variance Inflation Factor (VIF).
# cancer_logistic_model is the same full model (diagnosis ~ . on train_set,
# binomial, maxit = 100), so it is reused here instead of being refitted.
# car::vif is called with its namespace so the result does not depend on
# which attached package currently provides `vif`.
car::vif(cancer_logistic_model)

#### Feature Selection

In [None]:
# --------- Feature selection using LASSO -------

# Build the design matrix 'X' and response 'y' directly from the training
# data. No unpenalised logistic model needs to be fitted just to extract
# them; model.matrix() on the formula produces the same columns, including
# "(Intercept)", which is kept so the layout matches the test-set design
# matrix.
X <- model.matrix(diagnosis ~ ., data = train_set)
y <- as.numeric(train_set$diagnosis)

# The response must already be coded 0/1 and line up with the rows of X
stopifnot(all(y %in% c(0, 1)), nrow(X) == length(y))

unique(y)
dim(X)
length(y)
head(y)

any(is.na(X))  # Check if X has missing values
any(is.na(y))  # Check if y has missing values

# Lasso model: coefficient paths over the lambda sequence
lasso_cancer1 <- glmnet(X, y, alpha = 1, family = "binomial")
plot(lasso_cancer1)

# K-fold cross-validation to find the value of lambda with minimal error.
# The fold assignment is random, so the seed is fixed here to make the
# selected lambda reproducible when this cell is re-run on its own.
set.seed(123)
lasso_cancer2 <- cv.glmnet(X, y, alpha = 1, family = "binomial")
plot(lasso_cancer2)

# Coefficients at the best value of lambda (features shrunk to zero are
# dropped by the LASSO)
coef(lasso_cancer2, s = "lambda.min")

# Print the value of lambda.min
lasso_cancer2$lambda.min

# Training-set accuracy at lambda.min, classifying at a 0.5 threshold
predictions <- predict(lasso_cancer2, s = "lambda.min", newx = X,
                       type = "response")
acc <- as.numeric(predictions >= 0.5) == y
mean(acc)

# Class distribution of the held-out test set
table(test_set$diagnosis)

#### Model Validation

In [None]:
# -------- Validate the model on the test_data ------

# Check the diagnosis column values
unique(test_set$diagnosis)

# Change the diagnosis column to binary (1 = Malignant, 0 = Benign).
# Guarded so that re-running the cell does not recode the already-converted
# 0/1 values to all zeros.
if (!is.numeric(test_set$diagnosis)) {
  test_set$diagnosis <- as.numeric(test_set$diagnosis == "M")
}

# Build X_test and y_test directly from the test data.
# No model is fitted on the test set: the design matrix only depends on the
# formula and the data, and it has the same columns (including
# "(Intercept)") as the matrix the LASSO was trained on.
X_test <- model.matrix(diagnosis ~ ., data = test_set)
y_test <- as.numeric(test_set$diagnosis)

# Computed predictions to find the accuracy of the lasso_cancer2 model
# (using the minimum value of lambda), classifying at a 0.5 threshold.
predictions <- predict(lasso_cancer2, s = "lambda.min", newx = X_test,
                       type = "response")
acc <- as.numeric(predictions >= 0.5) == y_test
mean(acc)

#### Model performance

In [None]:
# ------ Model performance ------
# Threshold the predicted probabilities at 0.5 to get binary class labels
predictions_binary <- ifelse(predictions >= 0.5, 1, 0)

# Change the predictions and actual values to factors with the same levels,
# as required for the confusion matrix
predictions_binary <- factor(predictions_binary, levels = c(0, 1))
y_test <- factor(y_test, levels = c(0, 1))

# Confusion Matrix (malignant = "1" is the positive class)
confusion_matrix <- confusionMatrix(data = predictions_binary,
                                    reference = y_test,
                                    positive = "1")
print(confusion_matrix)

# Simpler Confusion Matrix: rows = predicted class, columns = actual class
conf_matrix <- table(Predicted = predictions_binary, Actual = y_test)
print(conf_matrix)

# Use confusion matrix values to assign true and false positives and
# negatives (index 1 = class 0, index 2 = class 1)
true_positives <- conf_matrix[2, 2]
false_positives <- conf_matrix[2, 1]
true_negatives <- conf_matrix[1, 1]
false_negatives <- conf_matrix[1, 2]

# Precision, Recall, and F1-Score
precision <- true_positives / (true_positives + false_positives)
recall <- true_positives / (true_positives + false_negatives)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Precision:", precision, "\n")
cat("Recall:", recall, "\n")
cat("F1-Score:", f1_score, "\n")

# Compute the ROC curve.
# Controls/cases and the comparison direction are stated explicitly
# (0 = control, 1 = case, higher predicted probability = case) rather than
# left to automatic detection, which could flip the direction and give an
# optimistic AUC.
roc_curve <- roc(response = y_test,
                 predictor = as.numeric(predictions),
                 levels = c("0", "1"),
                 direction = "<")
auc_ci <- ci.auc(roc_curve, conf.level = 0.95)

# ROC curve - plot
plot(roc_curve, col = "blue", main = "ROC Curve")
auc_value <- auc(roc_curve)
cat("AUC:", auc_value, "\n")
# auc_ci holds lower bound, AUC estimate and upper bound, in that order
cat("95% CI for AUC:", auc_ci[1], "-", auc_ci[3], "\n")

#### Calibration Curve

In [None]:
# Plot calibration curve

# val.prob() comes from the rms package, which is not among the packages
# loaded at the top of the notebook. Install it if it is missing and call it
# through its namespace; rms is deliberately not attached because it exports
# its own vif(), which would mask car::vif().
if (!requireNamespace("rms", quietly = TRUE)) {
  install.packages("rms")
}

# Predicted probabilities from the Lasso model at lambda.min
predictions <- predict(lasso_cancer2, s = "lambda.min", newx = X_test,
                       type = "response")

# Recode the test outcome as numeric 0/1 (first level -> 0, second -> 1)
yTest <- as.numeric(as.factor(test_set$diagnosis)) - 1

# val.prob() is used to check calibration performance; it also draws the
# calibration plot. predict() returns a one-column matrix, so it is
# flattened to a plain numeric vector first.
calPerf <- rms::val.prob(p = as.numeric(predictions), y = yTest)

# Metrics of the Calibration plot
print(calPerf)