## CNN Feature Extraction using ResNet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms as transforms

# import torchvision.models as models
from torchvision.models import resnet50, ResNet50_Weights

from torch.utils.data import DataLoader
import os
import sklearn.model_selection as model_selection
import sklearn.linear_model as linear_model
import subprocess
from modeling_methods import ImageDataset
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_score,
    f1_score,
    recall_score,
)

%load_ext autoreload
%autoreload 2

## Load the Model


In [None]:
# Load ResNet-50 with its default pretrained weights; the network is used
# below to turn every image into a numeric vector.
pretrained_weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=pretrained_weights)

## Data Preprocessing

In [None]:
# Resolve the repository root through git so that data paths do not depend on
# the notebook's working directory. `check=True` makes a failing git call
# raise CalledProcessError instead of silently producing an empty root (which
# would turn every path below into a relative one).
repo_dir = (
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        stdout=subprocess.PIPE,
        check=True,
    )
    .stdout.rstrip()
    .decode("utf-8")
)

# Folder with the images to extract features from. Swap in one of the
# commented alternatives below to process a different image set.
original_folder_path = os.path.join(repo_dir, "dataverse_files/HAM10000_images_part_1")
# original_folder_path =  os.path.join(repo_dir, 'dataverse_files/HAM10000_images_part_1_2')
# original_folder_path = os.path.join(repo_dir, 'preprocessed_images')
# original_folder_path = os.path.join(repo_dir, "dataverse_files/JS_Selection")

In [None]:
# Preprocessing applied to every image before it is fed to the network.
#
# Resize(232) scales the shorter image edge to 232 px. The crop must not be
# larger than that: CenterCrop with a size exceeding the image pads the image
# with zeros (black borders) rather than covering a larger region. Cropping
# at 232 keeps the full shorter edge without any padding.
transform = transforms.Compose(
    [
        transforms.Resize(232),
        transforms.CenterCrop(232),  # full shorter edge, no zero padding
        transforms.ToTensor(),
        # Per-channel mean/std normalisation (the usual ImageNet statistics).
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
dataset = ImageDataset(directory=original_folder_path, transform=transform)
# shuffle=False keeps the iteration order stable between runs.
data_loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)

## Feature Extraction

In [None]:
# Determine the best available device
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Run every batch through the network and keep one output vector per image.
model = model.to(device)  # the weights must live on the same device as the inputs
model.eval()  # evaluation mode (e.g. batch norm uses its running statistics)

# Maps each key yielded by the dataset to that image's flattened model output.
features_map2 = {}
with torch.no_grad():  # pure inference: no gradients are needed
    for keys, images in tqdm(data_loader):
        batch_features = model(images.to(device))
        # Flatten to (batch, n_values) and move to the CPU for numpy conversion.
        batch_features = (
            batch_features.view(batch_features.size(0), -1).cpu().numpy()
        )
        # Row i of the batch output belongs to the i-th key of the batch.
        features_map2.update(zip(keys, batch_features))

In [None]:
# Persist the extracted features as JSON in the current working directory.
# Each dictionary key becomes one column of the frame.
features_df = pd.DataFrame(features_map2)
features_df.to_json("features_js.json")

## Training the model using logistic regression

In [None]:
# Load the metadata table and derive a boolean target from the diagnosis code.
metadata_path = os.path.join(repo_dir, "dataverse_files/", "HAM10000_metadata.csv")
label = pd.read_csv(metadata_path)

# Diagnosis codes (column `dx`) grouped by whether they count as cancer.
cancerous = ["akiec", "bcc", "mel"]
non_cancerous = ["bkl", "df", "nv", "vasc"]

# True exactly for rows whose `dx` is in `cancerous`; every other row,
# including all `non_cancerous` codes, is False.
label["cancer"] = label["dx"].isin(cancerous)
label

In [None]:
# List the .jpg files on disk (without extension) and derive an image id for
# each so the files can be joined with the metadata table.
# os.path.splitext strips only the extension, so a stem that itself contains
# dots is kept intact.
files = [
    os.path.splitext(f)[0]
    for f in os.listdir(original_folder_path)
    if f.endswith(".jpg")
]

temp_files = pd.DataFrame(files, columns=["filename"])
# The image id is the last two "_"-separated tokens of the stem, which drops
# any prefix in front of it (e.g. "prefix_0_ISIC_0001" -> "ISIC_0001").
# Slicing instead of indexing keeps a stem without "_" from raising IndexError.
temp_files["image_id"] = temp_files.filename.apply(
    lambda stem: "_".join(stem.split("_")[-2:])
)

# Left join: every file is kept, even one without a matching metadata row.
label_ = temp_files.merge(label, on="image_id", how="left")

In [None]:
# Reload the saved features and transpose them so that each row holds the
# feature vector of one key.
features = pd.read_json("features_js.json").T

# Attach file names and labels by matching the feature index against the
# `image_id` column.
# NOTE(review): this assumes the keys produced by the dataset are image ids
# (not file names) — confirm against ImageDataset.
merged_data = features.merge(label_, left_index=True, right_on="image_id")
merged_data

In [None]:
# Write the merged feature/label table to the repository root as CSV.
features_csv_path = os.path.join(repo_dir, "features_js.csv")
merged_data.to_csv(features_csv_path)

# Modeling

In [None]:
# Image ids that have at least one file whose name contains "augmented".
# The id is rebuilt from the last two "_"-separated tokens of the file name.
augmented_unique_files = np.unique(
    [
        "_".join(filename.split("_")[-2:])
        for filename in merged_data.filename
        if "augmented" in filename
    ]
)

# Use a set for the membership test: it is O(1) per row instead of a scan of
# the array, and it stays well-defined when there are no augmented files
# (np.unique([]) is an empty float array, which should not be compared
# against strings).
augmented_ids = set(augmented_unique_files.tolist())

# One flag per row of merged_data: True when the row's image has no augmented
# copies and may therefore be placed in the test set.
include_in_testing = [
    image_id not in augmented_ids for image_id in merged_data.image_id
]

In [None]:
# Feature matrix and boolean target.
# The first 1000 columns of merged_data are taken as the features
# (assumes the network produced 1000 values per image — TODO confirm).
x = merged_data.iloc[:, :1000].to_numpy()
y = merged_data["cancer"].to_numpy()
TEST_SIZE = 0.30
RANDOM_STATE = 42  # fixed seed so the split and all metrics are reproducible

# Boolean mask of the rows that are allowed to end up in the test set.
test_candidates = np.asarray(include_in_testing, dtype=bool)

# We want the test set to contain TEST_SIZE (30%) of the *unique* images.
# Rows excluded from testing (oversampled images) never reach the splitter, so
# the fraction passed to sklearn is rescaled to the number of candidate rows.
# (This only works because only a fraction of the pictures is oversampled.)
test_size_sklearn = (
    TEST_SIZE * len(np.unique(merged_data.image_id)) / test_candidates.sum()
)

x_train_, x_test, y_train_, y_test = model_selection.train_test_split(
    x[test_candidates],
    y[test_candidates],
    test_size=test_size_sklearn,
    random_state=RANDOM_STATE,
)

# Rows excluded from testing are always used for training.
x_train = np.concatenate((x_train_, x[~test_candidates]), axis=0)
y_train = np.concatenate((y_train_, y[~test_candidates]), axis=0)

In [None]:
# Sanity check of the split: the four array shapes, the share of all rows
# that ended up in the test set, and the number of distinct image ids.
(
    x_train.shape,
    y_train.shape,
    x_test.shape,
    y_test.shape,
    len(x_test) / len(x),
    len(np.unique(merged_data.image_id)),
)

In [None]:
# Reduce dimensionality with PCA, keeping as many components as are needed to
# explain 90% of the variance.
pca = PCA(n_components=0.9)

# Fit on the training data only. Fitting on train + test would leak test-set
# statistics into the preprocessing and make the evaluation optimistic.
pca.fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

# Check the new shape of the data
print(x_train_pca.shape)

In [None]:
# Scatter plot of the training data in the plane of the first two principal
# components, one colour per class.
plt.figure(figsize=(10, 7))

class_styles = (
    (0, "blue", "Non-Cancerous"),
    (1, "red", "Cancerous"),
)
for class_value, colour, legend_name in class_styles:
    in_class = y_train == class_value
    plt.scatter(
        x_train_pca[in_class, 0],  # first component
        x_train_pca[in_class, 1],  # second component
        c=colour,
        label=legend_name,
        alpha=0.5,
    )

# Axis labels, title and legend
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA of Image Data")
plt.legend()

In [None]:
print("starting model")
# `multi_class` is intentionally not passed: "auto" is already the default,
# and the parameter is deprecated in recent scikit-learn releases.
lin_model = linear_model.LogisticRegression(
    solver="newton-cg", max_iter=5000, class_weight=None
)
print("starting fitting")
# Train and predict on the PCA-reduced features.
# To use the raw features instead, swap in x_train / x_test:
# lin_model.fit(x_train, y_train)
lin_model.fit(x_train_pca, y_train)

# y_pred = lin_model.predict(x_test)
y_pred = lin_model.predict(x_test_pca)

In [None]:
# Confusion matrix of the test-set predictions, wrapped in a display object.
conf_matrix = confusion_matrix(y_test, y_pred)
cmd = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)

# Render it with a blue colour map (others such as 'viridis' or 'plasma'
# work as well).
cmd.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Test-set metrics; precision, recall and F1 treat label 1 as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label=1)
    for scorer in (precision_score, recall_score, f1_score)
)

# Report every metric on its own line.
report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for metric_name, metric_value in report:
    print(f"{metric_name}: {metric_value}")