# Feature Extraction and Modeling

# General Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms as transforms

from torchvision.models import resnet50, ResNet50_Weights

from torch.utils.data import DataLoader
import os
import sklearn.model_selection as model_selection
import sklearn.linear_model as linear_model
import subprocess
from tqdm import tqdm

from sklearn.decomposition import PCA, KernelPCA

from methods import (
    get_labels,
    ImageHeuristicFeatureExtractor,
    standardize_features,
    ImageDataset,
    merge_features_with_labels,
    not_oversampled_images,
    calculate_test_size,
    plot_confusion_matrix,
    plot_low_dim_components,
)


from sklearn.preprocessing import StandardScaler
from sklearn import svm

from sklearn.manifold import TSNE
import shap


%load_ext autoreload
%autoreload 2

In [None]:
# Locate the repository root via git so that every path below is independent of
# the notebook's working directory. check_output raises CalledProcessError when
# git exits with an error (e.g. outside a repository) instead of silently
# yielding an empty string that would turn all paths into relative ones.
repo_dir = (
    subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
    .rstrip()
    .decode("utf-8")
)

# Folder holding the raw images; switch to one of the alternatives if needed.
original_folder_path = os.path.join(repo_dir, "dataverse_files/HAM10000_images_part_1")
# original_folder_path =  os.path.join(repo_dir, 'dataverse_files/HAM10000_images_part_1_2')
# original_folder_path = os.path.join(repo_dir, "dataverse_files/JS_Selection")

# Output folders for preprocessed images and extracted features.
processed_folder_path = os.path.join(repo_dir, "preprocessed_images")
features_folder_path = os.path.join(repo_dir, "features_extracted")

os.makedirs(processed_folder_path, exist_ok=True)
os.makedirs(features_folder_path, exist_ok=True)

# The feature extractors below read their images from this folder.
data_folder_path = original_folder_path

# Label table for the images, loaded by the project helper.
label = get_labels(repo_dir)

# Feature Extraction

## (A) Feature Extraction using ResNet50 (CNN)

In [None]:
# Use the ResNet-50 model with its default pretrained weights to extract features
model = resnet50(weights=ResNet50_Weights.DEFAULT)

In [None]:
# Preprocessing applied to every image before it is fed to the network:
# resize, centre crop, tensor conversion and per-channel normalisation.
# NOTE(review): Resize(232) scales the shorter edge to 232 px, so the 450 px
# CenterCrop is larger than the resized image; torchvision documents that
# CenterCrop pads with 0 in that case - confirm that this padding is intended.
transform = transforms.Compose(
    [
        transforms.Resize(232),
        transforms.CenterCrop(450),  # adapted to use larger region
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
# Dataset over the image folder; the extraction loop unpacks each batch as
# (key, images).
dataset = ImageDataset(directory=data_folder_path, transform=transform)
# Fixed iteration order (no shuffling), batches of 16, loading in the main process.
data_loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)

In [None]:
# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

In [None]:
model = model.to(device)  # Move the model to the selected device
model.eval()  # Set the model to evaluation mode

# Maps every key yielded by the data loader to the flattened model output
# computed for the corresponding image.
features_map2 = {}
with torch.no_grad():  # inference only, no gradients required
    for keys, images in tqdm(data_loader):
        images = images.to(device)  # Move images to the same device as the model

        # The unmodified network is applied, so each row is the output of its
        # final layer, flattened to one vector per image.
        batch_features = model(images)
        batch_features = batch_features.view(batch_features.size(0), -1)

        # Move features to the CPU for the NumPy conversion
        batch_features = batch_features.cpu().numpy()

        # Keys and feature rows are aligned within the batch
        for image_key, feature in zip(keys, batch_features):
            features_map2[image_key] = feature

In [None]:
# Save the raw features: the DataFrame is built from the key -> feature vector
# mapping (one column per key) and written as JSON into the features folder.
features_df = pd.DataFrame(features_map2)
cnn_features_path = os.path.join(features_folder_path, "features.json")
features_df.to_json(cnn_features_path)

### Load CNN Features (also previously generated)

In [None]:
# Load the saved CNN features and combine them with the labels dataframe.
# This cell can be run on its own when features.json was generated previously.
features_path = os.path.join(features_folder_path, "features.json")
cnn_features = merge_features_with_labels(
    features_path=features_path,
    labels_df=label,
    export=True,
)
# Display the merged table as the cell output
cnn_features

## (B) Heuristic Feature Extraction
Attention! The order of the features produced by the CNN and by this class is not necessarily the same!

In [None]:
# Build the heuristic extractor from the image folder and the label table
# re-indexed by image_id.
extractor = ImageHeuristicFeatureExtractor(
    data_folder_path, label.set_index("image_id")
)

# Indexed below by feature family ("rgb", "hsv", "glcm"); every entry unpacks
# into a (features, labels) pair.
feature_label_data = extractor.get_feature_and_label_arrays()
df_heuristic = (
    extractor.return_one_df()
)  # effectively dummy df with the filenames and image ids

x_rgb, y_rgb = feature_label_data["rgb"]
x_hsv, y_hsv = feature_label_data["hsv"]
x_glcm, y_glcm = feature_label_data["glcm"]
# Gabor features are currently disabled:
# x_gabor, y_gabor = feature_label_data['gabor']

In [None]:
# Standardise the RGB and HSV features with the project helper (PCA enabled,
# n_components=0.9). The misspelled name `x_hsv_stanardized` is kept because
# other cells refer to it.
x_rgb_standardized = standardize_features(x_rgb, use_pca=True, n_components=0.9)
x_hsv_stanardized = standardize_features(x_hsv, use_pca=True, n_components=0.9)

# Column-wise concatenation in the order rgb, hsv, glcm; the GLCM features are
# appended without passing through standardize_features.
x_heuristic = np.concatenate((x_rgb_standardized, x_hsv_stanardized, x_glcm), axis=1)
# The HSV labels are used for the combined matrix - presumably identical to
# y_rgb / y_glcm; TODO confirm.
y_heuristic = y_hsv
np.shape(x_heuristic)

In [None]:
# One name per feature column (prefix plus running index within its family),
# listed in the order rgb, hsv, glcm. The column count of each family is taken
# from the length of its first row.
names_heuristic_features = (
    [f"rgb{column}" for column in range(len(x_rgb_standardized[0]))]
    + [f"hsv{column}" for column in range(len(x_hsv_stanardized[0]))]
    + [f"glcm{column}" for column in range(len(x_glcm[0]))]
)

len(names_heuristic_features)

In [None]:
# Persist the heuristic features, labels and feature names in the features
# folder. The paths are given without extension; the cell that reloads them
# reads the arrays back from the corresponding ".npy" files.
np.save(os.path.join(features_folder_path, "x_heuristic"), x_heuristic)
np.save(os.path.join(features_folder_path, "y_heuristic"), y_heuristic)
np.save(
    os.path.join(features_folder_path, "names_heuristic_features"),
    names_heuristic_features,
)
# The dataframe is written as CSV (including its index) to a file named
# "df_heuristic" without extension.
df_heuristic.to_csv(os.path.join(features_folder_path, "df_heuristic"), index=True)

In [None]:
# Reload the previously saved heuristic features, labels, feature names and
# dataframe, so that the extraction does not have to be repeated.
x_heuristic = np.load(os.path.join(features_folder_path, "x_heuristic.npy"))
y_heuristic = np.load(os.path.join(features_folder_path, "y_heuristic.npy"))
names_heuristic_features = np.load(
    os.path.join(features_folder_path, "names_heuristic_features.npy")
)
# The first CSV column is restored as the index.
df_heuristic = pd.read_csv(
    os.path.join(features_folder_path, "df_heuristic"), index_col=0
)
df_heuristic

# Modeling

## Generate the Train and Test Split

In [None]:
# Define which x and y to use

# CNN: the first 1000 columns of the merged table are taken as features
# (presumably one column per CNN output value - TODO confirm) and the "cancer"
# column as target.
x = cnn_features.iloc[:, :1000].to_numpy()
y = cnn_features["cancer"].to_numpy()
df_ = cnn_features

# Heuristic: inactive alternative, kept inside a string literal. To use the
# heuristic features, run the assignments below instead of the CNN ones.
"""
x = x_heuristic
y = y_heuristic
df_ = df_heuristic
np.shape(x)
"""

In [None]:
# Only include those files in testing that have not been oversampled.
# include_in_testing is used as a mask over the rows of x / y below.
include_in_testing = not_oversampled_images(df_)

TEST_SIZE = 0.30

# Calculate the adjusted test size handed to scikit-learn by the project helper
test_size_sklearn = calculate_test_size(df_, TEST_SIZE, include_in_testing)

# Split only the rows that are eligible for testing. The fixed random_state
# makes the split - and therefore every metric computed below - reproducible
# across notebook runs.
x_train_, x_test, y_train_, y_test = model_selection.train_test_split(
    x[include_in_testing],
    y[include_in_testing],
    test_size=test_size_sklearn,
    random_state=0,
)
# All rows excluded from testing are appended to the training set.
x_train = np.concatenate((x_train_, x[np.invert(include_in_testing)]), axis=0)
y_train = np.concatenate((y_train_, y[np.invert(include_in_testing)]), axis=0)

In [None]:
# Sanity check of the split: array shapes, the fraction of all samples that
# ended up in the test set, and the number of distinct image ids.
(
    np.shape(x_train),
    np.shape(y_train),
    np.shape(x_test),
    np.shape(y_test),
    len(x_test) / len(x),
    len(np.unique(df_.image_id)),
)

## Dimensionality Reduction
### PCA

In [None]:
# Initialize PCA; a fractional n_components selects the number of components
# by the share of variance to be explained.
pca = PCA(n_components=0.999)

# Fit on the training data only, so that no information from the test set
# leaks into the projection, then apply the same projection to both sets.
pca.fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

# Check the new shape of the data
print(x_train_pca.shape)

In [None]:
# Plot the first two PCA components of the training data against the labels
plot_low_dim_components(x_train_pca, y_train, component_1=0, component_2=1)

### Kernel PCA (slow)

In [None]:
# Initialize Kernel PCA with 25 components
kpca = KernelPCA(n_components=25, kernel="rbf")  # kernel: rbf, sigmoid

# Fit on the training data only, so that no information from the test set
# leaks into the projection, then apply the same projection to both sets.
kpca.fit(x_train)
x_train_kpca = kpca.transform(x_train)
x_test_kpca = kpca.transform(x_test)

# Check the new shape of the data
print(x_train_kpca.shape)

In [None]:
# Plot the Kernel PCA projection of the training data against the labels
plot_low_dim_components(x_train_kpca, y_train, label="kPCA")

### t-SNE (Visualisation *only*)

In [None]:
# Set the parameters for t-SNE (2-D embedding, fixed seed for reproducibility)
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
tsne = TSNE(n_components=2, random_state=0, perplexity=15, n_iter=2000, verbose=1)

# Perform t-SNE on the data. Train and test samples are embedded together;
# the result is used for visualisation only, not as model input.
X_tsne = tsne.fit_transform(np.concatenate((x_train, x_test), axis=0))

In [None]:
# Plot the t-SNE embedding; the labels are concatenated in the same
# (train, test) order as the samples that were embedded.
plot_low_dim_components(
    X_tsne, np.concatenate((y_train, y_test), axis=0), label="t-SNE"
)

### Use Lower Dimensional Features?

In [None]:
# Choose the representation used by the models below: keep the original
# features (default, a no-op assignment) or assign one of the reduced variants
# to both x_train and x_test.
x_train = x_train  # alternatives: x_train_kpca, x_train_pca
x_test = x_test  # alternatives: x_test_kpca, x_test_pca

## LogisticRegression

In [None]:
# Logistic regression with balanced class weights and the newton-cg solver.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version.
prediction_model = linear_model.LogisticRegression(
    solver="newton-cg", multi_class="auto", max_iter=5000, class_weight="balanced"
)

prediction_model.fit(x_train, y_train)

# Predictions on both sets; only the test confusion matrix is plotted, the
# training one can be enabled for comparison.
y_train_pred = prediction_model.predict(x_train)
y_pred = prediction_model.predict(x_test)
# plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_test, y_pred)

## SVM

In [None]:
from sklearn.pipeline import make_pipeline

# Scale the features and classify with an SVM in a single pipeline. Bundling
# both steps guarantees that the classifier is trained and evaluated on scaled
# data, while `prediction_model.predict` still accepts the unscaled features
# used everywhere else in this notebook.
prediction_model = make_pipeline(
    StandardScaler(),
    svm.SVC(
        kernel="poly",
        C=1.0,
        gamma=0.5,
        class_weight="balanced",
    ),
)

# Train the classifier; the scaler is fitted on the training data only
prediction_model.fit(x_train, y_train)

# Keep the fitted scaler and the scaled arrays available under their names
scaler = prediction_model.named_steps["standardscaler"]
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

# Make predictions (the pipeline scales x_test internally)
y_pred = prediction_model.predict(x_test)
plot_confusion_matrix(y_test, y_pred)

## SHAP Values
Not too sure how helpful this is, but we can see how much individual features influence the result.

In [None]:
# Create the SHAP Explainer for the most recently fitted prediction_model.
# The model's predict function is explained, with x_train passed as the
# second argument of shap.Explainer.

# Without names
explainer = shap.Explainer(prediction_model.predict, x_train, max_evals=2500, verbose=1)

# With names (only defined for heuristic model)
# explainer = shap.Explainer(prediction_model.predict, x_train, max_evals=2500, verbose=1, feature_names=names_heuristic_features)

# Compute the SHAP values for the test samples
shap_values = explainer(x_test)

In [None]:
fig = plt.figure()
# show=False stops SHAP from calling plt.show() itself, so the figure is still
# open and complete when it is written to disk below.
shap.summary_plot(shap_values, x_test, show=False)

# Save the current figure
save_path = os.path.join(features_folder_path, "shap_values.png")
fig.savefig(save_path, dpi=150, bbox_inches="tight")

# Display the plot only after it has been saved
plt.show()