## CNN Feature Extraction using RestNet

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import methods 
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import os
import torch.nn as nn
import sklearn.model_selection as model_selection
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import linear_model, metrics, model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

## Load the Model


In [None]:
#Using the RestNet 50 model to extract features
model = models.resnet50(pretrained = True)
feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])
feature_extractor.eval()
if torch.cuda.is_available():
    model = model.cuda()

## Data PreProcessing 

In [None]:
class ImageDataset(Dataset):
    def __init__(self, directory, transform=None):
        self.directory = directory
        self.transform = transform
        self.images = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith((".jpg", ".png"))]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        #Do preprocessing here 
        image_path = self.images[idx]
        rgb_image_arr = methods.convert_rgb(image_path)
        normalised_img = methods.z_normalization(rgb_image_arr)
        image = Image.fromarray(normalised_img.astype('uint8'), 'RGB')
        image_tensor = self.transform(image) if self.transform else image
        key = os.path.basename(image_path).removesuffix('.jpg').removesuffix('.png')
        return key, image_tensor

In [None]:
original_folder_path =  '../dataverse_files/HAM10000_images_part_1'
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
dataset = ImageDataset(directory=original_folder_path, transform=transform)
data_loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)


## Feature Extraction

In [None]:
#Extracting features
features_map2 = {}
with torch.no_grad():
    for batch_idx, output in enumerate(data_loader):
        if torch.cuda.is_available():
            images = images.cuda()

        
        key, images = output
        batch_features = model(images) 
        #print(batch_features.shape)
        batch_features = batch_features.view(batch_features.size(0), -1)  # Flatten features
              
        batch_features = batch_features.cpu().numpy()
        
        for i, feature in enumerate(batch_features):
            image_id = batch_idx * data_loader.batch_size + i  # Compute global image ID/index
            features_map2[key[i]] = feature
            print(f'Done for image {image_id}')


In [None]:
#Saving the features
features_df = pd.DataFrame(features_map2)
features_df
features_df.to_json('features.json')
features_df
#np.save('features.npy', features_df)

##Training the model using logistic regression

In [5]:
features = pd.read_json('features.json')
label = pd.read_csv('dataverse_files/HAM10000_metadata.csv')

features = features.T
label = label.set_index('image_id') #TODO: Does it make it fast or what?

merged_data = features.merge(label, left_index=True, right_on='image_id') #TODO: what does this do? why can't we switch features and label

merged_data['cancer'] = False
cancerous = ["akiec", "bcc", "mel"]
non_cancerous = ["bkl", "df", "nv", "vasc"]
merged_data.loc[merged_data['dx'].isin(cancerous), "cancer"] = True
merged_data.loc[merged_data['dx'].isin(non_cancerous), "cancer"] = False
merged_data.drop(columns=['lesion_id', 'dx_type','age','sex','localization','dataset', 'dx'], inplace=True)

x = merged_data.drop(columns=['cancer'], axis=1)
y = merged_data['cancer']

x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, random_state=42)

print("starting model")
model = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000)
print("starting fitting")
model.fit(x_train, y_train) #TODO: which model to pick and how many cores to run on?

y_pred = model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

# plt.scatter()


starting model
starting fitting
0.811


TypeError: scatter() missing 2 required positional arguments: 'x' and 'y'