# Transfer learning with disease and non disease data



### Import tensorflow

In [None]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


### Import modules and download the diseases and none diseases dataset.

In [None]:
import urllib.request
import os
import zipfile
import random
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.optimizers import RMSprop
from shutil import copyfile




In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Set the path to your desired directory
new_directory_path = '/content/drive/MyDrive/moffitt/'

# Change the current working directory
os.chdir(new_directory_path)


In [None]:
data_file_name = "moffitt.zip"
download_dir = '/tmp/'
zip_ref = zipfile.ZipFile(data_file_name, 'r')
zip_ref.extractall(download_dir)
zip_ref.close()


KeyboardInterrupt



Check that the dataset has the expected number of examples.

In [None]:
print("Number of diseases images:",len(os.listdir('/tmp/moffitt/classify_cytology/diseases/')))
print("Number of no diseases images:", len(os.listdir('/tmp/moffitt/classify_cytology/no_diseases/')))



Number of diseases images: 20


FileNotFoundError: [Errno 2] No such file or directory: '/tmp/moffitt/classify_cytology/no_diseases/'

In [None]:
import shutil

try:
    shutil.rmtree('/tmp/diseases-v-nondiseases')
    print("Directories deleted successfully.")
except FileNotFoundError:
    print("Directories not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Directories deleted successfully.


Create some folders that will store the training and test data.
- There will be a training folder and a testing folder.
- Each of these will have a subfolder for diseases and another subfolder for none diseases.

In [None]:
try:
    os.mkdir('/tmp/diseases-v-nondiseases')
    os.mkdir('/tmp/diseases-v-nondiseases/training')
    os.mkdir('/tmp/diseases-v-nondiseases/testing')
    os.mkdir('/tmp/diseases-v-nondiseases/training/diseases')
    os.mkdir('/tmp/diseases-v-nondiseases/training/non_diseases')
    os.mkdir('/tmp/diseases-v-nondiseases/testing/diseases')
    os.mkdir('/tmp/diseases-v-nondiseases/testing/non_diseases')
except OSError:
    pass

### Split data into training and test sets

- The following code put first checks if an image file is empty (zero length)
- Of the files that are not empty, it puts 90% of the data into the training set, and 10% into the test set.

In [None]:
import random
from shutil import copyfile
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):
    files = []
    for filename in os.listdir(SOURCE):
        file = SOURCE + filename
        if os.path.getsize(file) > 0:
            files.append(filename)
        else:
            print(filename + " is zero length, so ignoring.")

    training_length = int(len(files) * SPLIT_SIZE)
    testing_length = int(len(files) - training_length)
    shuffled_set = random.sample(files, len(files))
    training_set = shuffled_set[0:training_length]
    testing_set = shuffled_set[training_length:]

    for filename in training_set:
        this_file = SOURCE + filename
        destination = TRAINING + filename
        copyfile(this_file, destination)

    for filename in testing_set:
        this_file = SOURCE + filename
        destination = TESTING + filename
        copyfile(this_file, destination)


DISEASES_SOURCE_DIR = "/tmp/moffitt/classify_cytology/diseases/"
TRAINING_DISEASES_DIR = "/tmp/diseases-v-nondiseases/training/diseases/"
TESTING_DISEASES_DIR = "/tmp/diseases-v-nondiseases/testing/diseases/"
NO_DISEASES_SOURCE_DIR = "/tmp/moffitt/classify_cytology/no_diseases/"
TRAINING_NO_DISEASES_DIR = "/tmp/diseases-v-nondiseases/training/non_diseases/"
TESTING_NO_DISEASES_DIR = "/tmp/diseases-v-nondiseases/testing/non_diseases/"

split_size = 0.95
split_data(DISEASES_SOURCE_DIR, TRAINING_DISEASES_DIR, TESTING_DISEASES_DIR, split_size)
split_data(NO_DISEASES_SOURCE_DIR, TRAINING_NO_DISEASES_DIR, TESTING_NO_DISEASES_DIR, split_size)
print("Number of images:",len(os.listdir('/tmp/diseases-v-nondiseases/training/')))


Number of images: 2


In [None]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Set your source and destination directories
source_directory = '/tmp/diseases-v-nondiseases/testing/'
destination_directory = '/content/drive/MyDrive/moffitt/testing1/'

# Copy files from source to destination
shutil.copytree(source_directory, destination_directory)

# Repeat for the training directory
source_directory = '/tmp/diseases-v-nondiseases/training/'
destination_directory = '/content/drive/MyDrive/moffitt/training1/'

shutil.copytree(source_directory, destination_directory)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/moffitt/training1/'

In [None]:
print("Number of training diseases images", len(os.listdir('/content/drive/MyDrive/moffitt/training1/diseases/')))
print("Number of training no diseases images", len(os.listdir('/content/drive/MyDrive/moffitt/training1/non_diseases/')))
print("Number of testing diseases images", len(os.listdir('/content/drive/MyDrive/moffitt/testing1/diseases/')))
print("Number of testing no diseases images", len(os.listdir('/content/drive/MyDrive/moffitt/testing1/non_diseases/')))

Number of training diseases images 1603
Number of training no diseases images 1517
Number of testing diseases images 85
Number of testing no diseases images 80


In [None]:
import os
import pandas as pd

# Directories
TRAINING_DISEASES_DIR = "/content/drive/MyDrive/moffitt/training1/diseases/"
TRAINING_NO_DISEASES_DIR = "/content/drive/MyDrive/moffitt/training1/non_diseases/"

# Function to get image paths and labels
def get_image_paths_and_labels(directory, label):
  if label=="Disease":
    image_paths = ["disease_"+ img for img in os.listdir(directory)]
  else:
    image_paths = ["non_disease_"+ img for img in os.listdir(directory)]
  labels = [label] * len(image_paths)
  return image_paths, labels

# Creating DataFrame for diseases
diseases_train_paths, diseases_train_labels = get_image_paths_and_labels(TRAINING_DISEASES_DIR, label="Disease")
diseases_df = pd.DataFrame({'Image_Path': diseases_train_paths,
                            'Label': diseases_train_labels})

# Creating DataFrame for no diseases
no_diseases_train_paths, no_diseases_train_labels = get_image_paths_and_labels(TRAINING_NO_DISEASES_DIR, label="No Disease")
no_diseases_df = pd.DataFrame({'Image_Path': no_diseases_train_paths ,
                               'Label': no_diseases_train_labels })

# Concatenating both DataFrames
full_df2 = pd.concat([diseases_df, no_diseases_df], ignore_index=True)
# Save the DataFrame to the same directory as training
output_df_path = os.path.join("/content/drive/MyDrive/moffitt/", "full_df.csv")
full_df2.to_csv(output_df_path, index=False)
# Display the DataFrame
print(full_df2)



                   Image_Path       Label
0         disease_23T (7).jpg     Disease
1         disease_72T (4).jpg     Disease
2         disease_92T (4).jpg     Disease
3          disease_92 (4).jpg     Disease
4           disease_4 (9).jpg     Disease
...                       ...         ...
3115  non_disease_71T (3).jpg  No Disease
3116   non_disease_34 (3).jpg  No Disease
3117  non_disease_69T (5).jpg  No Disease
3118  non_disease_50T (5).jpg  No Disease
3119   non_disease_2T (2).jpg  No Disease

[3120 rows x 2 columns]


In [None]:
print("Number of files in combined directory:", len(os.listdir('/content/drive/MyDrive/moffitt/training_combined/')))

Number of files in combined directory: 1914


Check that the training and test sets are the expected lengths.

In [None]:

print("Number of training diseases images", len(os.listdir('/tmp/diseases-v-nondiseases/training/diseases')))
print("Number of training no diseases images", len(os.listdir('/tmp/diseases-v-nondiseases/training/non_diseases')))
print("Number of testing diseases images", len(os.listdir('/tmp/diseases-v-nondiseases/testing/diseases/')))
print("Number of testing no diseases images", len(os.listdir('/tmp/diseases-v-nondiseases/testing/non_diseases/')))



Number of training diseases images 1603
Number of training no diseases images 1517
Number of testing diseases images 85
Number of testing no diseases images 80


In [None]:
import os
import shutil

# Set source directories
non_diseases_directory = '/content/drive/MyDrive/moffitt/training1/non_diseases/'
diseases_directory = '/content/drive/MyDrive/moffitt/training1/diseases/'

# Set destination directory
combined_directory = '/content/drive/MyDrive/moffitt/training_combined/'

# Create the combined directory if it doesn't exist
os.makedirs(combined_directory, exist_ok=True)

# Copy contents of non_diseases_directory to combined_directory
for filename in os.listdir(non_diseases_directory):
    source_file = os.path.join(non_diseases_directory, filename)
    destination_file = os.path.join(combined_directory, "non_disease_"+filename)
    shutil.copyfile(source_file, destination_file)

# Copy contents of diseases_directory to combined_directory
for filename in os.listdir(diseases_directory):
    source_file = os.path.join(diseases_directory, filename)
    destination_file = os.path.join(combined_directory, "disease_"+filename)
    shutil.copyfile(source_file, destination_file)

# Check the combined directory
print("Number of files in combined directory:", len(os.listdir(combined_directory)))


Number of files in combined directory: 3120


### K fold cross validation

In [None]:
!pip install keras-layer-normalization

Collecting keras-layer-normalization
  Downloading keras-layer-normalization-0.16.0.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: keras-layer-normalization
  Building wheel for keras-layer-normalization (setup.py) ... [?25l[?25hdone
  Created wheel for keras-layer-normalization: filename=keras_layer_normalization-0.16.0-py3-none-any.whl size=4653 sha256=0349c2697b5012543490433b949087ec9ef23aa420fb8e507bbbd14c12e0bbb7
  Stored in directory: /root/.cache/pip/wheels/ed/3a/4b/21db23c0cc56c4b219616e181f258eb7c57d36cc5d056fae9a
Successfully built keras-layer-normalization
Installing collected packages: keras-layer-normalization
Successfully installed keras-layer-normalization-0.16.0


In [None]:
import pandas as pd
import numpy as np
import os.path
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from PIL import Image

%matplotlib inline
from keras.applications import ResNet50,ResNet101
import cv2
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import applications
from keras.models import Model
from keras import optimizers
from keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping
from keras.preprocessing import image

In [None]:
from sklearn.model_selection import StratifiedKFold , KFold ,RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
train = pd.read_csv('/content/drive/MyDrive/moffitt/full_df.csv')
train.head()

Unnamed: 0,Image_Path,Label
0,disease_23T (7).jpg,Disease
1,disease_72T (4).jpg,Disease
2,disease_92T (4).jpg,Disease
3,disease_92 (4).jpg,Disease
4,disease_4 (9).jpg,Disease


In [None]:
df = train.copy()

In [None]:
Disease = train[train["Label"]=='Disease']
No_Disease = train[train["Label"]=='No Disease']


df = pd.concat([df,Disease])
df = pd.concat([df,No_Disease])

In [None]:
TRAIN_PATH = '/content/drive/MyDrive/moffitt/training_combined/'

In [None]:
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping

def get_model(IMG_SIZE):
    base_model =applications.ResNet50(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
    add_model = Sequential()
    add_model.add(Flatten(input_shape=base_model.output_shape[1:]))
    add_model.add(Dropout(0.3))
    add_model.add(Dense(64, activation='relu'))
    add_model.add(Dropout(0.4))

    add_model.add(Dense(2, activation='sigmoid'))

    model = Model(inputs=base_model.input, outputs=add_model(base_model.output))

    model.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
                  metrics=['accuracy'])
    return model
#     model.summary()

In [None]:
# Storing the average of all predictions

main_pred = []
data_kfold = pd.DataFrame()

# Creating X, Y for training

train_y = df.Label
train_x = df.drop(['Label'],axis=1)

In [None]:
IMG_SIZE = 150
BATCH_SIZE = 20
EPOCHS = 10
N_SPLIT = 10


In [None]:
train_datagen = ImageDataGenerator(rescale = 1./255,
                                   shear_range = 0.2,
                                   zoom_range = 0.2,
                                   horizontal_flip = True)

validation_datagen = ImageDataGenerator(rescale = 1./255)

# Specify the directory to save the models
save_dir = '/content/drive/MyDrive/moffitt/model/'

# Check if the directory exists, if not, create it
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

kfold = StratifiedKFold(n_splits=N_SPLIT,shuffle=True,random_state=42)
j = 0
for train_idx, val_idx in list(kfold.split(train_x,train_y)):
    x_train_df = df.iloc[train_idx]
    x_valid_df = df.iloc[val_idx]
    j+=1


    training_set = train_datagen.flow_from_dataframe(dataframe=x_train_df, directory=TRAIN_PATH,
                                                 x_col="Image_Path", y_col="Label",
                                                 class_mode="categorical",
                                                 target_size=(IMG_SIZE,IMG_SIZE), batch_size=BATCH_SIZE)

    validation_set = validation_datagen.flow_from_dataframe(dataframe=x_valid_df, directory=TRAIN_PATH,
                                                 x_col="Image_Path", y_col="Label",
                                                 class_mode="categorical",
                                                 target_size=(IMG_SIZE,IMG_SIZE), batch_size=BATCH_SIZE)

    model_test = get_model(IMG_SIZE)


    history = model_test.fit_generator(
        training_set,
        validation_data=validation_set,
        epochs=EPOCHS,
        steps_per_epoch=x_train_df.shape[0] // BATCH_SIZE,
    )
    # Save the model after training
    model_filename = f'model_{j}.h5'
    model_path = os.path.join(save_dir, model_filename)
    model_test.save(model_path)
    print(f"Model {j} saved to {model_path}")

Found 5616 validated image filenames belonging to 2 classes.
Found 624 validated image filenames belonging to 2 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


  history = model_test.fit_generator(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


Model 1 saved to /content/drive/MyDrive/moffitt/model/model_1.h5
Found 5616 validated image filenames belonging to 2 classes.
Found 624 validated image filenames belonging to 2 classes.


  history = model_test.fit_generator(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

In [None]:
from keras.models import load_model

# Load the entire model
loaded_model = load_model('your_model.h5')

# Load the model architecture from JSON and load the learned weights
with open('your_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights('your_model_weights.h5')


OSError: No file or directory found at your_model.h5

In [None]:
IMG_SIZE = (150, 150)
k_folds=10

core_idg = ImageDataGenerator(samplewise_center=True,
                              samplewise_std_normalization=True,
                              horizontal_flip = True,
                              vertical_flip = False,
                              height_shift_range= 0.05,
                              width_shift_range=0.1,
                              rotation_range=5,
                              shear_range = 0.1,
                              fill_mode = 'reflect',
                              zoom_range=0.15)

# Training with K-fold cross validation
kf = KFold(n_splits=k_folds, random_state=None, shuffle=True)
X= np.array(full_df2['Image_Path'])
i = 1
for train_index, test_index in kf.split(X):
    trainData = X[train_index]
    testData = X[test_index]
    ## create train, valid dataframe and thus train_gen , valid_gen for each fold-loop
    train_df = full_df2.loc[full_df2['Image_Path'].isin(list(trainData))]
    valid_df = full_df2.loc[full_df2['Image_Path'].isin(list(testData))]
    #create model object
    all_labels = [ "Diseases" , "No Diseases" ]
    train_gen = core_idg.flow_from_dataframe(dataframe=train_df,
                                         directory="None",
                                         x_col = 'Image_Path',
                                         y_col = 'Label',
                                         class_mode = 'categorical',
                                         classes = all_labels,
                                         target_size = IMG_SIZE,
                                         color_mode = 'rgb',
                                         batch_size = 64)
    valid_gen = core_idg.flow_from_dataframe(dataframe=valid_df,
                                         directory="None",
                                         x_col = 'Image_Path',
                                         y_col = 'Label',
                                         class_mode = 'categorical',
                                         classes = all_labels,
                                         target_size = IMG_SIZE,
                                         color_mode = 'rgb',
                                         batch_size = 256)

### Data augmentation (try adjusting the parameters)!

Here, you'll use the `ImageDataGenerator` to perform data augmentation.  
- Things like rotating and flipping the existing images allows you to generate training data that is more varied, and can help the model generalize better during training.  
- You can also use the data generator to apply data augmentation to the validation set.

You can use the default parameter values for a first pass through this lab.
- Later, try to experiment with the parameters of `ImageDataGenerator` to improve the model's performance.
- Try to drive reach 99.9% validation accuracy or better.

In [None]:
!rm -rf VALIDATION_DIR

In [None]:

TRAINING_DIR = "/tmp/diseases-v-nondiseases/trainingkfolds/"
# Experiment with your own parameters to reach 99.9% validation accuracy or better
train_datagen = ImageDataGenerator(rescale=1./255,
      rotation_range=40,
      width_shift_range=0.2,
      height_shift_range=0.2,
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True,
      fill_mode='nearest')
train_generator = train_datagen.flow_from_directory(TRAINING_DIR,
                                                    batch_size=10,
                                                    class_mode='binary',
                                                    target_size=(150, 150))

VALIDATION_DIR = "/tmp/diseases-v-nondiseases/validationkfolds/"

validation_datagen = ImageDataGenerator(rescale=1./255)
validation_generator = validation_datagen.flow_from_directory(VALIDATION_DIR,
                                                              batch_size=40,
                                                              class_mode='binary',
                                                              target_size=(150, 150))



### Get and prepare the model

You'll be using the `InceptionV3` model.  
- Since you're making use of transfer learning, you'll load the pre-trained weights of the model.
- You'll also freeze the existing layers so that they aren't trained on your downstream task with the data.
- You'll also get a reference to the last layer, 'mixed7' because you'll add some layers after this last layer.

In [None]:
weights_url = "https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5"
weights_file = "inception_v3.h5"
urllib.request.urlretrieve(weights_url, weights_file)

# Instantiate the model
pre_trained_model = InceptionV3(input_shape=(150, 150, 3),
                                include_top=False,
                                weights=None)

# load pre-trained weights
pre_trained_model.load_weights(weights_file)

# freeze the layers
for layer in pre_trained_model.layers:
    layer.trainable = False

# pre_trained_model.summary()

last_layer = pre_trained_model.get_layer('mixed7')
print('last layer output shape: ', last_layer.output_shape)
last_output = last_layer.output



### Add layers
Add some layers that you will train on the cats and dogs data.
- `Flatten`: This will take the output of the `last_layer` and flatten it to a vector.
- `Dense`: You'll add a dense layer with a relu activation.
- `Dense`: After that, add a dense layer with a sigmoid activation.  The sigmoid will scale the output to range from 0 to 1, and allow you to interpret the output as a prediction between two categories.

Then create the model object.

In [None]:
# Flatten the output layer to 1 dimension
x = layers.Flatten()(last_output)
# Add a fully connected layer with 1,024 hidden units and ReLU activation
x = layers.Dense(1024, activation='relu')(x)
# Add a final sigmoid layer for classification
x = layers.Dense(1, activation='sigmoid')(x)

model = Model(pre_trained_model.input, x)


### Train the model
Compile the model, and then train it on the test data using `model.fit`
- Feel free to adjust the number of epochs.  This project was originally designed with 20 epochs.
- For the sake of time, you can use fewer epochs (2) to see how the code runs.
- You can ignore the warnings about some of the images having corrupt EXIF data. Those will be skipped.

In [None]:

# compile the model
model.compile(optimizer=RMSprop(lr=0.0001),
              loss='binary_crossentropy',
              metrics=['acc'])

# train the model (adjust the number of epochs from 1 to improve performance)
history = model.fit(
            train_generator,
            validation_data=validation_generator,
            epochs=15,
            verbose=1)

### Visualize the training and validation accuracy

You can see how the training and validation accuracy change with each epoch on an x-y plot.

In [None]:
%matplotlib inline

import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve a list of list results on training and test data
# sets for each training epoch
#-----------------------------------------------------------
acc=history.history['acc']
val_acc=history.history['val_acc']
loss=history.history['loss']
val_loss=history.history['val_loss']

epochs=range(len(acc)) # Get number of epochs

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.plot(epochs, acc, 'r', "Training Accuracy")
plt.plot(epochs, val_acc, 'b', "Validation Accuracy")
plt.title('Training and validation accuracy')
plt.figure()



### Predict on a test image

You can upload any image and have the model predict whether it's a disease image or a no disease image.
- Find an image of a disease or no disease
- Run the following code cell.  It will ask you to upload an image.
- The model will print "is a disease image" or "is a no disease image" depending on the model's prediction.

In [None]:
import numpy as np
from google.colab import files
from tensorflow.keras.utils import load_img, img_to_array
os.chdir('/content/')

uploaded = files.upload()

for fn in uploaded.keys():

  # predicting images
  path = '/content/' + fn
  img = load_img(path, target_size=(150, 150))
  x = img_to_array(img)
  x /= 255
  x = np.expand_dims(x, axis=0)

  image_tensor = np.vstack([x])
  classes = model.predict(image_tensor)
  print(classes[0])
  if classes[0]>0.5:
    print(fn + " is no disease")
  else:
    print(fn + " is disease")