In [295]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='12umDKmXJ8--ZmuiTrchSQRCs8SmRl12h',
dest_path='content/mammographic_images.zip',
unzip=True)

In [296]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
import os 
import cv2 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score

In [297]:
labels = pd.read_csv("/content/content/mammography_images/Training_set.csv") 
labels.head() #

Unnamed: 0,filename,label
0,Image_1.jpg,Density3Benign
1,Image_2.jpg,Density1Benign
2,Image_3.jpg,Density1Malignant
3,Image_4.jpg,Density1Benign
4,Image_5.jpg,Density1Malignant


In [298]:
labels.tail()

Unnamed: 0,filename,label
5719,Image_5720.jpg,Density2Malignant
5720,Image_5721.jpg,Density2Malignant
5721,Image_5722.jpg,Density2Malignant
5722,Image_5723.jpg,Density1Benign
5723,Image_5724.jpg,Density3Benign


In [299]:
file_paths = [[fname, '/content/content/mammography_images/train/' + fname] for fname in labels['filename']]

In [300]:
# Confirm if number of images is same as number of labels given
if len(labels) == len(file_paths):
    print('Number of labels i.e. ', len(labels), 'matches the number of filenames i.e. ', len(file_paths))
else:
    print('Number of labels does not match the number of filenames')

Number of labels i.e.  5724 matches the number of filenames i.e.  5724


In [301]:
images = pd.DataFrame(file_paths, columns=['filename', 'filepaths'])
images.head()

Unnamed: 0,filename,filepaths
0,Image_1.jpg,/content/content/mammography_images/train/Imag...
1,Image_2.jpg,/content/content/mammography_images/train/Imag...
2,Image_3.jpg,/content/content/mammography_images/train/Imag...
3,Image_4.jpg,/content/content/mammography_images/train/Imag...
4,Image_5.jpg,/content/content/mammography_images/train/Imag...


In [302]:
train_data = pd.merge(images, labels, how = 'inner', on = 'filename')
train_data.head()

Unnamed: 0,filename,filepaths,label
0,Image_1.jpg,/content/content/mammography_images/train/Imag...,Density3Benign
1,Image_2.jpg,/content/content/mammography_images/train/Imag...,Density1Benign
2,Image_3.jpg,/content/content/mammography_images/train/Imag...,Density1Malignant
3,Image_4.jpg,/content/content/mammography_images/train/Imag...,Density1Benign
4,Image_5.jpg,/content/content/mammography_images/train/Imag...,Density1Malignant


In [303]:
data = [] # initialize an empty numpy array
image_size = 50 # image size taken is 200 here. one can take other size too
for i in range(len(train_data)):

    img_array = cv2.imread(train_data['filepaths'][i], cv2.IMREAD_GRAYSCALE) # converting the image to gray scale

    new_img_array = cv2.resize(img_array, (image_size, image_size)) # resizing the image array
    if train_data['label'][i] == 'Density1Benign':
      data.append([new_img_array, 0])
    elif train_data['label'][i] == 'Density1Malignant':
      data.append([new_img_array, 1])
    elif train_data['label'][i] == 'Density2Benign':
      data.append([new_img_array, 2])
    elif train_data['label'][i] == 'Density2Malignant':
      data.append([new_img_array, 3])
    elif train_data['label'][i] == 'Density3Benign':
      data.append([new_img_array, 4])
    elif train_data['label'][i] == 'Density3Malignant':
      data.append([new_img_array, 5])
    elif train_data['label'][i] == 'Density4Benign':
      data.append([new_img_array, 6])
    elif train_data['label'][i] == 'Density4Malignant':
      data.append([new_img_array, 7])

In [304]:
# image pixels of a image
data[0]

[array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 4]

In [305]:
np.random.shuffle(data)

In [306]:
x = []
y = []
for image in data:
    x.append(image[0])
    y.append(image[1])

# converting x & y to numpy array as they are list
x = np.array(x)
y = np.array(y)

In [307]:
np.unique(y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([ 648, 1620,  216, 1728,  702,  432,  324,   54]))

In [308]:
# split the data
X_train, X_val, y_train, y_val = train_test_split(x,y,test_size=0.3, random_state = 42)

In [309]:
# Defining the model
model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=(50, 50)), # flattening the image
tf.keras.layers.Dense(50, activation='relu'),
tf.keras.layers.Dense(40, activation='relu'),
tf.keras.layers.Dense(30, activation='relu'),
tf.keras.layers.Dense(20, activation='relu'),
tf.keras.layers.Dense(16, activation='sigmoid')
])

model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])

model.fit(X_train, y_train, epochs=40, batch_size=10)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7f21f8f31850>

In [310]:
model.evaluate(X_val, y_val)



[1.1709938049316406, 0.5314319133758545]

In [311]:
# Loading the order of the image's name that has been provided
test_image_order = pd.read_csv("/content/content/mammography_images/Testing_set.csv")
test_image_order.head()

Unnamed: 0,filename
0,Image_1.jpg
1,Image_2.jpg
2,Image_3.jpg
3,Image_4.jpg
4,Image_5.jpg


In [312]:
file_paths = [[fname, '/content/content/mammography_images/test/' + fname] for fname in test_image_order['filename']]

In [313]:
# Confirm if number of images is same as number of labels given
if len(test_image_order) == len(file_paths):
    print('Number of image names i.e. ', len(test_image_order), 'matches the number of file paths i.e. ', len(file_paths))
else:
    print('Number of image names does not match the number of filepaths')    

Number of image names i.e.  1908 matches the number of file paths i.e.  1908


In [314]:
test_images = pd.DataFrame(file_paths, columns=['filename', 'filepaths'])
test_images.head()

Unnamed: 0,filename,filepaths
0,Image_1.jpg,/content/content/mammography_images/test/Image...
1,Image_2.jpg,/content/content/mammography_images/test/Image...
2,Image_3.jpg,/content/content/mammography_images/test/Image...
3,Image_4.jpg,/content/content/mammography_images/test/Image...
4,Image_5.jpg,/content/content/mammography_images/test/Image...


In [315]:
test_pixel_data = [] # initialize an empty numpy array
image_size = 50 # image size taken is 100 here. one can take other size too
for i in range(len(test_images)):

    img_array = cv2.imread(test_images['filepaths'][i], cv2.IMREAD_GRAYSCALE) # converting the image to gray scale

    new_img_array = cv2.resize(img_array, (image_size, image_size)) # resizing the image array

    test_pixel_data.append(new_img_array)

In [316]:
test_pixel_data = np.array(test_pixel_data)

In [317]:
pred = model.predict(test_pixel_data)

In [318]:
# The predicted values are the probabilities value
pred[0]

array([9.7377503e-01, 9.4306123e-01, 3.1911552e-02, 9.9808824e-01,
       9.9856210e-01, 9.9729431e-01, 5.8259320e-01, 3.2337049e-01,
       7.2205423e-20, 4.3916461e-11, 7.8715047e-12, 2.3496420e-12,
       1.0047010e-08, 2.2008404e-19, 1.7538948e-06, 1.5482212e-16],
      dtype=float32)

In [319]:
predictions = []
for item in pred:
    if np.argmax(item)== 0:
        predictions.append('Density1Benign')
    elif np.argmax(item)== 1:
        predictions.append('Density1Malignant')
    elif np.argmax(item)== 2:
        predictions.append('Density2Benign')
    elif np.argmax(item)== 3:
        predictions.append('Density2Malignant')
    elif np.argmax(item)== 4:
        predictions.append('Density3Benign')
    elif np.argmax(item)== 5:
        predictions.append('Density3Malignant')
    elif np.argmax(item)== 6:
        predictions.append('Density4Benign')
    elif np.argmax(item)== 7:
        predictions.append('Density4Malignant')

In [320]:
res = pd.DataFrame({'filename': test_images['filename'], 'label': predictions}) # prediction is nothing but the final predictions of your model on input features of your new unseen test data
res.to_csv("submission.csv", index = False)

# To download the csv file locally
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>