# Parse through data and Save to file to use for input to future models
## Only use images 4096 x 3328

## Import Libraries

In [1]:
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
from pandasql import sqldf
import time
import numpy as np
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import os
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skimage.transform import resize

# tf and keras
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import models
from keras import layers


import pydicom
import csv

## Useful functions

In [2]:
#Function to go from numeric label to text label
def n_to_t_label(n_label):
    if n_label == 0:
        t_label = 'noCancer'
    elif n_label == 1:
        t_label = 'withCancer'
    else:
        raise ValueError('Invalid numeric label. Expected 0 or 1, but got {}'.format(n_label))
    return t_label

## Assemble dataframe of images
### Only include images that are 4096x3328, resize to 256x208

In [5]:
# Get df_joined from the file saved in notebook 4
# This data frame has the image file_names, patient_ids, outcomes (cancer or no cancer in 0/1 format and pixel information)
df_joined = pd.read_csv(r"C:\Users\mandy\Documents\MIDS\W207\W207MammogramProject\JoinedData1.csv")
print(df_joined.shape,'\n')
df_joined.head()

(54706, 5) 



Unnamed: 0,file_name,patient_id,cancer,n_rows_image,n_cols_image
0,1459541791.dcm,10006,0,5355,4915
1,1864590858.dcm,10006,0,5355,4915
2,1874946579.dcm,10006,0,5355,4915
3,462822612.dcm,10006,0,5355,4915
4,1031443799.dcm,10011,0,2776,2082


In [6]:
# Filter to only keep images with 4096 x 3328 pixels
df_filtered = df_joined.loc[(df_joined['n_rows_image'] == 4096) & (df_joined['n_cols_image'] == 3328)].reset_index(drop=True)
print('df_filtered shape:', df_filtered.shape, '\n')
print('df_filtered:\n',df_filtered.head())

df_filtered shape: (24109, 5) 

df_filtered:
         file_name  patient_id  cancer  n_rows_image  n_cols_image
0  1007799858.dcm       10097       0          4096          3328
1   166011602.dcm       10097       0          4096          3328
2  1957744616.dcm       10097       0          4096          3328
3   268368896.dcm       10097       0          4096          3328
4   664674273.dcm       10097       0          4096          3328


In [7]:
# Check how many cancer vs no cancer cases we're left with after the filtering
# 600 cancer cases. No cancer is now ~97% of images vs the pre-filter 98%
# conclusion that this filter returns an ok data set, about half of the original and about the same % split outcomes
df_filtered['cancer'].value_counts()

0    23509
1      600
Name: cancer, dtype: int64

In [8]:
#Take a look at the filtered dataframe
df_filtered = df_filtered.set_index('file_name')
print(df_filtered)
print('df_filtered cancer value counts:', df_filtered['cancer'].value_counts())
print('\n98% dataset is positive for cancer')

                patient_id  cancer  n_rows_image  n_cols_image
file_name                                                     
1007799858.dcm       10097       0          4096          3328
166011602.dcm        10097       0          4096          3328
1957744616.dcm       10097       0          4096          3328
268368896.dcm        10097       0          4096          3328
664674273.dcm        10097       0          4096          3328
...                    ...     ...           ...           ...
244243187.dcm         9967       0          4096          3328
2098937312.dcm        9968       0          4096          3328
294168046.dcm         9968       0          4096          3328
568005453.dcm         9968       0          4096          3328
766198919.dcm         9968       0          4096          3328

[24109 rows x 4 columns]
df_filtered cancer value counts: 0    23509
1      600
Name: cancer, dtype: int64

98% dataset is positive for cancer


In [9]:
#Take a look at column data types
df_filtered.dtypes

patient_id      int64
cancer          int64
n_rows_image    int64
n_cols_image    int64
dtype: object

In [10]:
# Set each row's index to the imageID
imageIds = df_filtered.index
print(imageIds)

Index(['1007799858.dcm', '166011602.dcm', '1957744616.dcm', '268368896.dcm',
       '664674273.dcm', '929754876.dcm', '1181635673.dcm', '1241778584.dcm',
       '1245250349.dcm', '1304802631.dcm',
       ...
       '255391556.dcm', '629099099.dcm', '1245697729.dcm', '1260162631.dcm',
       '155474390.dcm', '244243187.dcm', '2098937312.dcm', '294168046.dcm',
       '568005453.dcm', '766198919.dcm'],
      dtype='object', name='file_name', length=24109)


In [11]:
#Get Data
#X is mammogram images
#y is labels

# Set the directory where DICOM files are located
dicom_directory = r"D:\W207\rsna-breast-cancer-detection\train_images"

# Initialize list of flattened training images and outcomes
images = []
labels = []

# Set a counter for the number of DICOM files found
count = 0

# Start the timer
start_time = time.time()

# Recursively search for DICOM files in all subdirectories
for root, dirs, files in os.walk(dicom_directory):
    for filename in files:
        # Check if the file is a DICOM file
        if filename.endswith(".dcm"):
            # Check if the file is in the training list
            if (imageIds == filename).any().any():
                #print(filename)
                file_path = os.path.join(root, filename)
                # Add the pixel data 
                ds = pydicom.dcmread(file_path)
                pixel_array = ds.pixel_array
                resized_pixel_array = resize(pixel_array, (256, 208), anti_aliasing=True)
                images.append(resized_pixel_array.flatten())
                # Add the label information
                labels.append(df_filtered.loc[filename,'cancer'])
            # Increment the counter
            count += 1
            if count % 100 == 0:
                print(f"Processed {count} DICOM files...")

images = np.array(images)       

Processed 100 DICOM files...
Processed 200 DICOM files...
Processed 300 DICOM files...
Processed 400 DICOM files...
Processed 500 DICOM files...
Processed 600 DICOM files...
Processed 700 DICOM files...
Processed 800 DICOM files...
Processed 900 DICOM files...
Processed 1000 DICOM files...
Processed 1100 DICOM files...
Processed 1200 DICOM files...
Processed 1300 DICOM files...
Processed 1400 DICOM files...
Processed 1500 DICOM files...
Processed 1600 DICOM files...
Processed 1700 DICOM files...
Processed 1800 DICOM files...
Processed 1900 DICOM files...
Processed 2000 DICOM files...
Processed 2100 DICOM files...
Processed 2200 DICOM files...
Processed 2300 DICOM files...
Processed 2400 DICOM files...
Processed 2500 DICOM files...
Processed 2600 DICOM files...
Processed 2700 DICOM files...
Processed 2800 DICOM files...
Processed 2900 DICOM files...
Processed 3000 DICOM files...
Processed 3100 DICOM files...
Processed 3200 DICOM files...
Processed 3300 DICOM files...
Processed 3400 DICO

Processed 26900 DICOM files...
Processed 27000 DICOM files...
Processed 27100 DICOM files...
Processed 27200 DICOM files...
Processed 27300 DICOM files...
Processed 27400 DICOM files...
Processed 27500 DICOM files...
Processed 27600 DICOM files...
Processed 27700 DICOM files...
Processed 27800 DICOM files...
Processed 27900 DICOM files...
Processed 28000 DICOM files...
Processed 28100 DICOM files...
Processed 28200 DICOM files...
Processed 28300 DICOM files...
Processed 28400 DICOM files...
Processed 28500 DICOM files...
Processed 28600 DICOM files...
Processed 28700 DICOM files...
Processed 28800 DICOM files...
Processed 28900 DICOM files...
Processed 29000 DICOM files...
Processed 29100 DICOM files...
Processed 29200 DICOM files...
Processed 29300 DICOM files...
Processed 29400 DICOM files...
Processed 29500 DICOM files...
Processed 29600 DICOM files...
Processed 29700 DICOM files...
Processed 29800 DICOM files...
Processed 29900 DICOM files...
Processed 30000 DICOM files...
Processe

Processed 53400 DICOM files...
Processed 53500 DICOM files...
Processed 53600 DICOM files...
Processed 53700 DICOM files...
Processed 53800 DICOM files...
Processed 53900 DICOM files...
Processed 54000 DICOM files...
Processed 54100 DICOM files...
Processed 54200 DICOM files...
Processed 54300 DICOM files...
Processed 54400 DICOM files...
Processed 54500 DICOM files...
Processed 54600 DICOM files...
Processed 54700 DICOM files...


In [12]:
import csv
#Save images and labels so don't have to walk through data again
#images
np.savez_compressed('CNN_Xs3.npz', data=images)

#labels
with open('CNN_labels3.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(labels)