# Gather image metrics from metadata of images

## Create a dataframe with information about a subset of images
## Metrics of interest:
- Samples per Pixel 
- Rows
- Columns
- Pixel Intensity Relationship


In [1]:
# Import the libraries we'll use below.
import os
import time
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style

import tensorflow as tf
from tensorflow import keras
from keras import metrics
tf.get_logger().setLevel('INFO')

import pydicom

from collections import deque

In [2]:
# Build a dataframe of image metrics (samples per pixel, rows, columns)
# read from the headers of the DICOM files.

# Directory where the DICOM files are located (searched recursively).
dicom_directory = r"F:\rsna-breast-cancer-detection\train_images"

# Maximum number of files to process. None scans every file that is found;
# set a small integer (e.g. 50) to time a subset before a full run.
max_files = None

# Print a progress message every `progress_every` processed files.
progress_every = 200

# One [file_name, samp_per_pix, n_rows_image, n_cols_image] record per file.
dicom_data = deque()

# Number of DICOM files processed so far.
count = 0

# Time the scan: the elapsed time of a subset gives an idea of how long
# the full set of files would take.
start_time = time.time()

# Recursively search for DICOM files in all subdirectories.
for root, dirs, files in os.walk(dicom_directory):
    for filename in files:
        # Skip anything that is not a DICOM file.
        if not filename.endswith(".dcm"):
            continue
        # Only header elements are used below, so stop before the pixel data
        # instead of reading the whole image of every file.
        ds = pydicom.dcmread(os.path.join(root, filename), stop_before_pixels=True)
        dicom_data.append([filename, ds.SamplesPerPixel, ds.Rows, ds.Columns])
        count += 1
        if count % progress_every == 0:
            print(f"Processed {count} DICOM files...")
        # Stop reading this directory once the optional limit is reached.
        if max_files is not None and count >= max_files:
            break
    # Leave the directory walk as well once the optional limit is reached.
    if max_files is not None and count >= max_files:
        break

# Stop the timer and report the total.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Processed {count} DICOM files in {elapsed_time:.2f} seconds")

# Create a Pandas DataFrame from the collected records.
df = pd.DataFrame(dicom_data, columns=['file_name', 'samp_per_pix', 'n_rows_image', 'n_cols_image'])

Processed 200 DICOM files...
Processed 400 DICOM files...
Processed 600 DICOM files...
Processed 800 DICOM files...
Processed 1000 DICOM files...
Processed 1200 DICOM files...
Processed 1400 DICOM files...
Processed 1600 DICOM files...
Processed 1800 DICOM files...
Processed 2000 DICOM files...
Processed 2200 DICOM files...
Processed 2400 DICOM files...
Processed 2600 DICOM files...
Processed 2800 DICOM files...
Processed 3000 DICOM files...
Processed 3200 DICOM files...
Processed 3400 DICOM files...
Processed 3600 DICOM files...
Processed 3800 DICOM files...
Processed 4000 DICOM files...
Processed 4200 DICOM files...
Processed 4400 DICOM files...
Processed 4600 DICOM files...
Processed 4800 DICOM files...
Processed 5000 DICOM files...
Processed 5200 DICOM files...
Processed 5400 DICOM files...
Processed 5600 DICOM files...
Processed 5800 DICOM files...
Processed 6000 DICOM files...
Processed 6200 DICOM files...
Processed 6400 DICOM files...
Processed 6600 DICOM files...
Processed 6800

Processed 53400 DICOM files...
Processed 53600 DICOM files...
Processed 53800 DICOM files...
Processed 54000 DICOM files...
Processed 54200 DICOM files...
Processed 54400 DICOM files...
Processed 54600 DICOM files...
Processed 54706 DICOM files in 5995.65 seconds


In [6]:
# Preview the first five rows of the metrics dataframe.
df.head(n=5)

Unnamed: 0,file_name,samp_per_pix,n_rows_image,n_cols_image
0,1459541791.dcm,1,5355,4915
1,1864590858.dcm,1,5355,4915
2,1874946579.dcm,1,5355,4915
3,462822612.dcm,1,5355,4915
4,1031443799.dcm,1,2776,2082


In [3]:
# Save the metrics dataframe as CSV (without the index column) for future use.
output_file = "ImageMetrics_Subset.csv"
df.to_csv(path_or_buf=output_file, index=False)

In [4]:
# Reload the saved metrics dataframe from the CSV file.
# The path is relative to the notebook's working directory rather than a
# user-specific absolute path, so the notebook also runs on other machines.
df = pd.read_csv("ImageMetrics_Subset.csv")

In [5]:
# Print how often each distinct value occurs for every image metric.
# A blank separator follows every table except the last one.
*leading_metrics, last_metric = ['samp_per_pix', 'n_rows_image', 'n_cols_image']
for metric in leading_metrics:
    print(df[metric].value_counts(), '\n')
print(df[last_metric].value_counts())

1    54706
Name: samp_per_pix, dtype: int64 

4096    24109
3328     9042
5355     8267
2776     8221
2294     2703
3062     1276
4740      732
5928      338
2850       13
2473        3
1236        2
Name: n_rows_image, dtype: int64 

3328    24109
2560     9042
4915     8267
2082     8221
1914     2703
2394     1289
3540      732
4728      338
2045        3
1022        2
Name: n_cols_image, dtype: int64


## Conclusion: The images come in many different sizes, so they will need to be resized.

# Image metrics - with cancer images

In [None]:
# Keep only the rows whose 'cancer' value equals 1.
# NOTE(review): the metrics dataframe created earlier in this notebook only has the
# columns file_name, samp_per_pix, n_rows_image and n_cols_image, so this lookup
# raises a KeyError unless a 'cancer' column is added to df first.
# TODO confirm where the cancer labels come from and join them in before this cell.
df_with_cancer = df[df['cancer'] == 1]