In [None]:
# import libraries
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# never truncate the column list when a DataFrame is rendered
pd.set_option('display.max_columns', None)

In [None]:
# Directory that holds the CBIS-DDSM csv files.
# NOTE(review): machine-specific absolute path — point it at your own copy of
# the dataset before running.
csv_dir = '/Users/amritanand/Documents/Programming/Breast Cancer/Input/cbism-ddsm-breast-cancer/csv'

# load meta file
df_meta = pd.read_csv(f'{csv_dir}/meta.csv')
# Only the last expression of a cell is rendered automatically, so this
# preview has to be shown explicitly (`display` is provided by the IPython kernel).
display(df_meta.head())

# load dicom info file
df_dicom = pd.read_csv(f'{csv_dir}/dicom_info.csv')
df_dicom.head()

In [None]:
# list the distinct SeriesDescription values found in the dicom info table
df_dicom['SeriesDescription'].unique()

In [None]:
# collect the image_path entries of each image type, selected by SeriesDescription

# cropped images
cropped_images = df_dicom.loc[df_dicom['SeriesDescription'] == 'cropped images', 'image_path']
print(cropped_images)

# full mammogram images
full_mammo = df_dicom.loc[df_dicom['SeriesDescription'] == 'full mammogram images', 'image_path']

# ROI mask images
roi_img = df_dicom.loc[df_dicom['SeriesDescription'] == 'ROI mask images', 'image_path']

In [None]:
# directory the image paths should point to
imdir = 'kaggle/input/cbis-ddsm-breast-cancer-image-dataset/jpeg'

# swap every occurrence of 'CBIS-DDSM/jpeg' (applied as a regex) for imdir
# in each of the three path Series
cropped_images, full_mammo, roi_img = (
    paths.replace('CBIS-DDSM/jpeg', imdir, regex=True)
    for paths in (cropped_images, full_mammo, roi_img)
)

# view new paths: print the first entry of each Series under its heading
for heading, paths in (
    ('Cropped Images paths:\n', cropped_images),
    ('Full mammo Images paths:\n', full_mammo),
    ('ROI Mask Images paths:\n', roi_img),
):
    print(heading)
    print(paths.iloc[0])


In [None]:
# organize image paths: one lookup table per image type, keyed by the fifth
# "/"-separated component of each path
# (presumably the per-series folder name that follows the jpeg directory —
# TODO confirm against the data)
full_mammo_dict = {path.split("/")[4]: path for path in full_mammo}
cropped_images_dict = {path.split("/")[4]: path for path in cropped_images}
# The ROI entries must land in roi_img_dict, not in the roi_img Series itself:
# writing into the Series leaves the dict empty and mutates the Series while
# it is being iterated.
roi_img_dict = {path.split("/")[4]: path for path in roi_img}

# view keys
next(iter(full_mammo_dict.items()))

In [None]:
# read the train and test mass case description files
mass_train, mass_test = (
    pd.read_csv(f'/Users/amritanand/Documents/Programming/Breast Cancer/Input/cbism-ddsm-breast-cancer/csv/mass_case_description_{split}_set.csv')
    for split in ('train', 'test')
)

# preview the training rows
mass_train.head()


In [None]:
# fix image paths
def fix_image_path(data):
    """Swap the DICOM-style paths in columns 11 and 12 for real image paths.

    For every row, the third "/"-separated component of the stored path is
    used as the lookup key: column 11 is resolved through ``full_mammo_dict``
    and column 12 through ``cropped_images_dict``. ``data`` is modified in
    place and nothing is returned; an unknown key raises ``KeyError``.
    """
    rows = data.values  # materialise the row values once, before the loop
    for position in range(len(rows)):
        record = rows[position]
        # full mammogram path (column 11)
        full_key = record[11].split("/")[2]
        data.iloc[position, 11] = full_mammo_dict[full_key]
        # cropped image path (column 12)
        cropped_key = record[12].split("/")[2]
        data.iloc[position, 12] = cropped_images_dict[cropped_key]
        
# rewrite the path columns of the train frame, then the test frame, in place
for mass_frame in (mass_train, mass_test):
    fix_image_path(mass_frame)

In [None]:
# check unique values in pathology column
# (shown explicitly: only the last expression of a cell is rendered
# automatically; `display` is provided by the IPython kernel)
display(mass_train.pathology.unique())
mass_train.info()

# rename columns: replace the spaces in these column names with underscores
mass_train = mass_train.rename(columns={
    'left or right breast': 'left_or_right_breast',
    'image view': 'image_view',
    'abnormality id': 'abnormality_id',
    'abnormality type': 'abnormality_type',
    'mass shape': 'mass_shape',
    'mass margins': 'mass_margins',
    'image file path': 'image_file_path',
    'cropped image file path': 'cropped_image_file_path',
    'ROI mask file path': 'ROI_mask_file_path',
})

mass_train.head(5)

In [None]:
# check for null values before imputation
# (shown explicitly: only the last expression of a cell is rendered
# automatically; `display` is provided by the IPython kernel)
display(mass_train.isnull().sum())

# fill in missing values using the backwards fill method
# Series.bfill() replaces fillna(method='bfill'), which is deprecated in pandas 2.x.
mass_train['mass_shape'] = mass_train['mass_shape'].bfill()
mass_train['mass_margins'] = mass_train['mass_margins'].bfill()

# check null values again after imputation
# NOTE: a backward fill copies the next valid value upwards, so missing values
# after the last valid entry of a column stay missing.
display(mass_train.isnull().sum())

# quantitative summary of features
mass_train.describe()

In [None]:
# preview the first five rows of the test set
mass_test.head(n=5)

In [None]:
# check datasets shape
print(f'Shape of mass_train: {mass_train.shape}')
print(f'Shape of mass_test: {mass_test.shape}')

# null counts per column of the test set
# (shown explicitly: a bare expression in the middle of a cell is not rendered;
# `display` is provided by the IPython kernel)
display(mass_test.isnull().sum())

# column names of the test set, followed by blank lines as a separator
print(mass_test.columns)
print('\n')


In [None]:
# rename columns: every space in the listed column names becomes an underscore
mass_test = mass_test.rename(columns={
    old_name: old_name.replace(' ', '_')
    for old_name in (
        'left or right breast',
        'image view',
        'abnormality id',
        'abnormality type',
        'mass shape',
        'mass margins',
        'image file path',
        'cropped image file path',
        'ROI mask file path',
    )
})

# view renamed columns
mass_test.columns


In [None]:

# fill in missing values using the backwards fill method
# Series.bfill() replaces fillna(method='bfill'), which is deprecated in pandas 2.x.
# NOTE: a backward fill cannot fill missing values that come after the last
# valid entry of the column.
mass_test['mass_margins'] = mass_test['mass_margins'].bfill()

# check null values
mass_test.isnull().sum()

In [None]:
# pathology distributions: share of each pathology label in the training set
value = mass_train['pathology'].value_counts()

plt.figure(figsize=(8,6))
plt.pie(value, labels=value.index, autopct='%1.1f%%')
plt.title('Breast Cancer Mass Types', fontsize=14)

# savefig does not create missing directories, so make sure the output
# folder exists before writing the figure
plot_path = 'kaggle/pathology_distributions_red.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
plt.savefig(plot_path)
plt.show()



In [None]:
# Display some images
import matplotlib.image as mpimg


In [None]:
# create function to display images
def display_images(column, number, data=None):
    """Show the first ``number`` images of a path column side by side.

    Parameters
    ----------
    column : str
        Name of the column that holds the image file paths.
    number : int
        Number of leading rows to plot; one subplot is created per row.
    data : pandas.DataFrame, optional
        Frame to read the rows from. It must also contain a 'pathology'
        column, which is used as each subplot's title. Defaults to the
        notebook-level ``mass_train``.
    """
    if data is None:
        data = mass_train
    subset = data.head(number)

    # squeeze=False always returns a 2-D array of Axes, so number == 1 works
    # the same way as any larger count
    fig, axes = plt.subplots(1, number, figsize=(15, 5), squeeze=False)
    panels = axes[0]

    # Pair panels with rows by position, not by index label, so frames whose
    # index is not 0..n-1 (filtered, shuffled, ...) are plotted correctly.
    for ax, (_, row) in zip(panels, subset.iterrows()):
        image = mpimg.imread(row[column])
        ax.imshow(image, cmap='gray')
        ax.set_title(f"{row['pathology']}")
        ax.axis('off')

    # blank any panels left over when the frame has fewer than `number` rows
    for ax in panels[len(subset):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# show five sample images for each image-path column, under a printed heading
for heading, path_column in (
    ('Full Mammograms:\n', 'image_file_path'),
    ('Cropped Mammograms:\n', 'cropped_image_file_path'),
):
    print(heading)
    display_images(path_column, 5)