In [1]:
import os
import shutil
import pandas as pd

In [2]:
# Define paths
csv_file = 'train.csv'  # Update with the actual path to your CSV file
source_dir = 'train_images'     # Update with the path to your 'train_images' folder
destination_dir = 'IMG'  # Update with the path where you want to copy the folders

In [7]:



# Load the CSV file
data = pd.read_csv(csv_file)

# Filter rows where 'cancer' is 1
cancer_positive = data[data['cancer'] == 1]

# Ensure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Iterate through the filtered rows and copy corresponding folders
for patient_id in cancer_positive['patient_id']:
    src_folder = os.path.join(source_dir, str(patient_id))
    dest_folder = os.path.join(destination_dir, str(patient_id))
    
    if os.path.exists(src_folder):
        shutil.copytree(src_folder, dest_folder, dirs_exist_ok=True)  # Allow overwriting
        print(f"Copied: {src_folder} to {dest_folder}")
    else:
        print(f"Folder not found: {src_folder}")


Copied: train_images\10130 to IMG\10130
Copied: train_images\10130 to IMG\10130
Copied: train_images\10130 to IMG\10130
Copied: train_images\10130 to IMG\10130
Copied: train_images\10226 to IMG\10226
Copied: train_images\10226 to IMG\10226
Copied: train_images\1025 to IMG\1025
Copied: train_images\1025 to IMG\1025
Copied: train_images\10432 to IMG\10432
Copied: train_images\10432 to IMG\10432
Copied: train_images\10589 to IMG\10589
Copied: train_images\10589 to IMG\10589
Copied: train_images\10589 to IMG\10589
Copied: train_images\106 to IMG\106
Copied: train_images\106 to IMG\106
Copied: train_images\10635 to IMG\10635
Copied: train_images\10635 to IMG\10635
Copied: train_images\10638 to IMG\10638
Copied: train_images\10638 to IMG\10638
Copied: train_images\10668 to IMG\10668
Copied: train_images\10668 to IMG\10668
Copied: train_images\10940 to IMG\10940
Copied: train_images\10940 to IMG\10940
Copied: train_images\1109 to IMG\1109
Copied: train_images\1109 to IMG\1109
Copied: train_im

In [10]:
cancer_positive.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1158 entries, 87 to 54596
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   site_id                  1158 non-null   int64  
 1   patient_id               1158 non-null   int64  
 2   image_id                 1158 non-null   int64  
 3   laterality               1158 non-null   object 
 4   view                     1158 non-null   object 
 5   age                      1158 non-null   float64
 6   cancer                   1158 non-null   int64  
 7   biopsy                   1158 non-null   int64  
 8   invasive                 1158 non-null   int64  
 9   BIRADS                   664 non-null    float64
 10  implant                  1158 non-null   int64  
 11  density                  664 non-null    object 
 12  machine_id               1158 non-null   int64  
 13  difficult_negative_case  1158 non-null   bool   
dtypes: bool(1), float64(2), int

In [23]:
# Filter rows where 'cancer' is 1
cancer_negative = data[data['cancer'] == 0]
cancer_negative.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53548 entries, 0 to 54705
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   site_id                  53548 non-null  int64  
 1   patient_id               53548 non-null  int64  
 2   image_id                 53548 non-null  int64  
 3   laterality               53548 non-null  object 
 4   view                     53548 non-null  object 
 5   age                      53511 non-null  float64
 6   cancer                   53548 non-null  int64  
 7   biopsy                   53548 non-null  int64  
 8   invasive                 53548 non-null  int64  
 9   BIRADS                   25622 non-null  float64
 10  implant                  53548 non-null  int64  
 11  density                  28806 non-null  object 
 12  machine_id               53548 non-null  int64  
 13  difficult_negative_case  53548 non-null  bool   
dtypes: bool(1), float64(2), int

In [11]:
import os

# Define the path to the main directory
img_cancer_dir = 'IMG_Cancer'  # Replace with the actual path to IMG_Cancer

# Initialize a counter for DICOM files
dicom_file_count = 0

# Traverse the directory structure
for root, dirs, files in os.walk(img_cancer_dir):
    for file in files:
        # Check if the file is a DICOM file (typically with .dcm extension)
        if file.lower().endswith('.dcm'):
            dicom_file_count += 1

print(f"Total number of DICOM files: {dicom_file_count}")


Total number of DICOM files: 2280


In [38]:
import os
import shutil
import pandas as pd

# Define paths
csv_file = 'train.csv'  # Path to your CSV file
source_dir = 'train_images'       # Path to the 'IMG_Cancer' folder
destination_dir = 'cancer_images'      # Path to the new 'cancer' folder

# Load the CSV file
data = pd.read_csv(csv_file)

# Filter rows where 'cancer' is 1
cancer_positive = data[data['cancer'] == 1]

# Ensure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Iterate through the filtered rows and copy corresponding files
for _, row in cancer_positive.iterrows():
    patient_id = row['patient_id']  # Get the patient ID
    image_id = row['image_id']    # Use the patient_id for naming the file
    # Build the full path to the DICOM file
    dicom_file = os.path.join(source_dir, str(patient_id), f"{image_id}.dcm")
    if os.path.exists(dicom_file):
        # Copy the DICOM file to the destination directory
        shutil.copy(dicom_file, destination_dir)
        print(f"Copied: {dicom_file} to {destination_dir}")
    else:
        print(f"File not found: {dicom_file}")

Copied: train_images\10130\388811999.dcm to cancer_images
Copied: train_images\10130\613462606.dcm to cancer_images
Copied: train_images\10130\1360338805.dcm to cancer_images
Copied: train_images\10130\1672636630.dcm to cancer_images
Copied: train_images\10226\461614796.dcm to cancer_images
Copied: train_images\10226\530620473.dcm to cancer_images
Copied: train_images\1025\773597682.dcm to cancer_images
Copied: train_images\1025\1803952236.dcm to cancer_images
Copied: train_images\10432\458553546.dcm to cancer_images
Copied: train_images\10432\1434858530.dcm to cancer_images
Copied: train_images\10589\195400299.dcm to cancer_images
Copied: train_images\10589\605115808.dcm to cancer_images
Copied: train_images\10589\1967460233.dcm to cancer_images
Copied: train_images\106\76321767.dcm to cancer_images
Copied: train_images\106\2018825992.dcm to cancer_images
Copied: train_images\10635\797737008.dcm to cancer_images
Copied: train_images\10635\1095412840.dcm to cancer_images
Copied: train_

In [17]:
cancer_negative.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53548 entries, 0 to 54705
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   site_id                  53548 non-null  int64  
 1   patient_id               53548 non-null  int64  
 2   image_id                 53548 non-null  int64  
 3   laterality               53548 non-null  object 
 4   view                     53548 non-null  object 
 5   age                      53511 non-null  float64
 6   cancer                   53548 non-null  int64  
 7   biopsy                   53548 non-null  int64  
 8   invasive                 53548 non-null  int64  
 9   BIRADS                   25622 non-null  float64
 10  implant                  53548 non-null  int64  
 11  density                  28806 non-null  object 
 12  machine_id               53548 non-null  int64  
 13  difficult_negative_case  53548 non-null  bool   
dtypes: bool(1), float64(2), int

In [24]:
cancer_negative.isnull().sum()

site_id                        0
patient_id                     0
image_id                       0
laterality                     0
view                           0
age                           37
cancer                         0
biopsy                         0
invasive                       0
BIRADS                     27926
implant                        0
density                    24742
machine_id                     0
difficult_negative_case        0
dtype: int64

In [25]:
cancer_negative.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cancer_negative.dropna(inplace=True)


In [39]:
cancer_negative.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22983 entries, 12 to 54705
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   site_id                  22983 non-null  int64  
 1   patient_id               22983 non-null  int64  
 2   image_id                 22983 non-null  int64  
 3   laterality               22983 non-null  object 
 4   view                     22983 non-null  object 
 5   age                      22983 non-null  float64
 6   cancer                   22983 non-null  int64  
 7   biopsy                   22983 non-null  int64  
 8   invasive                 22983 non-null  int64  
 9   BIRADS                   22983 non-null  float64
 10  implant                  22983 non-null  int64  
 11  density                  22983 non-null  object 
 12  machine_id               22983 non-null  int64  
 13  difficult_negative_case  22983 non-null  bool   
dtypes: bool(1), float64(2), in

In [27]:
import pandas as pd

# Assuming 'cancer_negative' is your DataFrame
# Ensure patient_id is the column containing unique patient IDs

# Randomly sample 290 unique patient_id values
sampled_patient_ids = cancer_negative['patient_id'].drop_duplicates().sample(n=290, random_state=42)

# Convert the sampled patient IDs to a list (if needed)
sampled_patient_ids_list = sampled_patient_ids.tolist()

print(f"Randomly selected patient IDs: {sampled_patient_ids_list}")

Randomly selected patient IDs: [30110, 2476, 47656, 12420, 28447, 40789, 2168, 48342, 6817, 14078, 58472, 48754, 822, 43728, 64789, 33005, 14804, 17198, 36699, 13966, 7251, 36878, 51251, 53219, 16258, 44698, 46277, 23393, 10289, 21827, 18870, 40464, 27857, 42704, 5580, 47133, 43230, 43011, 55446, 49238, 13369, 40122, 33334, 37265, 63393, 47676, 58511, 26924, 14613, 48430, 62020, 21809, 6093, 35045, 15931, 53074, 37203, 37940, 61071, 32567, 52857, 37239, 47622, 13129, 23187, 1014, 29790, 44122, 19669, 12922, 60819, 45645, 53481, 53098, 20390, 29959, 54072, 10391, 26242, 4827, 24369, 42875, 15055, 24585, 45981, 27875, 58303, 21251, 561, 24535, 53910, 35368, 16699, 5999, 25208, 21738, 3734, 22925, 37841, 13023, 61915, 32975, 4873, 4111, 4328, 36585, 2313, 20989, 13635, 39676, 8066, 6941, 21711, 26059, 50691, 27058, 7412, 29192, 1588, 17526, 59121, 26479, 17748, 10864, 22764, 19102, 22098, 56392, 55731, 29868, 22817, 58101, 64898, 30016, 55596, 45474, 30149, 3768, 34213, 13325, 42836, 5986

In [30]:
sampled_patient_ids.info()

<class 'pandas.core.series.Series'>
Index: 290 entries, 19046 to 30014
Series name: patient_id
Non-Null Count  Dtype
--------------  -----
290 non-null    int64
dtypes: int64(1)
memory usage: 4.5 KB


In [40]:
# Filter the DataFrame to include only rows with patient_id in the sampled list
non_cancerous_img = cancer_negative[cancer_negative['patient_id'].isin(sampled_patient_ids_list)]
non_cancerous_img.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
102,1,1014,629904228,L,MLO,76.0,0,0,0,1.0,0,B,49,False
103,1,1014,669597068,L,CC,76.0,0,0,0,1.0,0,B,49,False
104,1,1014,229558076,R,MLO,76.0,0,0,0,1.0,0,B,49,False
105,1,1014,1173679750,R,CC,76.0,0,0,0,1.0,0,B,49,False
166,1,10208,638273415,L,MLO,56.0,0,0,0,0.0,0,C,49,True


In [34]:
non_cancerous_img.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1218 entries, 102 to 54318
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   site_id                  1218 non-null   int64  
 1   patient_id               1218 non-null   int64  
 2   image_id                 1218 non-null   int64  
 3   laterality               1218 non-null   object 
 4   view                     1218 non-null   object 
 5   age                      1218 non-null   float64
 6   cancer                   1218 non-null   int64  
 7   biopsy                   1218 non-null   int64  
 8   invasive                 1218 non-null   int64  
 9   BIRADS                   1218 non-null   float64
 10  implant                  1218 non-null   int64  
 11  density                  1218 non-null   object 
 12  machine_id               1218 non-null   int64  
 13  difficult_negative_case  1218 non-null   bool   
dtypes: bool(1), float64(2), in

In [41]:
source_dir = 'train_images' 
destination_dir = 'non_cancer_images'  # Path to the new 'non_cancer_images' folder

# Ensure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Iterate through the filtered rows and copy corresponding files
for _, row in non_cancerous_img.iterrows():
    image_id = row['image_id']  # Get the image_id
    # Build the full path to the DICOM file
    dicom_file = os.path.join(source_dir, str(row['patient_id']), f"{image_id}.dcm")
    
    if os.path.exists(dicom_file):
        # Copy the DICOM file to the destination directory
        shutil.copy(dicom_file, destination_dir)
        print(f"Copied: {dicom_file} to {destination_dir}")
    else:
        print(f"File not found: {dicom_file}")

Copied: train_images\1014\629904228.dcm to non_cancer_images
Copied: train_images\1014\669597068.dcm to non_cancer_images
Copied: train_images\1014\229558076.dcm to non_cancer_images
Copied: train_images\1014\1173679750.dcm to non_cancer_images
Copied: train_images\10208\638273415.dcm to non_cancer_images
Copied: train_images\10208\1248063999.dcm to non_cancer_images
Copied: train_images\10208\74874399.dcm to non_cancer_images
Copied: train_images\10208\1922025927.dcm to non_cancer_images
Copied: train_images\10289\906075357.dcm to non_cancer_images
Copied: train_images\10289\1616844775.dcm to non_cancer_images
Copied: train_images\10289\1934928587.dcm to non_cancer_images
Copied: train_images\10289\1092321039.dcm to non_cancer_images
Copied: train_images\10289\1390886438.dcm to non_cancer_images
Copied: train_images\10391\265003236.dcm to non_cancer_images
Copied: train_images\10391\2065907363.dcm to non_cancer_images
Copied: train_images\10864\400855687.dcm to non_cancer_images
Copie

In [42]:
non_cancerous_img.to_csv('non_cancerous_img.csv')
cancer_positive.to_csv('cancerous_images.csv')