# Copy and Prepare X-ray Images in S3

In [3]:
import boto3
import sagemaker
import pandas as pd

# One boto3 session supplies both the region name and the SageMaker API client.
boto_session = boto3.Session()
region = boto_session.region_name

# SageMaker session, execution role and the session's default bucket.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

sm = boto_session.client(service_name="sagemaker", region_name=region)

In [14]:
# S3 URI the dataset is copied FROM (the source argument of the `aws s3 cp` cell further down).
public_path = "s3://508-xray-project"

In [15]:
# Persist public_path with IPython's %store magic so it can be restored later with `%store -r`.
%store public_path

Stored 'public_path' (str)


In [16]:
# S3 URI the dataset is copied TO.
# The bucket name is fixed. A `.format(bucket)` call on this literal would be a no-op because
# the string has no `{}` placeholder, so the SageMaker default bucket is not involved here.
private_path = "s3://project508data"

In [17]:
# Persist private_path with IPython's %store magic so it can be restored later with `%store -r`.
%store private_path

Stored 'private_path' (str)


In [None]:
# copy data from one s3 bucket into another S3 bucket
!aws s3 cp  $public_path $private_path --recursive

In [19]:
!aws s3 ls s3://$bucket/project508data

In [20]:
# List every bucket visible to the current credentials.
# NOTE(review): the recorded output contains bucket names that embed the AWS account ID;
# consider clearing this cell's output before sharing the notebook.
!aws s3 ls 

2024-03-15 20:31:01 aws-athena-query-results-975049898153-us-east-1
2024-03-18 02:00:35 project508data
2024-03-02 04:02:13 sagemaker-studio-975049898153-refc51i7svr
2024-03-02 04:02:15 sagemaker-us-east-1-975049898153


In [23]:
# Load the training manifest (covid/train.txt) from S3 and count its rows per label.
s3 = boto3.client('s3')

bucket_name = 'project508data'
file_path = 'covid/train.txt'
obj = s3.get_object(Bucket=bucket_name, Key=file_path)

# The manifest is read as space-separated text with no header row; column names are
# attached afterwards and only the filename and its label are kept.
manifest_columns = ['patient id', 'filename', 'labels', 'data source']
train_df = (
    pd.read_csv(obj['Body'], sep=' ', header=None)
    .set_axis(manifest_columns, axis=1)
    .drop(columns=['patient id', 'data source'])
)

# Number of manifest rows for each label value
train_df.groupby('labels').size()

labels
negative    10664
positive    57199
dtype: int64

In [24]:
# Keep just the training rows labelled 'negative', renumbered from 0.
is_negative = train_df['labels'].eq('negative')
negative_df = train_df.loc[is_negative].reset_index(drop=True)
negative_df

Unnamed: 0,filename,labels
0,1e64990d1b40c1758a2aaa9c7f7a85_jumbo.jpeg,negative
1,7223b8ad031187d9a142d7f7ca02c9_jumbo.jpeg,negative
2,3392dc7d262e28423caca517f98c2e_jumbo.jpeg,negative
3,ec3a480c0926ded74429df416cfb05_jumbo.jpeg,negative
4,a72aeb349a63c79ed24e473c434efe_jumbo.jpg,negative
...,...,...
10659,sub-S29624_ses-E61050_run-1_bp-chest_vp-pa_dx-...,negative
10660,sub-S29625_ses-E61052_run-1_bp-chest_vp-ap_dx-...,negative
10661,sub-S29626_ses-E61054_run-1_bp-chest_vp-ap_dx-...,negative
10662,sub-S29629_ses-E61058_run-1_bp-chest_vp-pa_dx-...,negative


In [25]:
# Count the objects listed for the covid/train prefix before any deletion.
# NOTE(review): the prefix has no trailing slash, so keys such as covid/train.txt are presumably
# listed as well, and `grep -c "train"` matches every listed line — confirm whether the
# manifest file is meant to be part of this count.
!aws s3 ls s3://project508data/covid/train --recursive | grep -c "train"

8511


In [None]:
# Delete the negative-labelled images from the training folder in S3.
prefix_to_delete = 'covid/train/'

# Manifest filenames carry no folder, so prepend the prefix to build the full object keys.
keys_to_delete = [prefix_to_delete + filename for filename in negative_df['filename']]

# DeleteObjects accepts at most 1000 keys per request, so the keys are sent in chunks.
# This replaces one request (and one printed line) per file.
batch_size = 1000
failed_deletes = []
for start in range(0, len(keys_to_delete), batch_size):
    batch = keys_to_delete[start:start + batch_size]
    try:
        response = s3.delete_objects(
            Bucket=bucket_name,
            Delete={'Objects': [{'Key': key} for key in batch], 'Quiet': True},
        )
    except Exception as e:
        # Best effort: a rejected request must not stop the remaining batches,
        # but every key in it is recorded as failed.
        failed_deletes.extend({'Key': key, 'Message': str(e)} for key in batch)
        continue
    # With Quiet=True the response only carries the per-key failures.
    failed_deletes.extend(response.get('Errors', []))

# S3 reports success for keys that do not exist, so only genuine failures are listed
# instead of printing "Deleted" for every requested key.
for failure in failed_deletes:
    print(f"Error deleting {failure['Key']} from S3: {failure.get('Message', failure.get('Code'))}")
print(f"Requested deletion of {len(keys_to_delete)} keys under {prefix_to_delete}; {len(failed_deletes)} failed")


In [27]:
# Re-count the objects listed for the covid/train prefix after deleting the negatives.
# NOTE(review): without a trailing slash the prefix presumably also matches covid/train.txt,
# so the figure is not purely an image count — confirm.
!aws s3 ls s3://project508data/covid/train --recursive | grep -c "train"

3628


In [28]:
# Load the validation manifest (covid/val.txt) from S3 and count its rows per label.
s3 = boto3.client('s3')

bucket_name = 'project508data'
file_path = 'covid/val.txt'
obj = s3.get_object(Bucket=bucket_name, Key=file_path)

# The manifest is read as space-separated text with no header row; column names are
# attached afterwards and only the filename and its label are kept.
manifest_columns = ['patient id', 'filename', 'labels', 'data source']
val_df = (
    pd.read_csv(obj['Body'], sep=' ', header=None)
    .set_axis(manifest_columns, axis=1)
    .drop(columns=['patient id', 'data source'])
)

# Number of manifest rows for each label value
val_df.groupby('labels').size()

labels
negative    4232
positive    4241
dtype: int64

In [29]:
# Keep just the validation rows labelled 'negative', renumbered from 0.
is_negative_val = val_df['labels'].eq('negative')
negative_val_df = val_df.loc[is_negative_val].reset_index(drop=True)
negative_val_df

Unnamed: 0,filename,labels
0,CR.1.2.840.113564.1722810170.20200317090830828...,negative
1,CR.1.2.840.113564.1722810170.20200317104341875...,negative
2,DX.1.2.840.113564.1722810162.20200323113158393...,negative
3,CR.1.2.840.113564.1722810170.20200319202613984...,negative
4,CR.1.2.840.113564.1722810170.20200323115834968...,negative
...,...,...
4227,904289e4-96a1-43c8-9eed-6cec2e9c8ddd.png,negative
4228,90440659-a140-451d-9ddb-2908d4408c93.png,negative
4229,905f015d-8475-463d-aaa6-20953f0edd9e.png,negative
4230,90710ba1-7628-4b4c-ac71-a61944ba3825.png,negative


In [31]:
# Count the objects listed for the covid/val prefix before any deletion.
# NOTE(review): the prefix has no trailing slash, so covid/val.txt is presumably listed too,
# and `grep -c "val"` matches every listed line — confirm what the count should include.
!aws s3 ls s3://project508data/covid/val --recursive | grep -c "val"

2462


In [None]:
# Delete the negative-labelled images from the validation folder in S3.
prefix_to_delete = 'covid/val/'

# Manifest filenames carry no folder, so prepend the prefix to build the full object keys.
keys_to_delete = [prefix_to_delete + filename for filename in negative_val_df['filename']]

# DeleteObjects accepts at most 1000 keys per request, so the keys are sent in chunks.
# This replaces one request (and one printed line) per file.
batch_size = 1000
failed_deletes = []
for start in range(0, len(keys_to_delete), batch_size):
    batch = keys_to_delete[start:start + batch_size]
    try:
        response = s3.delete_objects(
            Bucket=bucket_name,
            Delete={'Objects': [{'Key': key} for key in batch], 'Quiet': True},
        )
    except Exception as e:
        # Best effort: a rejected request must not stop the remaining batches,
        # but every key in it is recorded as failed.
        failed_deletes.extend({'Key': key, 'Message': str(e)} for key in batch)
        continue
    # With Quiet=True the response only carries the per-key failures.
    failed_deletes.extend(response.get('Errors', []))

# S3 reports success for keys that do not exist, so only genuine failures are listed
# instead of printing "Deleted" for every requested key.
for failure in failed_deletes:
    print(f"Error deleting {failure['Key']} from S3: {failure.get('Message', failure.get('Code'))}")
print(f"Requested deletion of {len(keys_to_delete)} keys under {prefix_to_delete}; {len(failed_deletes)} failed")

In [33]:
# Re-count the objects listed for the covid/val prefix after deleting the negatives.
# NOTE(review): the remaining entries presumably include covid/val.txt, which the
# slash-less prefix also matches — confirm.
!aws s3 ls s3://project508data/covid/val --recursive | grep -c "val"

2


In [36]:
# renaming all covid xrays
# Helper that streams every key stored under a prefix
def list_objects(bucket, prefix=''):
    """Yield the key of each object in `bucket` whose key starts with `prefix`.

    Uses the module-level `s3` client's `list_objects_v2` paginator; pages that
    have no 'Contents' entry contribute nothing.
    """
    pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        for entry in page.get('Contents', []):
            yield entry['Key']

bucket_name = 'project508data'
prefix = 'covid/train/'  # Specify the path to the nested folder
objects = list(list_objects(bucket_name, prefix))

# Keys already present under the prefix; consulted so a rename never overwrites an object.
taken_keys = set(objects)
name_tag = 'covid_'

# Rename every image under the prefix to covid_<n>.<ext>
counter = 1
for old_key in objects:
    # Keys ending in '/' are folder placeholders, not images. Test for them explicitly
    # rather than assuming the first listed key is always the placeholder.
    if old_key.endswith('/'):
        continue

    # Take the extension from the file name only, and tolerate names without any '.'.
    basename = old_key.rsplit('/', 1)[-1]
    stem, dot, extension = basename.rpartition('.')
    if not dot:
        print(f"Skipping {old_key}: no file extension")
        continue

    # Objects already named covid_<n> directly under the prefix are left alone, so
    # re-running this cell does not copy renamed files over each other.
    already_renamed = (
        old_key == prefix + basename
        and stem.startswith(name_tag)
        and stem[len(name_tag):].isdigit()
    )
    if already_renamed:
        continue

    # Advance to the next number whose key is still free.
    new_key = f"{prefix}{name_tag}{counter}.{extension}"
    while new_key in taken_keys:
        counter += 1
        new_key = f"{prefix}{name_tag}{counter}.{extension}"

    # S3 has no rename operation: copy to the new key, then delete the old one.
    # The dict form of CopySource avoids hand-building a "bucket/key" string.
    s3.copy_object(
        Bucket=bucket_name,
        CopySource={'Bucket': bucket_name, 'Key': old_key},
        Key=new_key,
    )
    s3.delete_object(Bucket=bucket_name, Key=old_key)
    taken_keys.add(new_key)
    # Increment counter
    counter += 1

In [37]:
# Re-count the objects listed for the covid/train prefix after the rename.
# NOTE(review): the recorded count is one higher than before the rename (3628 -> 3629);
# a pure rename should leave it unchanged, so this is worth investigating.
!aws s3 ls s3://project508data/covid/train --recursive | grep -c "train"

3629


In [None]:
# copy data from one s3 bucket into another S3 bucket
old_path = "s3://project508data/pneumonia/pneumonia_xrays/PNEU/"
new_path = "s3://project508data/pneumonia/"
!aws s3 cp  $old_path $new_path --recursive

In [39]:
# renaming all pneumonia xrays
# Helper that streams every key stored under a prefix
def list_objects(bucket, prefix=''):
    """Yield the key of each object in `bucket` whose key starts with `prefix`.

    Uses the module-level `s3` client's `list_objects_v2` paginator; pages that
    have no 'Contents' entry contribute nothing.
    """
    pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        for entry in page.get('Contents', []):
            yield entry['Key']

bucket_name = 'project508data'
prefix = 'pneumonia/'  # Specify the path to the files
objects = list(list_objects(bucket_name, prefix))

# Keys already present under the prefix; consulted so a rename never overwrites an object.
taken_keys = set(objects)
name_tag = 'pneumonia_'

# Rename every image under the prefix to pneumonia_<n>.<ext>. New keys are always built
# as prefix + new file name, so objects found in sub-folders end up directly under the prefix.
counter = 1
for old_key in objects:
    # Keys ending in '/' are folder placeholders, not images. Test for them explicitly
    # rather than assuming the first listed key is always the placeholder.
    if old_key.endswith('/'):
        continue

    # Take the extension from the file name only, and tolerate names without any '.'.
    basename = old_key.rsplit('/', 1)[-1]
    stem, dot, extension = basename.rpartition('.')
    if not dot:
        print(f"Skipping {old_key}: no file extension")
        continue

    # Objects already named pneumonia_<n> directly under the prefix are left alone, so
    # re-running this cell does not copy renamed files over each other.
    already_renamed = (
        old_key == prefix + basename
        and stem.startswith(name_tag)
        and stem[len(name_tag):].isdigit()
    )
    if already_renamed:
        continue

    # Advance to the next number whose key is still free.
    new_key = f"{prefix}{name_tag}{counter}.{extension}"
    while new_key in taken_keys:
        counter += 1
        new_key = f"{prefix}{name_tag}{counter}.{extension}"

    # S3 has no rename operation: copy to the new key, then delete the old one.
    # The dict form of CopySource avoids hand-building a "bucket/key" string.
    s3.copy_object(
        Bucket=bucket_name,
        CopySource={'Bucket': bucket_name, 'Key': old_key},
        Key=new_key,
    )
    s3.delete_object(Bucket=bucket_name, Key=old_key)
    taken_keys.add(new_key)
    # Increment counter
    counter += 1

In [40]:
# Count the objects listed for the pneumonia prefix after the rename.
# NOTE(review): every listed key starts with "pneumonia", so `grep -c "pneumonia"` simply
# counts all listed lines, including any folder placeholders or sub-folder objects — confirm
# that this is the intended figure.
!aws s3 ls s3://project508data/pneumonia --recursive | grep -c "pneumonia"

8546


# Release Resources

In [4]:
%%html

<!-- Shows a notice and a hidden button bound to the "kernelmenu:shutdown" command;
     the script below clicks that button as soon as the cell output is rendered. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the first element carrying the "sm-command-button" class; any error
// (for example, no such element) is deliberately ignored.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [6]:
%%javascript

// Save a checkpoint, then delete the notebook's session through the global `Jupyter` object.
// NOTE(review): presumably targets the classic Notebook UI, where `Jupyter` is defined;
// where it is missing the resulting error is swallowed by the catch below.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>