# Preprocessing data for use with built-in Sagemaker algorithm
## Getting filenames and labels for the training dataset

The built-in SageMaker image classification algorithm requires a ground-truth source in the form of a .lst file in order to classify images. Each line of this file is tab-separated and contains the index, the label, and the filename, in that order. Since this specific algorithm takes 0, 1, 2 as labels, the naming convention is as follows:

* Covid is 0
* Normal is 1
* Pneumonia is 2

This will be done for the training, validation, and test images. Each .lst file will be saved in its corresponding folder in the S3 bucket.

In [46]:
import boto3
import s3fs
import pandas as pd

# S3 location of the training images
bucket_name = 'project508data'
folder_prefix = 'train/'

# list_objects_v2 returns results in pages, so go through a paginator
s3_client = boto3.client('s3')
paginator = s3_client.get_paginator('list_objects_v2')

# Collect the key of every object found under the prefix
filenames = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix)
    for obj in page.get('Contents', [])
]

# One row per object, keeping only the last path component of each key
df = pd.DataFrame({'Filename': [key.split('/')[-1] for key in filenames]})

# Keyword -> class label, checked in this order; the first keyword found in
# the filename wins (Covid is 0, Normal is 1, Pneumonia is 2)
label_by_keyword = (('covid', 0), ('normal', 1), ('pneumonia', 2))

# A filename containing none of the keywords gets None
labels = [
    next((label for keyword, label in label_by_keyword if keyword in name), None)
    for name in df['Filename']
]
df['Label'] = labels

# Discard the unlabelled rows and renumber the remaining ones from 0
df = df.dropna().reset_index(drop=True)

# Put the label column in front of the filename column
columns = df.columns.tolist()[::-1]

# Labels become floats when None values were present; cast them back to int
df = df[columns].astype({'Label': int})

# Display the DataFrame
print(df)


      Label             Filename
0         0       covid_100.jpeg
1         0      covid_1000.jpeg
2         0      covid_1001.jpeg
3         0      covid_1002.jpeg
4         0      covid_1003.jpeg
...     ...                  ...
2684      2  pneumonia_1895.jpeg
2685      2  pneumonia_1896.jpeg
2686      2  pneumonia_1897.jpeg
2687      2  pneumonia_1898.jpeg
2688      2  pneumonia_1899.jpeg

[2689 rows x 2 columns]


In [47]:
# Serialise the DataFrame as tab-separated text: the row index is written as
# the first column and no header row is emitted
csv_buffer = df.to_csv(sep='\t', header=False, index=True)
lst_bytes = csv_buffer.encode()

# Upload the encoded text to S3 as the training .lst file
s3 = s3fs.S3FileSystem(anon=False)
with s3.open('s3://project508data/train_groundtruth/train_truth.lst', 'wb') as f:
    f.write(lst_bytes)

In [None]:
# Check if the file has the required contents
import os
import tempfile

# Bucket name and key of the training .lst file in S3
bucket_name = 'project508data'
s3_key = 'train_groundtruth/train_truth.lst'

# Create a Boto3 client for S3
s3_client = boto3.client('s3')

# Download into a private temporary directory instead of a fixed /tmp path:
# the directory (and the downloaded file) is removed when the `with` block
# exits, even if the download or the read raises.
with tempfile.TemporaryDirectory() as tmp_dir:
    local_file_path = os.path.join(tmp_dir, 'file.lst')
    s3_client.download_file(bucket_name, s3_key, local_file_path)

    # The file was uploaded as UTF-8 bytes, so decode it the same way
    with open(local_file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()

# Print the contents of the .lst file
print(file_contents)

## Getting filenames and labels for the Validation dataset

In [49]:
import boto3
import s3fs
import pandas as pd

# Create a boto3 client for S3
s3_client = boto3.client('s3')

# Define the bucket name and folder prefix of the validation images
bucket_name = 'project508data'
folder_prefix = 'validation/'

# list_objects_v2 returns results in pages, so go through a paginator
paginator = s3_client.get_paginator('list_objects_v2')

# Collect the key of every object found under the prefix
filenames = []
for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix):
    filenames.extend(obj['Key'] for obj in page.get('Contents', []))

# One row per object, keeping only the last path component of each key
val_df = pd.DataFrame({'Filename': filenames})
val_df['Filename'] = val_df['Filename'].apply(lambda key: key.split('/')[-1])

# Leading letter of the filename -> class label.
# NOTE(review): presumably x = Covid (0), z = Normal (1), y = Pneumonia (2);
# the letter-to-class meaning is not visible here - confirm against the data.
prefix_to_label = {'x': 0, 'z': 1, 'y': 2}

# Match on the FIRST character only. A plain substring test ('x' in filename)
# would also fire on letters that appear later in the name or in the
# extension and silently mislabel such files. Names that start with none of
# the known letters (including the empty name produced by a bare folder key)
# get None and are dropped below.
labels = [prefix_to_label.get(name[:1]) for name in val_df['Filename']]

# Add the labels list as a new column to the DataFrame
val_df['Label'] = labels

# Drop the unlabelled rows and renumber the remaining ones from 0
val_df = val_df.dropna().reset_index(drop=True)

# Put the label column in front of the filename column
columns = ['Label', 'Filename']

# Labels become floats when None values were present; cast them back to int
val_df = val_df[columns].astype({'Label': int})

# Display the DataFrame
print(val_df)

     Label   Filename
0        0   x_1.jpeg
1        0  x_10.jpeg
2        0  x_11.jpeg
3        0  x_12.jpeg
4        0  x_13.jpeg
..     ...        ...
143      2   y_5.jpeg
144      2   y_6.jpeg
145      2   y_7.jpeg
146      2   y_8.jpeg
147      2   y_9.jpeg

[148 rows x 2 columns]


In [50]:
# Serialise the DataFrame as tab-separated text: the row index is written as
# the first column and no header row is emitted
csv_buffer = val_df.to_csv(sep='\t', header=False, index=True)
lst_bytes = csv_buffer.encode()

# Upload the encoded text to S3 as the validation .lst file
s3 = s3fs.S3FileSystem(anon=False)
with s3.open('s3://project508data/validation_groundtruth/validation_truth.lst', 'wb') as f:
    f.write(lst_bytes)

In [None]:
# Check if the file has the required contents
import os
import tempfile

# Bucket name and key of the validation .lst file in S3
bucket_name = 'project508data'
s3_key = 'validation_groundtruth/validation_truth.lst'

# Create a Boto3 client for S3
s3_client = boto3.client('s3')

# Download into a private temporary directory instead of a fixed /tmp path:
# the directory (and the downloaded file) is removed when the `with` block
# exits, even if the download or the read raises.
with tempfile.TemporaryDirectory() as tmp_dir:
    local_file_path = os.path.join(tmp_dir, 'file.lst')
    s3_client.download_file(bucket_name, s3_key, local_file_path)

    # The file was uploaded as UTF-8 bytes, so decode it the same way
    with open(local_file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()

# Print the contents of the .lst file
print(file_contents)

## Getting filenames and labels for the Test dataset

In [3]:
import boto3
import s3fs
import pandas as pd

# Create a boto3 client for S3
s3_client = boto3.client('s3')

# Define the bucket name and folder prefix of the test images
bucket_name = 'project508data'
folder_prefix = 'test/'

# list_objects_v2 returns results in pages, so go through a paginator
paginator = s3_client.get_paginator('list_objects_v2')

# Collect the key of every object found under the prefix
filenames = []
for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix):
    filenames.extend(obj['Key'] for obj in page.get('Contents', []))

# One row per object, keeping only the last path component of each key
test_df = pd.DataFrame({'Filename': filenames})
test_df['Filename'] = test_df['Filename'].apply(lambda key: key.split('/')[-1])

# Leading letter of the filename -> class label.
# NOTE(review): presumably x = Covid (0), z = Normal (1), y = Pneumonia (2);
# the letter-to-class meaning is not visible here - confirm against the data.
prefix_to_label = {'x': 0, 'z': 1, 'y': 2}

# Match on the FIRST character only. A plain substring test ('x' in filename)
# would also fire on letters that appear later in the name or in the
# extension and silently mislabel such files. Names that start with none of
# the known letters (including the empty name produced by a bare folder key)
# get None and are dropped below.
labels = [prefix_to_label.get(name[:1]) for name in test_df['Filename']]

# Add the labels list as a new column to the DataFrame
test_df['Label'] = labels

# Drop the unlabelled rows and renumber the remaining ones from 0
test_df = test_df.dropna().reset_index(drop=True)

# Put the label column in front of the filename column
columns = ['Label', 'Filename']

# Labels become floats when None values were present; cast them back to int
test_df = test_df[columns].astype({'Label': int})

# Display the DataFrame
print(test_df)

     Label   Filename
0        0   x_1.jpeg
1        0  x_10.jpeg
2        0  x_11.jpeg
3        0  x_12.jpeg
4        0  x_13.jpeg
..     ...        ...
145      1  z_51.jpeg
146      1   z_6.jpeg
147      1   z_7.jpeg
148      1   z_8.jpeg
149      1   z_9.jpeg

[150 rows x 2 columns]


In [8]:
# Persist test_df with IPython's %store magic so that it can be restored in a
# later session or another notebook (via `%store -r test_df`)
%store test_df

Stored 'test_df' (DataFrame)


In [4]:
# Serialise the DataFrame as tab-separated text: the row index is written as
# the first column and no header row is emitted
csv_buffer = test_df.to_csv(sep='\t', header=False, index=True)
lst_bytes = csv_buffer.encode()

# Upload the encoded text to S3 as the test .lst file
s3 = s3fs.S3FileSystem(anon=False)
with s3.open('s3://project508data/test_groundtruth/test_truth.lst', 'wb') as f:
    f.write(lst_bytes)

In [5]:
# Check if the file has the required contents
import os
import tempfile

# Bucket name and key of the test .lst file in S3
bucket_name = 'project508data'
s3_key = 'test_groundtruth/test_truth.lst'

# Create a Boto3 client for S3
s3_client = boto3.client('s3')

# Download into a private temporary directory instead of a fixed /tmp path:
# the directory (and the downloaded file) is removed when the `with` block
# exits, even if the download or the read raises.
with tempfile.TemporaryDirectory() as tmp_dir:
    local_file_path = os.path.join(tmp_dir, 'file.lst')
    s3_client.download_file(bucket_name, s3_key, local_file_path)

    # The file was uploaded as UTF-8 bytes, so decode it the same way
    with open(local_file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()

# Print the contents of the .lst file
print(file_contents)

0	0	x_1.jpeg
1	0	x_10.jpeg
2	0	x_11.jpeg
3	0	x_12.jpeg
4	0	x_13.jpeg
5	0	x_14.jpeg
6	0	x_15.jpeg
7	0	x_16.jpeg
8	0	x_17.jpeg
9	0	x_18.jpeg
10	0	x_19.jpeg
11	0	x_2.jpeg
12	0	x_20.jpeg
13	0	x_21.jpeg
14	0	x_22.jpeg
15	0	x_23.jpeg
16	0	x_24.jpeg
17	0	x_25.jpeg
18	0	x_26.jpeg
19	0	x_27.jpeg
20	0	x_28.jpeg
21	0	x_29.jpeg
22	0	x_3.jpeg
23	0	x_30.jpeg
24	0	x_31.jpeg
25	0	x_32.jpeg
26	0	x_33.jpeg
27	0	x_34.jpeg
28	0	x_35.jpeg
29	0	x_36.jpeg
30	0	x_37.jpeg
31	0	x_38.jpeg
32	0	x_39.jpeg
33	0	x_4.jpeg
34	0	x_40.jpeg
35	0	x_41.jpeg
36	0	x_42.jpeg
37	0	x_43.jpeg
38	0	x_44.jpeg
39	0	x_45.jpeg
40	0	x_46.jpeg
41	0	x_47.jpeg
42	0	x_48.jpeg
43	0	x_49.jpeg
44	0	x_5.jpeg
45	0	x_6.jpeg
46	0	x_7.jpeg
47	0	x_8.jpeg
48	0	x_9.jpeg
49	2	y_1.jpeg
50	2	y_10.jpeg
51	2	y_11.jpeg
52	2	y_12.jpeg
53	2	y_13.jpeg
54	2	y_14.jpeg
55	2	y_15.jpeg
56	2	y_16.jpeg
57	2	y_17.jpeg
58	2	y_18.jpeg
59	2	y_19.jpeg
60	2	y_2.jpeg
61	2	y_20.jpeg
62	2	y_21.jpeg
63	2	y_22.jpeg
64	2	y_23.jpeg
65	2	y_24.jpeg
66	2	y_25.jpeg
67	2	y_26.jpeg
6