# Download Raw Data from S3

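This notebook downloads the three raw network traffic datasets (CIC-IDS2017, TON_IOT, UNSW_NB15) from the `raw/` folder of the `aai-540-final-project-group-5` S3 bucket using unsigned (anonymous) requests. Files are saved under the local data directory with the `raw/` prefix removed, e.g. `raw/CIC-IDS2017/<file>.csv` becomes `<data dir>/CIC-IDS2017/<file>.csv`.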

In [4]:
import boto3
import os
from botocore import UNSIGNED
from botocore.config import Config

In [5]:
# Bucket that holds the raw datasets
bucket_name = 'aai-540-final-project-group-5'

# S3 client configured with UNSIGNED signing, so requests are sent
# without AWS credentials attached
s3 = boto3.client(
    's3',
    config=Config(signature_version=UNSIGNED),
)

In [6]:
# Local base data directory. Defaults to the original SageMaker user path;
# set the BASE_DATA_DIR environment variable to store the data elsewhere.
base_data_dir = os.environ.get('BASE_DATA_DIR', '/home/sagemaker-user/data')
os.makedirs(base_data_dir, exist_ok=True)

folders_to_download = ['raw/CIC-IDS2017/', 'raw/TON_IOT/', 'raw/UNSW_NB15/']

print("Downloading datasets from S3 folders...")

# A single list_objects_v2 call returns at most 1,000 keys, so one request can
# silently miss files in a larger bucket. The paginator follows continuation
# tokens, and Prefix restricts each listing to a folder we actually want.
paginator = s3.get_paginator('list_objects_v2')
files_found = 0

for s3_folder in folders_to_download:
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder):
        # A page for a prefix with no objects has no 'Contents' key
        for obj in page.get('Contents', []):
            file_key = obj['Key']

            # Skip directory markers (0 byte files)
            if obj['Size'] == 0:
                continue

            files_found += 1

            # Remove 'raw/' prefix for local storage
            local_key = file_key.replace('raw/', '', 1)
            local_path = os.path.join(base_data_dir, local_key)

            # Create subdirectories if needed
            os.makedirs(os.path.dirname(local_path), exist_ok=True)

            # Re-running the notebook should not re-fetch hundreds of MB:
            # skip files already on disk whose size matches the S3 object.
            if os.path.isfile(local_path) and os.path.getsize(local_path) == obj['Size']:
                print(f"Already present, skipping: {local_path}")
                print()
                continue

            size_mb = obj['Size'] / (1024 * 1024)
            print(f"Downloading: {file_key} ({size_mb:.2f} MB)")
            s3.download_file(bucket_name, file_key, local_path)
            print(f"  Saved to: {local_path}")
            print()

# Report success only when at least one matching file was found; the original
# printed success whenever the bucket contained any object at all.
if files_found:
    print("All datasets downloaded successfully!")
else:
    print("No files to download.")

Downloading datasets from S3 folders...
Downloading: raw/CIC-IDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv (73.55 MB)
  Saved to: /home/sagemaker-user/data/CIC-IDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv

Downloading: raw/CIC-IDS2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv (73.34 MB)
  Saved to: /home/sagemaker-user/data/CIC-IDS2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv

Downloading: raw/CIC-IDS2017/Friday-WorkingHours-Morning.pcap_ISCX.csv (55.62 MB)
  Saved to: /home/sagemaker-user/data/CIC-IDS2017/Friday-WorkingHours-Morning.pcap_ISCX.csv

Downloading: raw/CIC-IDS2017/Monday-WorkingHours.pcap_ISCX.csv (168.73 MB)
  Saved to: /home/sagemaker-user/data/CIC-IDS2017/Monday-WorkingHours.pcap_ISCX.csv

Downloading: raw/CIC-IDS2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv (79.25 MB)
  Saved to: /home/sagemaker-user/data/CIC-IDS2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv

Downloading: raw/CIC-IDS2017