# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain the SparkContext for this session (getOrCreate reuses an active one if present).
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext and expose its SparkSession as `spark`.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Glue Job object bound to the same context; it is not referenced again in the cells visible here.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 5149e686-4b61-4322-a0a7-d38a3119af37
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 5149e686-4b61-4322-a0a7-d38a3119af37 to get into ready status...
Session 5149e686-4b61-4322-a0a7-d38a3119af37 ha

In [2]:
# Import necessary libraries
import boto3
import zipfile
import os
from io import BytesIO




In [3]:
import boto3
from datetime import datetime, timedelta

# Initialize the Boto3 S3 client once at module level; list_and_filter_files,
# download_files and the final upload cell all read this shared `s3_client` global.
s3_client = boto3.client('s3')

# Substrings to look for in S3 object keys: a key is selected when it
# contains at least one of these (in addition to a date in the requested range).
patterns = [
    "CAM0557", "CAM4417", "CAM0636", "CAM4873",
    "ARL0156", "ARL0219", "ARL0562", "ARL0127", "ARL2620",
    "CAM0794",
    "SOM0315", "SOM0603", "SOM0827",
]

# Function to handle pagination and filter files
def list_and_filter_files(bucket_name, prefix, start_date, end_date, name_patterns=None):
    """List objects under an S3 prefix and keep those matching a date and a pattern.

    A key is kept when it contains at least one date string from the inclusive
    ``start_date``..``end_date`` range (formatted ``YYYY_MM_DD``) AND at least
    one of the pattern substrings.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix passed to ``list_objects_v2``.
        start_date: First day of the range, as a ``'%Y_%m_%d'`` string.
        end_date: Last day of the range (inclusive), as a ``'%Y_%m_%d'`` string.
        name_patterns: Optional iterable of substrings to match. When omitted,
            the module-level ``patterns`` list is used.

    Returns:
        A list of ``s3://bucket/key`` strings, in listing order. Empty when
        ``end_date`` is before ``start_date`` or when there are no patterns.

    Raises:
        ValueError: If either date string does not match ``'%Y_%m_%d'``.
    """
    if name_patterns is None:
        name_patterns = patterns  # fall back to the notebook-level default
    name_patterns = tuple(name_patterns)

    # Parse into separate locals so the string parameters are not shadowed
    first_day = datetime.strptime(start_date, '%Y_%m_%d')
    last_day = datetime.strptime(end_date, '%Y_%m_%d')

    # One 'YYYY_MM_DD' string per day in the inclusive range
    target_dates = [(first_day + timedelta(days=offset)).strftime('%Y_%m_%d')
                    for offset in range((last_day - first_day).days + 1)]

    # Nothing can match: skip the (potentially very long) bucket listing
    if not target_dates or not name_patterns:
        return []

    paginator = s3_client.get_paginator('list_objects_v2')
    filtered_file_paths = []

    # Iterate over paginated results; a page without 'Contents' has no objects
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get('Contents', []):
            file_path = obj['Key']

            # Keep keys containing any target date and any pattern
            if any(target_date in file_path for target_date in target_dates) and \
               any(pattern in file_path for pattern in name_patterns):
                filtered_file_paths.append(f"s3://{bucket_name}/{file_path}")

    return filtered_file_paths

# Run the search over every object under the prefix, for the inclusive
# start_date .. end_date window, and report how many keys matched.
bucket_name = 'mbta-tsp-signal'
prefix = 'csv/'
start_date, end_date = '2025_01_01', '2025_02_10'

file_paths = list_and_filter_files(
    bucket_name=bucket_name,
    prefix=prefix,
    start_date=start_date,
    end_date=end_date,
)
print(f"Filtered File Paths: {len(file_paths)} found")

Filtered File Paths: 17616 found


In [7]:
# import boto3

# # Initialize Boto3 client for S3
# s3_client = boto3.client('s3')

# # Define the patterns you're looking for
# patterns = [
#     'BRKL0001', 'BRKL0002', 'BRKL0003', 'BRKL0004', 'BRKL0005', 'BRKL0006',
#     'MALD0001', 'MALD0002', 'MALD0003', 'MALD0004', 'MALD0005', 'MALD0006',
#     'MALD0007', 'SMVL0001'
# ]

# # Function to handle pagination and filter files
# def list_and_filter_files(bucket_name, prefix, target_date='2024_10_03'):
#     paginator = s3_client.get_paginator('list_objects_v2')
#     filtered_file_paths = []
    
#     # Iterate over paginated results
#     for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
#         if 'Contents' in page:
#             for obj in page['Contents']:
#                 file_path = obj['Key']
                
#                 # Filter for files containing the target date and matching patterns for 08:00 or 16:00
#                 if (target_date in file_path) and \
#                    ('_0800.csv' in file_path or '_1600.csv' in file_path) and \
#                    any(pattern in file_path for pattern in patterns):
#                     filtered_file_paths.append(f"s3://{bucket_name}/{file_path}")
    
#     return filtered_file_paths

# # Specify the S3 bucket and prefix (if any)
# bucket_name = 'mbta-tsp-signal'
# prefix = 'csv/'

# # Get the filtered file paths for the target date
# file_paths = list_and_filter_files(bucket_name, prefix, target_date='2024_10_03')
# print(f"Filtered File Paths: {len(file_paths)} found")


Filtered File Paths: 28 found


In [5]:
# Drop every path ending in 'detail.csv'; the cell's displayed value is the
# number of paths that remain.
filtered_s3_paths = [
    s3_path
    for s3_path in file_paths
    if not s3_path.endswith('detail.csv')
]
len(filtered_s3_paths)

8808


In [9]:
# Function to download files from S3 without extra libraries
def download_files(file_paths, local_dir):
    """Download S3 objects into ``local_dir``, one local file per object.

    Each object is fetched with ``GetObject`` on the module-level ``s3_client``
    and streamed to disk in chunks, so an object is never held in memory whole.

    Args:
        file_paths: Iterable of ``s3://bucket/key`` style paths.
        local_dir: Destination directory; created (including parents) if missing.

    Raises:
        ValueError: If a path has no bucket or no object key.

    Note:
        The local file name is the last segment of the path, so two keys that
        share a final segment write to the same local file and the later one wins.
    """
    import shutil  # local import: only this function needs it

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(local_dir, exist_ok=True)

    for file_path in file_paths:
        # 's3://bucket/some/key' -> ['s3:', '', 'bucket', 'some/key']
        parts = file_path.split('/', 3)
        if len(parts) < 4 or not parts[2] or not parts[3]:
            raise ValueError(f"Expected a path like s3://bucket/key, got: {file_path!r}")
        bucket_name, key = parts[2], parts[3]
        local_filename = os.path.join(local_dir, os.path.basename(file_path))

        # Use GetObject to download the file
        response = s3_client.get_object(Bucket=bucket_name, Key=key)
        body = response['Body']

        # Stream to disk and always release the underlying HTTP connection
        try:
            with open(local_filename, 'wb') as file:
                shutil.copyfileobj(body, file)
        finally:
            body.close()
        print(f"Downloaded: {local_filename}")

# Specify local directory to store files (download_files creates it if missing)
local_directory = '/tmp/downloaded_files/'

# Download the files
# NOTE(review): this passes `file_paths`, not the `filtered_s3_paths` list built
# in the earlier cell that excludes '*detail.csv' — confirm which list is intended.
download_files(file_paths, local_directory)


Downloaded: /tmp/downloaded_files/BRKL0001_SIEM_172.20.10.13_2024_10_03_0800.csv
Downloaded: /tmp/downloaded_files/BRKL0001_SIEM_172.20.10.13_2024_10_03_1600.csv
Downloaded: /tmp/downloaded_files/BRKL0002_SIEM_172.20.10.23_2024_10_03_0800.csv
Downloaded: /tmp/downloaded_files/BRKL0002_SIEM_172.20.10.23_2024_10_03_1600.csv
Downloaded: /tmp/downloaded_files/BRKL0003_SIEM_172.20.10.33_2024_10_03_0800.csv
Downloaded: /tmp/downloaded_files/BRKL0003_SIEM_172.20.10.33_2024_10_03_1600.csv
Downloaded: /tmp/downloaded_files/BRKL0004_SIEM_172.20.10.43_2024_10_03_0800.csv
Downloaded: /tmp/downloaded_files/BRKL0004_SIEM_172.20.10.43_2024_10_03_1600.csv
Downloaded: /tmp/downloaded_files/BRKL0005_SIEM_172.20.10.53_2024_10_03_0800.csv
Downloaded: /tmp/downloaded_files/BRKL0005_SIEM_172.20.10.53_2024_10_03_1600.csv
Downloaded: /tmp/downloaded_files/BRKL0006_SIEM_172.20.10.63_2024_10_03_0800.csv
Downloaded: /tmp/downloaded_files/BRKL0006_SIEM_172.20.10.63_2024_10_03_1600.csv
Downloaded: /tmp/downloaded_

In [10]:
# Function to zip the downloaded files
def zip_files(local_dir, zip_filename):
    """Zip every file under ``local_dir`` into an in-memory archive.

    Args:
        local_dir: Directory to walk recursively.
        zip_filename: Unused; kept so existing callers keep working. The
            archive is built in memory only and never written under this name.

    Returns:
        A ``BytesIO`` holding the finished zip, rewound to position 0 so both
        ``.read()`` and ``.getvalue()`` return the whole archive.

    Note:
        Entries are stored under their path relative to ``local_dir``. For a
        flat directory that is just the file name; for nested directories it
        stops same-named files in different sub-folders from colliding.
    """
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for root, dirs, files in os.walk(local_dir):
            dirs.sort()  # in-place sort makes os.walk descend in a stable order
            for file in sorted(files):
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, local_dir)
                zip_file.write(file_path, arcname)
    zip_buffer.seek(0)
    return zip_buffer

# Zip the files into an in-memory buffer.
# The second argument is not used by zip_files as written; the object name in S3
# comes from output_key below.
zip_buffer = zip_files(local_directory, 'downloaded_files.zip')

# Upload the zip file to S3 as a single PutObject call (whole archive held in memory).
# NOTE(review): bucket name and key are hard-coded here — confirm before reusing in another account.
output_bucket = 'aws-glue-assets-992382490096-us-east-1'
output_key = 'glue-logs/downloaded_files.zip'
s3_client.put_object(Bucket=output_bucket, Key=output_key, Body=zip_buffer.getvalue())
print(f"Uploaded zip to s3://{output_bucket}/{output_key}")


Uploaded zip to s3://aws-glue-assets-992382490096-us-east-1/glue-logs/downloaded_files.zip
