<a href="https://colab.research.google.com/github/ankit-rathi/AR-Talks/blob/master/tmp_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install boto3 library
!pip install boto3



In [2]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')
import os

import pandas as pd

project_path = '/content/drive/My Drive/Personal'
os.chdir(project_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import boto3

# S3 and Kafka configurations
# Load AWS credentials from CSV
aws_keys_df = pd.read_csv('aws-rootkey.csv')

S3_BUCKET_NAME = 'my-bucket-ar'
AWS_ACCESS_KEY_ID = aws_keys_df['Access_key_ID'][0]
AWS_SECRET_ACCESS_KEY = aws_keys_df['Secret_access_key'][0]
REGION_NAME = aws_keys_df['Region'][0]

# Initialize S3 client
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)


# Create S3 Bucket
def create_s3_bucket(bucket_name):
    try:
        response = s3_client.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': REGION_NAME}
        )
        print(f"Bucket '{bucket_name}' created successfully.")
    except Exception as e:
        print(f"Error creating bucket: {str(e)}")

create_s3_bucket(S3_BUCKET_NAME)

Error creating bucket: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [4]:
import pandas as pd
import json
import os
import io
import zipfile

# Parse the request body JSON for zip file names
def parse_request_body(request_body):
    try:
        request_data = json.loads(request_body)
        zip_files = request_data.get("zip_files", [])
        request_id = request_data.get("request_id", "default_request")
        return zip_files, request_id
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {e}")

# Download a single zip file from S3 (keep in memory as binary)
def download_zip_file_from_s3(zip_file):
    response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=zip_file)
    zip_file_binary = response['Body'].read()  # Keep as binary data
    print(f"Downloaded {zip_file} as binary")
    return zip_file_binary

# Unzip a single file from its binary content
def unzip_file(zip_file_binary, zip_file_name, request_id):
    unzipped_files = []
    with zipfile.ZipFile(io.BytesIO(zip_file_binary), 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            with zip_ref.open(file_name) as file:
                binary_file_content = file.read()
            unzipped_files.append((file_name, binary_file_content))
    print(f"Unzipped {zip_file_name} in memory")
    return unzipped_files

# Rename a single unzipped file and collect details
def rename_unzipped_file(file_name, zip_file_name, request_id, file_category, datastore_id, binary_file_content):
    new_file_name = f"{request_id}_{os.path.basename(zip_file_name)}_{file_name}".replace(" ", "_").replace(".zip", "")
    new_file_type = new_file_name.split('.')[-1]
    file_message_dict = {"old_file_name": file_name, "parent_zipfilename": zip_file_name}
    print(f"Renamed {file_name} to {new_file_name}")
    return new_file_name, new_file_type, file_category, datastore_id, file_message_dict, binary_file_content

# Append file details to dataframe
def append_to_dataframe(df, file_details):
    new_row = pd.DataFrame([file_details], columns=["file_name", "file_type", "file_category", "datastore_id", "file_message_dict", "binary_file_content"])
    return pd.concat([df, new_row], ignore_index=True)

# Main workflow
def process_zip_files_workflow(request_body):
    try:
        # Parse the request body
        zip_files, request_id = parse_request_body(request_body)
        zip_files = zip_files.split(',')

        file_category = 'unstructured'
        datastore_id = 'DS100'

        # Initialize an empty dataframe
        output_df = pd.DataFrame(columns=["file_name", "file_type", "file_category", "datastore_id", "file_message_dict", "binary_file_content"])

        # Loop over each zip file
        for zip_file in zip_files:
            # Download the zip file from S3
            zip_file_binary = download_zip_file_from_s3(zip_file)

            # Unzip the file and process each unzipped file
            unzipped_files = unzip_file(zip_file_binary, zip_file, request_id)

            # Loop through each unzipped file for renaming and appending to the dataframe
            for file_name, binary_file_content in unzipped_files:
                renamed_file_details = rename_unzipped_file(file_name, zip_file, request_id, file_category, datastore_id, binary_file_content)
                output_df = append_to_dataframe(output_df, renamed_file_details)

        # Optionally: Delete zip files from S3 after processing
        # for zip_file in zip_files:
        #     delete_zip_file_from_s3(zip_file)

        # Get comma-separated new file names
        new_file_names = ','.join(output_df['file_name'].tolist())
        print(new_file_names)

        print("Process completed successfully!")
        return output_df

    except Exception as e:
        print(f"Error occurred during process: {e}")



In [5]:
request_body = '{"request_id":"request_123","zip_files": "file1.zip,file2.zip"}'
output_df = process_zip_files_workflow(request_body)
output_df

Downloaded file1.zip as binary
Unzipped file1.zip in memory
Renamed pqr.txt to request_123_file1_pqr.txt
Renamed abc.wav to request_123_file1_abc.wav
Downloaded file2.zip as binary
Unzipped file2.zip in memory
Renamed xyz.mp3 to request_123_file2_xyz.mp3
request_123_file1_pqr.txt,request_123_file1_abc.wav,request_123_file2_xyz.mp3
Process completed successfully!


Unnamed: 0,file_name,file_type,file_category,datastore_id,file_message_dict,binary_file_content
0,request_123_file1_pqr.txt,txt,unstructured,DS100,"{'old_file_name': 'pqr.txt', 'parent_zipfilena...",b''
1,request_123_file1_abc.wav,wav,unstructured,DS100,"{'old_file_name': 'abc.wav', 'parent_zipfilena...",b''
2,request_123_file2_xyz.mp3,mp3,unstructured,DS100,"{'old_file_name': 'xyz.mp3', 'parent_zipfilena...",b''
