<a href="https://colab.research.google.com/github/ankit-rathi/DE-with-AWS/blob/main/Try_Kinesis_Data_Firehose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.35.5-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.36.0,>=1.35.5 (from boto3)
  Downloading botocore-1.35.5-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.35.5-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.5-py3-none-any.whl (12.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.10.2-py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.7/82.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00

In [2]:
import os

import pandas as pd
from google.colab import drive

# Attach Google Drive to the Colab VM so files stored there become readable.
drive.mount('/content/drive')

# Work from the Drive folder; later relative paths are resolved against it.
project_path = '/content/drive/My Drive/Personal'
os.chdir(project_path)

Mounted at /content/drive


In [7]:
import boto3
import json
import time

# Credentials are read from a CSV in the current working directory.
# NOTE(review): the file name suggests root-account keys - confirm, and prefer
# a least-privilege IAM user/role if so.
aws_keys_df = pd.read_csv('aws-rootkey.csv')

# The first row of the CSV supplies the key pair and the region.
AWS_ACCESS_KEY_ID = aws_keys_df.loc[0, 'Access_key_ID']
AWS_SECRET_ACCESS_KEY = aws_keys_df.loc[0, 'Secret_access_key']
REGION_NAME = aws_keys_df.loc[0, 'Region']

# One session carries the credentials and region for every client below.
session = boto3.Session(
    region_name=REGION_NAME,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# Service clients used by the rest of the notebook: IAM, Firehose and S3.
iam_client, firehose_client, s3_client = (
    session.client(service) for service in ('iam', 'firehose', 's3')
)


In [8]:
# Name of the IAM role the Firehose delivery stream will assume
role_name = 'FirehoseToS3Role'

# Trust policy: allows the Firehose service principal to assume this role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "firehose.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Create the role, or reuse it when an earlier run already created it.
# Either way `role_arn` ends up holding the ARN reported by IAM itself.
try:
    role = iam_client.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(trust_policy),
        Description='IAM role for Kinesis Firehose to deliver data to S3'
    )
    role_arn = role['Role']['Arn']
    print(f"Created IAM role with ARN: {role_arn}")
except iam_client.exceptions.EntityAlreadyExistsException:
    # Look the role up instead of assembling the ARN by hand: a hand-built
    # string assumes the "aws" partition and the root path "/", and would be
    # wrong for an existing role that lives under a different path.
    role = iam_client.get_role(RoleName=role_name)
    role_arn = role['Role']['Arn']
    print(f"IAM role {role_name} already exists. ARN: {role_arn}")


Created IAM role with ARN: arn:aws:iam::419441991443:role/FirehoseToS3Role


In [9]:
# Name of the destination bucket (S3 bucket names are globally unique)
bucket_name = 'my-bucket-rathakt'

# Build the request. S3 rejects 'us-east-1' as a LocationConstraint because it
# is the default region, so the configuration is only sent for other regions.
create_bucket_kwargs = {'Bucket': bucket_name}
if REGION_NAME != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': REGION_NAME}

# Create the bucket; a bucket this account already owns is treated as success.
# Any other failure (e.g. the name is taken by another account) still raises.
try:
    s3_client.create_bucket(**create_bucket_kwargs)
    print(f"S3 bucket {bucket_name} created successfully.")
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    print(f"S3 bucket {bucket_name} already exists.")

S3 bucket my-bucket-rathakt already exists.


In [10]:
# Name of the Firehose delivery stream
delivery_stream_name = 'MyFirehoseStream'

bucket_arn = f'arn:aws:s3:::{bucket_name}'

# The trust policy only lets Firehose *assume* the role; without a permissions
# policy the role cannot write to the bucket and nothing is ever delivered.
# put_role_policy overwrites a same-named inline policy, so re-running is safe.
s3_access_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:AbortMultipartUpload",
                "s3:GetBucketLocation",
                "s3:GetObject",
                "s3:ListBucket",
                "s3:ListBucketMultipartUploads",
                "s3:PutObject"
            ],
            "Resource": [bucket_arn, f"{bucket_arn}/*"]
        }
    ]
}
iam_client.put_role_policy(
    RoleName=role_name,
    PolicyName='FirehoseS3DeliveryAccess',
    PolicyDocument=json.dumps(s3_access_policy)
)

# Create the delivery stream, tolerating a stream left over from an earlier
# run (same convention as the role and bucket cells).
try:
    response = firehose_client.create_delivery_stream(
        DeliveryStreamName=delivery_stream_name,
        S3DestinationConfiguration={
            'BucketARN': bucket_arn,
            # Use the ARN resolved when the role was created/looked up rather
            # than a literal tied to one AWS account.
            'RoleARN': role_arn,
            'Prefix': 'firehose-data/',
            # Firehose flushes to S3 when either threshold is reached, so small
            # test batches can take up to IntervalInSeconds to show up.
            'BufferingHints': {
                'SizeInMBs': 5,
                'IntervalInSeconds': 300
            },
            'CompressionFormat': 'UNCOMPRESSED'
        }
    )
    print(f"Firehose delivery stream {delivery_stream_name} created successfully.")
except firehose_client.exceptions.ResourceInUseException:
    print(f"Firehose delivery stream {delivery_stream_name} already exists.")

# Creation is asynchronous: wait (up to ~5 minutes) for the stream to leave
# CREATING so the following cells can put records on a fresh "Run All".
for _ in range(60):
    stream_status = firehose_client.describe_delivery_stream(
        DeliveryStreamName=delivery_stream_name
    )['DeliveryStreamDescription']['DeliveryStreamStatus']
    if stream_status != 'CREATING':
        break
    time.sleep(5)

if stream_status != 'ACTIVE':
    print(f"Firehose delivery stream {delivery_stream_name} is not active yet (status: {stream_status}).")

Firehose delivery stream MyFirehoseStream created successfully.


In [13]:

# Put five small JSON records into the Firehose delivery stream.
for i in range(5):
    # Firehose concatenates buffered records verbatim when it writes the S3
    # object, so each record is newline-terminated to keep them separable
    # (newline-delimited JSON) at the destination.
    data = json.dumps({'index': i, 'message': f'This is record {i}'}) + '\n'
    firehose_client.put_record(
        DeliveryStreamName=delivery_stream_name,
        # The Data field is a binary blob; encode explicitly as UTF-8 bytes.
        Record={'Data': data.encode('utf-8')}
    )
    print(f"Put record {i} into Firehose stream {delivery_stream_name}.")


Put record 0 into Firehose stream MyFirehoseStream.
Put record 1 into Firehose stream MyFirehoseStream.
Put record 2 into Firehose stream MyFirehoseStream.
Put record 3 into Firehose stream MyFirehoseStream.
Put record 4 into Firehose stream MyFirehoseStream.


In [17]:
# Check whether Firehose has written anything under the delivery prefix yet.
objects = s3_client.list_objects_v2(Bucket=bucket_name, Prefix='firehose-data/')

# The response only carries a 'Contents' entry when keys matched the prefix.
if 'Contents' not in objects:
    print(f"No objects found in S3 bucket {bucket_name} yet.")
else:
    for key in (entry['Key'] for entry in objects['Contents']):
        print(f"Found object {key} in S3 bucket {bucket_name}.")


No objects found in S3 bucket my-bucket-rathakt yet.


In [18]:
# Delete the Firehose delivery stream first so nothing new is written to S3.
firehose_client.delete_delivery_stream(DeliveryStreamName=delivery_stream_name, AllowForceDelete=True)
print(f"Firehose delivery stream {delivery_stream_name} deleted.")

# S3 refuses to delete a non-empty bucket, so remove what this notebook had
# Firehose deliver. Only keys under the delivery prefix are touched; any other
# data in the bucket is left alone and will still block the delete below.
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix='firehose-data/'):
    delivered_keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
    if delivered_keys:
        # A page holds at most 1000 keys, which is also the delete_objects limit.
        s3_client.delete_objects(Bucket=bucket_name, Delete={'Objects': delivered_keys})

# Delete the (now empty) S3 bucket
s3_client.delete_bucket(Bucket=bucket_name)
print(f"S3 bucket {bucket_name} deleted.")


Firehose delivery stream MyFirehoseStream deleted.
S3 bucket my-bucket-rathakt deleted.
