In [1]:
# Install required packages
!pip install PyAthena[SQLAlchemy]
!pip install CurrencyConverter

# Import necessary libraries
import numpy as np
import pandas as pd
import json
from datetime import datetime, timedelta
from sqlalchemy import create_engine
from currency_converter import CurrencyConverter, ECB_URL
import tr_dash_util as util
from tr_dash_util import clean_eReg_dataframe
import pytz
import boto3

# Define constants and configurations
# S3 location passed to Athena as the `s3_staging_dir` connection parameter.
S3_STAGING_DIR = "s3://ets-aws-plalab-dii-prod-analyticsbucket-1ktrlhzbrcbkb/athena_query_results/"
# AWS region used to build the Athena endpoint host name.
ATHENA_REGION = "us-east-1"
# Row filters applied in the WHERE clause of the per-day query
# (application_id, env and user_type columns respectively).
APPLICATION_ID = '01845c7c-fa6d-4788-9ddd-cd888c977f36'
ENV = 'prod'
USER_TYPE = 'external_user'
# Event names to extract; the per-day query filters on the event_name column.
EVENT_NAMES = ['ProfileCreated']
# First and last calendar day to fetch; the driver loop treats both ends as inclusive.
START_DATE = datetime(2024, 2, 1)
END_DATE = datetime(2024, 5, 27)

# Function to create engine connection to Athena
def create_engine_connection():
    """Build and return a SQLAlchemy engine for the Athena events database.

    The URL uses the ``awsathena+rest`` dialect, the endpoint for
    ``ATHENA_REGION`` and ``S3_STAGING_DIR`` as the ``s3_staging_dir`` query
    parameter. The user and password parts of the URL are left empty, so no
    credentials are embedded in the connection string.
    NOTE(review): presumably credentials are resolved from the ambient AWS
    environment -- confirm for the runtime this notebook executes in.

    Returns:
        The engine object produced by ``sqlalchemy.create_engine``.
    """
    # Local import keeps this block self-contained without touching the
    # notebook's import cell.
    from urllib.parse import quote_plus

    # The staging location is itself a URI ("s3://bucket/path/"); it must be
    # percent-encoded before being embedded in another URL's query string so
    # that its ':' and '/' characters (or any future '&', '+', '%', spaces)
    # cannot be misread as URL syntax.
    encoded_staging_dir = quote_plus(S3_STAGING_DIR)
    connection_string = (
        f"awsathena+rest://:@athena.{ATHENA_REGION}.amazonaws.com:443/"
        f"labsprodeventsdatabase-x806vjuzpbrd?s3_staging_dir={encoded_staging_dir}"
    )
    return create_engine(connection_string)

# Function to fetch data for a specific date from Athena
def fetch_data_for_date(engine, date):
    """Fetch one day's events from Athena and return the cleaned DataFrame.

    Selects every column from ``processed_events`` where the ``year``,
    ``month`` and ``day`` columns match ``date`` (compared as zero-padded
    strings) and the row matches the module-level ``APPLICATION_ID``, ``ENV``,
    ``USER_TYPE`` and ``EVENT_NAMES`` filters. The raw result is passed
    through ``clean_eReg_dataframe`` before being returned.

    Args:
        engine: SQLAlchemy engine/connection accepted by ``pandas.read_sql``.
        date: Object with ``strftime`` (e.g. ``datetime``) naming the day.

    Returns:
        Whatever ``clean_eReg_dataframe`` returns for that day's rows.

    Raises:
        ValueError: If ``EVENT_NAMES`` is empty, which would otherwise
            produce an invalid ``IN ()`` clause.
    """
    if not EVENT_NAMES:
        raise ValueError("EVENT_NAMES must contain at least one event name")

    year, month, day = date.strftime("%Y"), date.strftime("%m"), date.strftime("%d")
    # Build the IN list from the EVENT_NAMES constant so that editing the
    # constant actually changes the query. Values are trusted module-level
    # configuration; embedded single quotes are doubled so the SQL literal
    # stays well-formed.
    event_list = ", ".join(
        "'" + str(name).replace("'", "''") + "'" for name in EVENT_NAMES
    )
    query = f"""
    SELECT * FROM processed_events
    WHERE application_id='{APPLICATION_ID}'
    AND year='{year}' AND month='{month}' AND day='{day}'
    AND env = '{ENV}' AND user_type = '{USER_TYPE}'
    AND event_name IN ({event_list})
    """
    df = pd.read_sql(query, engine)
    df_clean = clean_eReg_dataframe(df)
    return df_clean

# Function to generate a range of dates
def daterange(start_date, end_date):
    """Yield each day from ``start_date`` through ``end_date``, inclusive.

    Values are produced by adding 0, 1, 2, ... whole days to ``start_date``,
    so any time-of-day component on ``start_date`` is carried through. The
    number of steps is based on the whole-day part of the difference only.
    Nothing is yielded when ``end_date`` is at least a full day earlier than
    ``start_date``.
    """
    whole_days = int((end_date - start_date).days)
    offset = 0
    while offset <= whole_days:
        yield start_date + timedelta(offset)
        offset += 1

# Main script execution
if __name__ == "__main__":
    # Driver: pull one query result per calendar day between START_DATE and
    # END_DATE, stack them, and export the profile-creation timestamps.
    engine = create_engine_connection()
    all_data_frames = []
    # One Athena query per day; the printed line is the progress indicator.
    for single_date in daterange(START_DATE, END_DATE):
        print(f"Fetching data for {single_date.strftime('%Y-%m-%d')}...")
        daily_data = fetch_data_for_date(engine, single_date)
        all_data_frames.append(daily_data)
    # Stack the per-day frames into one frame with a fresh 0..n-1 index.
    profile_data_frame = pd.concat(all_data_frames, ignore_index=True)
    
    # Extract relevant columns for profile creation data
    # NOTE(review): assumes the cleaned frames expose 'user_id' and
    # 'event_timestamp' columns -- confirm against clean_eReg_dataframe.
    profile_creation_data = profile_data_frame[['user_id', 'event_timestamp']].copy()
    profile_creation_data.rename(columns={'event_timestamp': 'profile_created_time'}, inplace=True)
    
    # Save the profile creation DataFrame to a CSV file for verification
    # (written to the current working directory, without the index column).
    profile_creation_data.to_csv("profile_created_data.csv", index=False)
    print("Profile creation data fetched and saved to CSV.")





  engine = create_engine(connection_string)


Fetching data for 2024-02-01...
Fetching data for 2024-02-02...
Fetching data for 2024-02-03...
Fetching data for 2024-02-04...
Fetching data for 2024-02-05...
Fetching data for 2024-02-06...
Fetching data for 2024-02-07...
Fetching data for 2024-02-08...
Fetching data for 2024-02-09...
Fetching data for 2024-02-10...
Fetching data for 2024-02-11...
Fetching data for 2024-02-12...
Fetching data for 2024-02-13...
Fetching data for 2024-02-14...
Fetching data for 2024-02-15...
Fetching data for 2024-02-16...
Fetching data for 2024-02-17...
Fetching data for 2024-02-18...
Fetching data for 2024-02-19...
Fetching data for 2024-02-20...
Fetching data for 2024-02-21...
Fetching data for 2024-02-22...
Fetching data for 2024-02-23...
Fetching data for 2024-02-24...
Fetching data for 2024-02-25...
Fetching data for 2024-02-26...
Fetching data for 2024-02-27...
Fetching data for 2024-02-28...
Fetching data for 2024-02-29...
Fetching data for 2024-03-01...
Fetching data for 2024-03-02...
Fetching