In [None]:
import boto3
import pandas as pd
from io import StringIO
import configparser
import time

In [None]:
# Parse the local "aws.conf" INI file that holds the AWS and S3 settings.
# read() returns the list of files it managed to parse, which becomes the
# cell output; a missing file is not an error at this point.
config = configparser.ConfigParser()
config.read(filenames="aws.conf")

In [None]:
# Credentials and region come from the [AWS] section of the parsed config.
AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION = (
    config["AWS"][option]
    for option in ("AWS_ACCESS_KEY", "AWS_SECRET_KEY", "AWS_REGION")
)

# Athena schema name and the S3 locations come from the [S3] section.
SCHEMA_NAME, S3_STAGING_DIR, S3_BUCKET_NAME, S3_OUTPUT_DIRECTORY = (
    config["S3"][option]
    for option in (
        "SCHEMA_NAME",
        "S3_STAGING_DIR",
        "S3_BUCKET_NAME",
        "S3_OUTPUT_DIRECTORY",
    )
)

In [None]:
# Athena client authenticated with the key pair and region read from the config.
athena_client = boto3.client(
    service_name="athena",
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)

In [None]:
def download_and_load_query_results(
    client,
    query_response: dict,
    poll_interval: float = 1.0,
) -> pd.DataFrame:
    """Wait for an Athena query to finish and load its CSV result from S3.

    Args:
        client: boto3 Athena client used to poll the query status.
        query_response: Response returned by ``start_query_execution``; only
            its ``"QueryExecutionId"`` entry is read.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The query result file parsed with ``pd.read_csv``.

    Raises:
        RuntimeError: If the query ends in the FAILED or CANCELLED state.

    Notes:
        The S3 client is built from the module-level ``AWS_ACCESS_KEY``,
        ``AWS_SECRET_KEY`` and ``AWS_REGION``; the object is read from
        ``S3_BUCKET_NAME`` under ``S3_OUTPUT_DIRECTORY``. The file is
        downloaded to ``athena_query_results.csv`` in the working directory,
        overwriting any previous download.
    """
    query_execution_id = query_response["QueryExecutionId"]

    # Poll the execution state rather than calling get_query_results in a
    # tight loop and string-matching its error message: the state is explicit,
    # and a sane interval avoids hammering (and being throttled by) the API.
    while True:
        execution = client.get_query_execution(QueryExecutionId=query_execution_id)
        status = execution["QueryExecution"]["Status"]
        state = status["State"]
        if state == "SUCCEEDED":
            break
        if state in ("FAILED", "CANCELLED"):
            reason = status.get("StateChangeReason", "no reason reported")
            raise RuntimeError(
                f"Athena query {query_execution_id} ended in state {state}: {reason}"
            )
        time.sleep(poll_interval)

    # get_query_results only returns the first 1000 rows per call, so the
    # complete CSV that Athena wrote to S3 is downloaded instead.
    temp_file_location: str = "athena_query_results.csv"
    s3_client = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION,
    )
    s3_client.download_file(
        S3_BUCKET_NAME,
        f"{S3_OUTPUT_DIRECTORY}/{query_execution_id}.csv",
        temp_file_location,
    )
    return pd.read_csv(temp_file_location)

In [None]:
# Start one sample query to inspect what start_query_execution returns.
response = athena_client.start_query_execution(
    QueryString="select * from countrycode",
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)
# df_data = download_and_load_query_results(athena_client, response)

# Example of the response format returned after starting an Athena query:
# {'QueryExecutionId': '3c111625-3fd8-4bcc-92f1-ac24c61116f2',
#  'ResponseMetadata': {'RequestId': '73ef74da-4e1d-41fb-984f-9216bf06f1d6',
#   'HTTPStatusCode': 200,
#   'HTTPHeaders': {'date': 'Mon, 12 Feb 2024 22:47:14 GMT',
#    'content-type': 'application/x-amz-json-1.1',
#    'content-length': '59',
#    'connection': 'keep-alive',
#    'x-amzn-requestid': '73ef74da-4e1d-41fb-984f-9216bf06f1d6'},
#   'RetryAttempts': 0}}

# Display the response of this run; the sample above is kept as a comment so
# the cell output can never show a stale, hard-coded dict.
response

In [None]:
# Source tables to pull from Athena; each one is queried in full and the
# resulting DataFrame is stored in `result` under the table's name.
table = [
    "countrycode",
    "countypopulation",
    "enigma_jhud",
    "rearc_usa_hospital_beds",
    "state_abv",
    "states_daily",
    "us_county",
    "us_daily",
    "us_states",
    "us_total_latest",
]
result = {}
for table_name in table:
    response = athena_client.start_query_execution(
        QueryString=f"select * from {table_name}",
        QueryExecutionContext={"Database": SCHEMA_NAME},
        ResultConfiguration={
            "OutputLocation": S3_STAGING_DIR,
            "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
        },
    )
    # Waits for the query to complete, then loads its CSV output.
    result[table_name] = download_and_load_query_results(athena_client, response)

In [None]:
# Print each table's name followed by its first rows and a blank separator line.
for key, val in result.items():
    print(f"Table name: {key}", val.head(), "", sep="\n")

Looking at the printed tables above, the header of the "state_abv" table is displayed as its first data record instead of as column names.
Let's fix that with the following code.

In [None]:
# Take the first row of "state_abv" (position 0) to use as the column names.
new_header = result["state_abv"].iloc[0]

In [None]:
# Inspect the values that were picked up as the header row.
new_header

In [None]:
# Promote the first record of "state_abv" to the column header and remove it
# from the data. The check on index label 0 keeps the cell idempotent: once
# the header row has been dropped, re-running the cell is a no-op instead of
# raising KeyError from drop(0).
state_abv = result["state_abv"]
if 0 in state_abv.index:
    # drop() returns a new frame, so the previously displayed frame is not
    # modified in place.
    state_abv = state_abv.drop(0)
    state_abv.columns = new_header
    result["state_abv"] = state_abv
result["state_abv"].head()

In [None]:
# Preview the first rows of the "enigma_jhud" table.
result['enigma_jhud'].head() 

In [None]:
# Preview the first rows of the "us_daily" table.
result['us_daily'].head()

In [None]:
# Fact table: selected columns of "enigma_jhud" inner-joined with selected
# columns of "states_daily" on the shared 'fips' column.
fact_covid_1 = result['enigma_jhud'].loc[
    :,
    ['fips', 'province_state', 'country_region', 'confirmed', 'deaths', 'recovered', 'active'],
]
fact_covid_2 = result['states_daily'].loc[
    :,
    ['fips', 'date', 'positive', 'negative', 'hospitalizedcurrently', 'hospitalized', 'hospitalizeddischarged'],
]
fact_covid = fact_covid_1.merge(fact_covid_2, how='inner', on='fips')

In [None]:
# Preview the first rows of the merged fact table.
fact_covid.head()

In [None]:
# (rows, columns) of the merged fact table.
fact_covid.shape

In [None]:
# Region dimension: location columns of "enigma_jhud" inner-joined with the
# county/state columns of "us_county" on the shared 'fips' column.
dim_region_1 = result['enigma_jhud'].loc[
    :,
    ['fips', 'province_state', 'country_region', 'latitude', 'longitude'],
]
dim_region_2 = result['us_county'].loc[:, ['fips', 'county', 'state']]
dim_region = dim_region_1.merge(dim_region_2, how='inner', on='fips')

In [None]:
# Preview the first rows of the region dimension.
dim_region.head()

In [None]:
# (rows, columns) of the region dimension.
dim_region.shape

In [None]:
# Hospital dimension: a column subset of "rearc_usa_hospital_beds".
# NOTE(review): 'longtitude' is selected exactly as spelled here; presumably
# that is the column name in the source table — confirm before renaming.
hospital_columns = [
    'fips',
    'state_name',
    'latitude',
    'longtitude',
    'hq_address',
    'hospital_name',
    'hospital_type',
    'hq_city',
    'hq_state',
]
dim_hospital = result['rearc_usa_hospital_beds'][hospital_columns]

In [None]:
# Preview the first rows of the hospital dimension.
dim_hospital.head()

In [None]:
# Date dimension seed: the 'fips' and 'date' columns of "states_daily".
# .copy() makes this an independent frame, so the column assignments in later
# cells do not write into a selection of result['states_daily'] (which is
# what triggers pandas' SettingWithCopyWarning).
dim_date = result['states_daily'][['fips', 'date']].copy()

In [None]:
# Preview the date dimension before the 'date' column is parsed.
dim_date.head()

In [None]:
# (rows, columns) of the date dimension.
dim_date.shape

In [None]:
# Parse the YYYYMMDD-formatted 'date' column into datetimes. assign() builds a
# new frame instead of writing into dim_date, which was created by selecting
# columns from another frame (item assignment on it raises
# SettingWithCopyWarning).
dim_date = dim_date.assign(
    date=pd.to_datetime(dim_date['date'], format='%Y%m%d')
)

In [None]:
# Preview the date dimension after converting 'date' with pd.to_datetime.
dim_date.head()

In [None]:
# Derive calendar attributes from the parsed 'date' column. assign() returns a
# new frame with the three columns appended in this order, avoiding item
# assignment on a frame that originated as a selection of another frame.
# day_of_week follows pandas' convention: Monday=0 ... Sunday=6.
dim_date = dim_date.assign(
    year=dim_date['date'].dt.year,
    month=dim_date['date'].dt.month,
    day_of_week=dim_date['date'].dt.dayofweek,
)

In [None]:
# Preview the date dimension with the year, month and day_of_week columns.
dim_date.head()

In [None]:
# S3 resource used to upload the output tables. Unlike the Athena client, no
# region_name is passed here, so boto3's default region resolution applies.
s3_resource = boto3.resource(
    service_name='s3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)

In [None]:
# Destination bucket for the exported CSV files (hard-coded here rather than
# read from aws.conf like the other S3 settings).
bucket = "aman-covid-19-output"

In [None]:
# Serialise the fact table to CSV in memory, then upload the text to
# output/fact_covid.csv in the destination bucket.
csv_buffer = StringIO()
fact_covid.to_csv(path_or_buf=csv_buffer)
fact_covid_csv = csv_buffer.getvalue()
s3_resource.Object(bucket, 'output/fact_covid.csv').put(Body=fact_covid_csv)

In [None]:
# Use a fresh buffer for this table: the previously used StringIO still holds
# the CSV text written by earlier cells, so reusing it would upload that
# content followed by dim_region in a single file.
csv_buffer = StringIO()
dim_region.to_csv(csv_buffer)
s3_resource.Object(bucket, 'output/dim_region.csv').put(Body=csv_buffer.getvalue())

In [None]:
# Use a fresh buffer for this table: the previously used StringIO still holds
# the CSV text written by earlier cells, so reusing it would upload that
# content followed by dim_hospital in a single file.
csv_buffer = StringIO()
dim_hospital.to_csv(csv_buffer)
s3_resource.Object(bucket, 'output/dim_hospital.csv').put(Body=csv_buffer.getvalue())

In [None]:
# Use a fresh buffer for this table: the previously used StringIO still holds
# the CSV text written by earlier cells, so reusing it would upload that
# content followed by dim_date in a single file.
csv_buffer = StringIO()
dim_date.to_csv(csv_buffer)
s3_resource.Object(bucket, 'output/dim_date.csv').put(Body=csv_buffer.getvalue())