In [2]:
import html
import json
import os
from urllib.parse import quote
from urllib.parse import urljoin

import boto3
import pandas as pd
import requests
from bs4 import BeautifulSoup

## User Inputs

In [None]:
# --- AWS S3 target -----------------------------------------------------------
S3_BUCKET_NAME = "rearc-quest-data-bhagath"
AWS_REGION = "ap-south-1"

# --- Datasets ----------------------------------------------------------------
# Local working directory; per-dataset sub-folders are created beneath it.
DATA_DIR = "data"

# Dataset 1: directory listing that is scraped for file links.
DATASET1_URL = "https://download.bls.gov/pub/time.series/pr"
# Header set sent with every Dataset 1 request.
DATASET1_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:141.0) "
        "Gecko/20100101 Firefox/141.0"
    ),
}

# Dataset 2: JSON API endpoint and the file name used for its local/S3 copy.
DATASET2_URL = (
    "https://honolulu-api.datausa.io/tesseract/data.jsonrecords"
    "?cube=acs_yg_total_population_1"
    "&drilldowns=Year%2CNation"
    "&locale=en"
    "&measures=Population"
)
DATASET2_JSON_FILE = "usa_population.json"

## Helper functions

In [4]:
def create_s3_bucket(bucket_name):
    """
    Create an S3 bucket in ``AWS_REGION`` unless the account already owns it.

    Args:
        bucket_name: Name of the bucket to look up and, if absent, create.

    Prints whether the bucket already existed or is being created.
    """
    # One region-bound client is used for both the lookup and the creation.
    s3 = boto3.client('s3', region_name=AWS_REGION)
    owned_buckets = s3.list_buckets().get("Buckets", [])
    bucket_exists = any(bucket["Name"] == bucket_name for bucket in owned_buckets)
    if bucket_exists:
        print(f"Bucket {bucket_name} already exists")
        return

    print(f"Creating bucket {bucket_name}")
    if AWS_REGION == "us-east-1":
        # us-east-1 is the S3 default region; it must not be passed as a
        # LocationConstraint or the request is rejected.
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': AWS_REGION},
        )

In [5]:
def get_dataset1_info():
    """
    Retrieve the file-name -> download-URL mapping for the first dataset.

    Fetches the listing page at ``DATASET1_URL`` and collects every anchor
    whose ``href`` does not end with ``/``.

    Returns:
        dict: last path segment of each href mapped to its absolute URL.

    Raises:
        requests.HTTPError: if the listing request returns an error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(DATASET1_URL, headers=DATASET1_HEADERS, timeout=30)
    response.raise_for_status()

    # urljoin() replaces the last path segment of a base that has no trailing
    # slash, so relative hrefs would lose the dataset directory without this.
    base_url = DATASET1_URL if DATASET1_URL.endswith('/') else DATASET1_URL + '/'

    soup = BeautifulSoup(response.text, 'html.parser')
    files_dict = {}
    for a in soup.find_all('a'):
        href = a.get('href')
        if href and not href.endswith('/'):  # skip directories & parent links
            filename = href.split('/')[-1]
            files_dict[filename] = urljoin(base_url, href)
    return files_dict

In [6]:
def get_dataset2_json():
    """
    Download the second dataset and store it as a local JSON file.

    The response body of ``DATASET2_URL`` is parsed as JSON and written,
    indented, to ``DATA_DIR/dataset2/DATASET2_JSON_FILE`` (the directory is
    created when missing).

    Raises:
        requests.HTTPError: if the API request returns an error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(DATASET2_URL, timeout=30)
    response.raise_for_status()
    data = response.json()
    download_dir = os.path.join(DATA_DIR, "dataset2")
    os.makedirs(download_dir, exist_ok=True)
    with open(os.path.join(download_dir, DATASET2_JSON_FILE), 'w') as f:
        json.dump(data, f, indent=4)
    print(f"Dataset 2 JSON saved to {DATASET2_JSON_FILE}")

In [22]:
def get_s3_objects(bucket_name,s3_prefix=""):
    """
    List the object keys stored in an S3 bucket under a prefix.

    Args:
        bucket_name: Bucket to list.
        s3_prefix: Key prefix to list under; it is stripped from the start of
            every returned key. Defaults to "" (whole bucket, keys unchanged).

    Returns:
        list[str]: keys relative to ``s3_prefix``; keys ending in ``/``
        (folder markers) are omitted.
    """
    s3 = boto3.client('s3',region_name=AWS_REGION)
    objects = []

    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix):
        for item in page.get('Contents', []):
            key = item['Key']
            if key.endswith('/'):  # skip folder marker keys
                continue
            if key.startswith(s3_prefix):
                # Slice instead of str.split(): split("") raises ValueError for
                # the default empty prefix, and split()[1] returns the wrong
                # part when the prefix text occurs again later in the key.
                key = key[len(s3_prefix):]
            objects.append(key)

    return objects

In [23]:
def _stream_to_file(file_url, filepath):
    """Stream ``file_url`` into ``filepath`` via a temporary ``.part`` sibling."""
    partial_path = filepath + ".part"
    with requests.get(file_url, stream=True, headers=DATASET1_HEADERS, timeout=60) as file_resp:
        file_resp.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in file_resp.iter_content(chunk_size=8192):
                f.write(chunk)
    # Rename only after a complete download so an interrupted run never leaves
    # a truncated file that the os.path.exists() check would accept next time.
    os.replace(partial_path, filepath)


def upload_files_to_s3(files_info, existing_s3_files, s3_prefix=""):
    """
    Upload Dataset 1 files that are missing from the S3 bucket.

    Args:
        files_info: Mapping of file name -> download URL. Entries whose URL is
            ``None`` are skipped (they are not part of Dataset 1).
        existing_s3_files: File names already present in the bucket under
            ``s3_prefix``; these are not uploaded again.
        s3_prefix: Key prefix prepended to each file name in the bucket.

    Files named in ``files_to_download`` are also stored under
    ``DATA_DIR/dataset1`` when no local copy exists yet.

    Raises:
        requests.HTTPError: if a download returns an error status.
    """
    s3 = boto3.client('s3',region_name=AWS_REGION)
    files_to_download = [ "pr.data.0.Current" ]

    download_dir = os.path.join(DATA_DIR, "dataset1")
    os.makedirs(download_dir, exist_ok=True)

    for filename, file_url in files_info.items():
        #skip filenames with no urls to download, usually these are not part of Dataset1
        if file_url is None:
            continue

        s3_key = f"{s3_prefix}{filename}"
        filepath = os.path.join(download_dir, filename)
        needs_upload = filename not in existing_s3_files
        needs_local_copy = filename in files_to_download and not os.path.exists(filepath)

        # Download first: a streamed response body can only be read once, so
        # the local copy must not be taken from a response whose stream the
        # S3 upload has already consumed (that produced an empty local file).
        if needs_local_copy:
            print(f"Downloading {filename} to local directory...")
            _stream_to_file(file_url, filepath)

        if not needs_upload:
            print(f"Skipping {filename}, already in S3.")
            continue

        print(f"Uploading {filename} to S3...")
        if needs_local_copy:
            # Reuse the fresh local copy instead of downloading a second time.
            s3.upload_file(filepath, S3_BUCKET_NAME, s3_key)
        else:
            with requests.get(file_url, stream=True, headers=DATASET1_HEADERS, timeout=60) as file_resp:
                file_resp.raise_for_status()
                # Have the raw stream undo any Content-Encoding (e.g. gzip) so
                # the stored object holds the actual file bytes.
                file_resp.raw.decode_content = True
                s3.upload_fileobj(file_resp.raw, S3_BUCKET_NAME, s3_key)

In [24]:
def remove_files_from_s3(files_info, existing_s3_files,s3_prefix=""):
    """
    Delete objects from the S3 bucket that are no longer part of Dataset 1.

    Args:
        files_info: Mapping whose keys are the file names that must be kept.
        existing_s3_files: File names currently in the bucket under ``s3_prefix``.
        s3_prefix: Key prefix prepended to each file name to form the S3 key.

    Prints how many objects were deleted and one line per key that S3
    reported it could not delete.
    """
    files_to_delete = sorted(set(existing_s3_files) - set(files_info.keys()))
    if not files_to_delete:
        print("No files to delete.")
        return

    s3 = boto3.client('s3',region_name=AWS_REGION)
    print("Deleting obsolete files from S3...")
    delete_objects = [{"Key": f"{s3_prefix}{f}"} for f in files_to_delete]

    # A single DeleteObjects request accepts at most 1000 keys, so send batches.
    batch_size = 1000
    failures = []
    for start in range(0, len(delete_objects), batch_size):
        batch = delete_objects[start:start + batch_size]
        response = s3.delete_objects(Bucket=S3_BUCKET_NAME, Delete={"Objects": batch})
        # Per-key failures come back in the response body rather than as an exception.
        failures.extend(response.get("Errors", []))

    print(f"Deleted {len(delete_objects) - len(failures)} files.")
    for failure in failures:
        print(f"Failed to delete {failure.get('Key')}: {failure.get('Message')}")

In [None]:
def generate_index_html(prefix):
    """
    Build an index.html listing the objects under ``prefix`` and upload it.

    Args:
        prefix: S3 key prefix whose objects are listed; the page is stored at
            ``prefix + "index.html"`` in ``S3_BUCKET_NAME``.

    Nothing is uploaded when no object exists under the prefix. Keys ending
    in ``/`` (folder markers) are left out of the listing.
    """
    s3 = boto3.client("s3", region_name=AWS_REGION)

    # Paginate: a single list_objects_v2 call returns only the first page of keys.
    keys = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=prefix):
        keys.extend(obj["Key"] for obj in page.get("Contents", []))
    if not keys:
        print("No files found in S3 folder.")
        return

    html_lines = [
        "<!DOCTYPE html>",
        "<html><head><title>Dataset Files</title></head><body>",
        f"<h2>Files in {html.escape(prefix)}</h2>",
        "<ul>"
    ]

    for key in keys:
        if key.endswith("/"):  # skip "folder markers"
            continue
        filename = key.split("/")[-1]
        # Percent-encode the key for the URL and HTML-escape everything that is
        # written into the markup; key names are not under this notebook's control.
        file_url = f"https://{S3_BUCKET_NAME}.s3.{AWS_REGION}.amazonaws.com/{quote(key)}"
        link_href = html.escape(file_url)
        link_text = html.escape(filename)
        html_lines.append(f'<li><a href="{link_href}">{link_text}</a></li>')

    html_lines.append("</ul></body></html>")
    html_content = "\n".join(html_lines)

    # upload index.html into the folder
    index_key = prefix + "index.html"
    s3.put_object(
        Bucket=S3_BUCKET_NAME,
        Key=index_key,
        Body=html_content,
        ContentType="text/html"
    )

    print(f"index.html uploaded to s3://{S3_BUCKET_NAME}/{index_key}")
    print(f"Access it via: https://{S3_BUCKET_NAME}.s3.{AWS_REGION}.amazonaws.com/{index_key}")

In [26]:
def sync_bucket(s3_prefix="dataset/"):
    """
    Synchronize the S3 bucket with Dataset 1, then publish Dataset 2.

    Steps, in order: list the source files and the bucket contents, upload
    what is missing, delete what is obsolete, download the Dataset 2 JSON and
    upload it, and finally regenerate index.html under ``s3_prefix``.
    """
    source_files = get_dataset1_info()
    bucket_files = get_s3_objects(S3_BUCKET_NAME, s3_prefix)

    # Entries with a None URL are ignored by the uploader; listing them here
    # keeps these two generated objects out of the set of files to delete.
    source_files.update({"index.html": None, DATASET2_JSON_FILE: None})

    for sync_step in (upload_files_to_s3, remove_files_from_s3):
        sync_step(source_files, bucket_files, s3_prefix)

    print("Uploading Dataset 2 JSON to S3...")
    get_dataset2_json()
    local_json = os.path.join(DATA_DIR, "dataset2", DATASET2_JSON_FILE)
    json_key = f"{s3_prefix}{DATASET2_JSON_FILE}"
    boto3.client('s3', region_name=AWS_REGION).upload_file(local_json, S3_BUCKET_NAME, json_key)
    print(f"Uploaded {DATASET2_JSON_FILE} to S3 bucket {S3_BUCKET_NAME}")

    print("S3 Sync complete.")
    generate_index_html(s3_prefix)

## Data sourcing

In [12]:
# Make sure the target bucket exists before any sync work runs.
create_s3_bucket(S3_BUCKET_NAME)

Bucket rearc-quest-data-bhagath already exists


In [27]:
# Mirror Dataset 1 into the bucket, upload the Dataset 2 JSON and rebuild
# index.html, all under the default "dataset/" prefix.
sync_bucket()

Skipping pr.class, already in S3.
Skipping pr.contacts, already in S3.
Skipping pr.data.0.Current, already in S3.
Skipping pr.data.1.AllData, already in S3.
Skipping pr.duration, already in S3.
Skipping pr.footnote, already in S3.
Skipping pr.measure, already in S3.
Skipping pr.period, already in S3.
Skipping pr.seasonal, already in S3.
Skipping pr.sector, already in S3.
Skipping pr.series, already in S3.
Skipping pr.txt, already in S3.
No files to delete.
Uploading Dataset 2 JSON to S3...
Dataset 2 JSON saved to usa_population.json
Uploaded usa_population.json to S3 bucket rearc-quest-data-bhagath
S3 Sync complete.
index.html uploaded to s3://rearc-quest-data-bhagath/dataset/index.html
Access it via: https://rearc-quest-data-bhagath.s3.ap-south-1.amazonaws.com/dataset/index.html


## Data analytics

In [None]:
# Dataset 1: tab-separated file downloaded by the sync step. Header names and
# string cells are stripped of surrounding whitespace.
dataset1_path = os.path.join(DATA_DIR, "dataset1", "pr.data.0.Current")
df1 = pd.read_csv(dataset1_path, sep="\t")
df1.columns = df1.columns.str.strip()
df1 = df1.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Dataset 2: JSON saved by get_dataset2_json(); the records live under "data".
# A context manager closes the file handle, which json.load(open(...)) left open.
with open(os.path.join(DATA_DIR, "dataset2", DATASET2_JSON_FILE)) as ds2_file:
    ds2_json = json.load(ds2_file)
df2 = pd.json_normalize(ds2_json['data'])

In [30]:
# Report fragments are collected here by the analytics cells and joined into
# one HTML document at the end.
html_lines = [
    "<!DOCTYPE html>",
    "<html><head><title>Data Analytics</title></head><body>",
    "<h2>Data Analytics</h2>",
]

In [31]:
# Population figures for the years 2013 through 2018 inclusive.
pop_df = df2[(df2['Year']>=2013) & (df2['Year']<=2018)]['Population']

# Compute each statistic once and reuse it for both the report and the console.
pop_mean = pop_df.mean()
pop_std = pop_df.std()

# The heading previously read "2918"; the filter above ends at 2018.
stats_heading = "Population Data statistics from 2013 to 2018"
html_lines.append(f"<h3>{stats_heading}</h3>")
html_lines.append(f"<p>Mean: {pop_mean}</p>")
html_lines.append(f"<p>Standard Deviation: {pop_std}</p>")
print(stats_heading)
print(f"Mean: {pop_mean}")
print(f"Standard Deviation: {pop_std}")

Population Data statistics from 2013 to 2918
Mean: 322069808.0
Standard Deviation: 4158441.040908095


In [None]:
# Total of `value` for every (series_id, year) pair.
df1_by_sid_yr = df1.groupby(['series_id', 'year'], as_index=False)['value'].sum()

# For each series_id keep the row whose yearly total is the largest.
best_row_labels = df1_by_sid_yr.groupby("series_id")["value"].idxmax()
df1_by_best_yr = df1_by_sid_yr.loc[best_row_labels].reset_index(drop=True)

html_lines.extend([
    "<h3>Best year info per series_id</h3>",
    df1_by_best_yr.to_html(index=False),
])
df1

Unnamed: 0,series_id,year,period,value,footnote_codes
0,PRS30006011,1995,Q01,2.600,
1,PRS30006011,1995,Q02,2.100,
2,PRS30006011,1995,Q03,0.900,
3,PRS30006011,1995,Q04,0.100,
4,PRS30006011,1995,Q05,1.400,
...,...,...,...,...,...
37177,PRS88003203,2024,Q02,116.544,
37178,PRS88003203,2024,Q03,116.593,
37179,PRS88003203,2024,Q04,116.682,
37180,PRS88003203,2024,Q05,116.686,


In [45]:
# Open question: the requirement does not state which year to use. 2018 is
# taken from the example/expected output. The three values below are kept
# configurable because it is unclear whether the series_id and period given
# in the README are only illustrative or are meant to be the filter applied
# to Dataset 1 (df1).
series_id = 'PRS30006032'
period = 'Q01'
year = 2018

# Rows of Dataset 1 matching all three selectors.
row_matches = (
    (df1['series_id'] == series_id)
    & (df1['period'] == period)
    & (df1['year'] == year)
)
filtered_df1 = df1[row_matches]

# Attach the population of the same year from Dataset 2.
report_columns = ['series_id', 'year', 'period', 'value', 'Population']
merged_df = filtered_df1.merge(
    df2,
    how='inner',
    left_on=['year'],
    right_on=['Year'],
)[report_columns]

html_lines.append(f"<h3>Population details for series_id: {series_id}, period: {period}, year: {year} </h3>")
html_lines.append(merged_df.to_html(index=False))
merged_df

Unnamed: 0,series_id,year,period,value,Population
0,PRS30006032,2018,Q01,0.5,327167439.0


In [None]:
# Close the report markup and flatten the collected fragments into one string.
# Note: re-running this cell appends the closing tags again.
html_lines.append("</body></html>")
html_content = "\n".join(html_lines)

In [48]:
# Persist the assembled report under DATA_DIR/reports (created when missing).
report_dir = os.path.join(DATA_DIR, "reports")
os.makedirs(report_dir, exist_ok=True)
html_path = os.path.join(report_dir, "report.html")
# An explicit encoding keeps the written bytes independent of the platform's
# default text encoding.
with open(html_path, "w", encoding="utf-8") as html_file:
    html_file.write(html_content)
print(f"Report saved at {html_path}")

Report saved at data/reports/report.html
