# Collection Success Rate Analysis

Analyze the collection success rate of the automated collection framework by reading the files in an S3 bucket. Overall, the success rate is above 90%.
<br>
Last Run: 6/4/2024

In [2]:
import pandas as pd
import boto3
from tqdm import tqdm

In [32]:
# for domestic sites
# Total number of domestic sites; this is the denominator when a
# collection rate is computed for each data point below.
total_domestic_sites = 45

def list_all_objects(bucket, prefix):
    """Return every "folder" (common prefix) directly under ``prefix`` in an S3 bucket.

    The listing uses ``Delimiter="/"``, so keys are grouped by their next
    path segment and only those groups are returned, not individual objects.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix to list under (e.g. ``"current_data/"``).

    Returns:
        A list of dicts as returned by S3 in ``CommonPrefixes``, each with a
        ``"Prefix"`` key. The list is empty when nothing matches.
    """
    s3 = boto3.client('s3')

    # The paginator follows continuation tokens for us, so listings larger
    # than one page are handled without manual token bookkeeping.
    paginator = s3.get_paginator('list_objects_v2')

    common_prefixes = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
        # 'CommonPrefixes' is absent from a page that has no grouped keys.
        common_prefixes.extend(page.get('CommonPrefixes', []))

    return common_prefixes

def update_collection_rate_stats(local_stats_path, s3_bucket, prefix, total_sites):
    """Add collection rates for not-yet-recorded S3 folders to the local stats CSV.

    Each folder directly under ``prefix`` is treated as one data point whose
    identifier is the last path segment of the folder name. For every data
    point that is not already present in the CSV's ``time`` column, the number
    of objects under ``<folder>html/`` is divided by ``total_sites`` to get
    the collection rate.

    Args:
        local_stats_path: Path of the CSV holding ``time`` and
            ``collection_rate`` columns. It is read and then overwritten.
        s3_bucket: Name of the S3 bucket that holds the collected data.
        prefix: Key prefix whose immediate sub-folders are the data points.
        total_sites: Number of sites expected per data point (the denominator).

    Returns:
        A DataFrame with the previously saved rows followed by the new ones.
    """
    # read existing data; keep `time` as text so it compares equal to folder names
    df = pd.read_csv(local_stats_path, dtype={'time': str})

    # a set gives constant-time "already collected?" checks inside the loop
    collected_time = set(df["time"])

    # get new data
    folders = list_all_objects(s3_bucket, prefix)
    new_data = []

    s3 = boto3.client('s3')
    for folder in tqdm(folders):
        folder_name = folder["Prefix"]
        # folder names end with "/", so the data point id is the second-to-last piece
        timestamp = folder_name.split("/")[-2]

        # check if datapoint is collected
        if timestamp in collected_time:
            continue

        # not collected
        datapoint_res = s3.list_objects_v2(Bucket=s3_bucket, Prefix=folder_name + "html/")
        # S3 omits "Contents" when nothing matches; count that as zero pages
        # instead of failing with KeyError.
        # NOTE(review): a single list_objects_v2 call returns at most 1000 keys;
        # fine while total_sites stays far below that.
        collected_pages = len(datapoint_res.get("Contents", []))
        # use the caller-supplied total rather than the notebook-level constant
        collection_rate = collected_pages / total_sites
        new_data.append({"time": str(timestamp), "collection_rate": collection_rate})

    if new_data:
        new_df = pd.concat([df, pd.DataFrame(new_data)], ignore_index=True)
    else:
        # nothing new: skip concatenating an empty frame
        new_df = df
    new_df.to_csv(local_stats_path, index=False)  # update saved data

    return new_df

In [33]:
# Refresh the domestic stats CSV with any folders under "current_data/" that
# are not recorded yet; `t` holds the resulting stats DataFrame.
t = update_collection_rate_stats("./data/collection_rate/collection_rate_domestic.csv", "news-collection-2024-3371", "current_data/", total_domestic_sites)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1868/1868 [00:24<00:00, 74.74it/s]


In [34]:
# mean collection success rate over every row of the stats frame
# NOTE(review): the earlier note scoped this to Jan and Feb, but no date
# filter is applied here — confirm the intended window.
t["collection_rate"].mean()

0.9765762550559124