In [None]:
import boto3

# S3 bucket used by the rest of the notebook; it must be set before running.
universe_bucket = "ajayvohra-phrase-piece-pdx-1"
assert universe_bucket, "universe bucket is required"

# Derive the working region from the bucket so the regional clients created
# below (and in later cells) target the same region as the data.
s3_client = boto3.client("s3")
response = s3_client.get_bucket_location(Bucket=universe_bucket)
aws_region = response["LocationConstraint"]
# S3 reports us-east-1 as a null constraint and legacy eu-west-1 buckets as
# "EU"; neither value is usable as a boto3 region name, so normalize both.
if aws_region is None:
    aws_region = "us-east-1"
elif aws_region == "EU":
    aws_region = "eu-west-1"
print(f"AWS Region: {aws_region}")
lambda_client = boto3.client("lambda", region_name=aws_region)

# The account id is needed to build the Lambda function ARNs in later cells.
sts = boto3.client("sts")
aws_account_id = sts.get_caller_identity()["Account"]
print(f"AWS account id: {aws_account_id}")


In [None]:
from common import get_lamabda_function_arn

# ARN prefix of the create-corpus Lambda function in this account and region.
# get_lamabda_function_arn (from common) presumably resolves it to the full
# ARN of the deployed function -- confirm against common.py.
partial_arn = f"arn:aws:lambda:{aws_region}:{aws_account_id}:function:phrase-piece-CreateCorpusFunction"
create_corpus_arn = get_lamabda_function_arn(
    lambda_client=lambda_client, aws_region=aws_region, partial_arn=partial_arn
)

# Stop here if no ARN came back; later cells invoke this function.
assert create_corpus_arn, "Create corpus function ARN is required"
create_corpus_arn


In [None]:
from common import s3_bucket_keys

# Build one s3:// URI per key that s3_bucket_keys returns for the SemEval-2017
# documents prefix of the universe bucket.
bucket_prefix = "midas/semeval2017/documents/"
s3_uris = [
    f"s3://{universe_bucket}/{key}"
    for key in s3_bucket_keys(
        s3_client=s3_client,
        bucket_name=universe_bucket,
        bucket_prefix=bucket_prefix,
    )
]


In [None]:
import re
import time
from common import create_corpus

# One (name, response['CorpusStateMachine'], response['CorpusId'], s3_uri)
# tuple is recorded per submitted corpus; later cells wait on these entries.
sms = []
count = 0
max_count = 1000

# Optional allow-list of corpus names; when empty, every matching URI is used.
filter_names = []
for s3_uri in s3_uris:
    # Only URIs containing an ".../id=<word>/..." path segment become corpora.
    m = re.match(r".+\/id=(\w+)\/.+", s3_uri)
    if m is not None:
        name = f"semeval2017-{m[1]}"
        if not filter_names or name in filter_names:
            response = create_corpus(
                lambda_client=lambda_client,
                create_corpus_arn=create_corpus_arn,
                name=name,
                s3_uri=s3_uri,
            )
            sms.append((name, response['CorpusStateMachine'], response['CorpusId'], s3_uri))
            count += 1
            # Short pause between consecutive create_corpus calls.
            time.sleep(2)
    # Stop once max_count corpora have been submitted.
    if count >= max_count:
        break
print(f"Create Corpus State Machines running count: {len(sms)}")

In [None]:
# ARN prefix of the delete-corpus Lambda function in this account and region.
# get_lamabda_function_arn (from common) presumably resolves it to the full
# ARN of the deployed function -- confirm against common.py.
partial_arn = f"arn:aws:lambda:{aws_region}:{aws_account_id}:function:phrase-piece-DeleteCorpusFunction"
delete_corpus_arn = get_lamabda_function_arn(
    lambda_client=lambda_client, aws_region=aws_region, partial_arn=partial_arn
)

# Stop here if no ARN came back; the cleanup cells below invoke this function.
assert delete_corpus_arn, "Delete corpus function ARN is required"
delete_corpus_arn

In [None]:
from common import delete_corpus, wait_for_sfn_sm
    
sfn_client = boto3.client('stepfunctions', region_name=aws_region)

# Wait for each submitted corpus in turn. Any corpus whose final status is not
# "SUCCEEDED" is deleted and remembered as a (name, s3_uri) pair for a retry.
create_corpus_failed = []
for name, sm, corpus_id, s3_uri in sms:
    status = wait_for_sfn_sm(sfn_client=sfn_client, sm_execution_arn=sm)
    if status == "SUCCEEDED":
        print(f"{s3_uri}: {status}")
        continue

    delete_corpus(
        lambda_client=lambda_client,
        delete_corpus_arn=delete_corpus_arn,
        corpus_id=corpus_id,
    )
    create_corpus_failed.append((name, s3_uri))
    print(f"{s3_uri}: {status}")

In [None]:
import pickle

# Pickle file that holds the list of failed (name, s3_uri) pairs.
filename = 'semeval2017-create_corpus_failed.pkl'

# Serialize the current failure list and write it out in one go.
with open(filename, 'wb') as file:
    file.write(pickle.dumps(create_corpus_failed))

In [None]:

# Retry corpus creation for every failed (name, s3_uri) pair until none remain.
# Each round reloads the failure list from the pickle file, resubmits every
# entry, waits for the results, and writes the still-failing entries back.
# NOTE(review): the number of rounds is not capped; a corpus that never
# succeeds keeps this loop running.
while True:
    with open(filename, 'rb') as file:
        create_corpus_failed = pickle.load(file)
    if len(create_corpus_failed) == 0:
        break

    print(f"retrying failed: {create_corpus_failed}")
    sms.clear()
    for name, s3_uri in create_corpus_failed:
        response = create_corpus(
            lambda_client=lambda_client,
            create_corpus_arn=create_corpus_arn,
            name=name, s3_uri=s3_uri)
        sms.append( (name, response['CorpusStateMachine'],  response['CorpusId'], s3_uri) )
        # Retries are spaced a full minute apart.
        time.sleep(60)

    create_corpus_failed.clear()
    for name, sm, corpus_id, s3_uri in sms:
        status = wait_for_sfn_sm(
            sfn_client=sfn_client,
            sm_execution_arn=sm)
        if status != "SUCCEEDED":
            delete_corpus(lambda_client=lambda_client,
                      delete_corpus_arn=delete_corpus_arn,
                      corpus_id=corpus_id)
            # list.append takes exactly one argument, so the pair must be
            # stored as a single tuple; this also matches the (name, s3_uri)
            # unpacking at the top of the next round.
            create_corpus_failed.append((name, s3_uri))
        print(f"{s3_uri}: {status}")

    with open(filename, 'wb') as file:
        pickle.dump(create_corpus_failed, file)

In [None]:
# Key prefix for the keyphrases tagged with a corpus name.
# NOTE(review): `name` is not assigned in this cell; it is whatever value an
# earlier cell's loop last bound to it, so the result depends on which cells
# ran before this one.
bucket_prefix = f"keyphrases/tag={name}/"




In [None]:
import json
# Key prefix under midas/semeval2017/keyphrases/ for a single id.
# NOTE(review): `name` is not assigned in this cell; it is whatever value an
# earlier cell's loop last bound to it (those loops produce names of the form
# "semeval2017-<id>") -- confirm that is the intended value for "id=".
bucket_prefix = f"midas/semeval2017/keyphrases/id={name}/"


In [None]:
# Name of the DynamoDB table to scan; its items are expected to carry a
# 'corpus_name' attribute, which a later cell reads. It must be filled in
# before running: an empty name can never be scanned, so fail early with a
# clear message.
corpus_table_name = ''
assert corpus_table_name, "corpus table name is required"

# Use the same region as the other regional clients in this notebook.
# NOTE(review): assumes the table lives in `aws_region` -- confirm.
dynamodb = boto3.resource('dynamodb', region_name=aws_region)
table = dynamodb.Table(corpus_table_name)

# A single scan call returns one page; keep following LastEvaluatedKey until
# the whole table has been read.
response = table.scan()
corpus_data = response['Items']

while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    corpus_data.extend(response['Items'])

In [None]:
import json

# Keep only the corpus names that belong to the SemEval-2017 run, in the
# order the scan returned them.
prefix = "semeval2017-"
semeval2017_corpora = [
    corpus_name
    for corpus_name in (item['corpus_name'] for item in corpus_data)
    if corpus_name.startswith(prefix)
]

semeval2017_corpora

In [None]:
!pip install rouge-score

In [None]:
from rouge_score import rouge_scorer
from common import get_candidates, get_references

def _mean_triple(triples):
    """Return the component-wise mean of 3-tuples, or (0.0, 0.0, 0.0) if empty."""
    if not triples:
        return 0.0, 0.0, 0.0
    count = len(triples)
    return (
        sum(t[0] for t in triples) / count,
        sum(t[1] for t in triples) / count,
        sum(t[2] for t in triples) / count,
    )


def f_score():
    """Compute ROUGE-L precision, recall and F-measure averaged over corpora.

    Reads the notebook-level ``semeval2017_corpora`` list. For each corpus
    name, the references are fetched with ``get_references`` (keyed by the
    part of the name after the last ``-``) and the candidates with
    ``get_candidates``. Candidates are ranked by their second element,
    highest first, and only as many as there are references are kept. Each
    kept candidate is scored against every reference and contributes its
    best ROUGE-L score (highest F-measure). Scores are averaged per corpus,
    then across corpora.

    Edge cases:
        * A corpus with no references is skipped, since there is nothing to
          score against.
        * A corpus with references but no candidates contributes
          (0.0, 0.0, 0.0).
        * If no corpus can be scored, (0.0, 0.0, 0.0) is returned instead of
          raising ZeroDivisionError.

    Returns:
        tuple: (precision, recall, fmeasure) as floats.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    all_list = []

    for name in semeval2017_corpora:
        references = get_references(name.rsplit('-', 1)[1])
        if len(references) == 0:
            continue

        # sorted() ranks a copy, leaving the list returned by get_candidates
        # unmodified.
        candidates = sorted(get_candidates(name), key=lambda x: x[1], reverse=True)
        candidates = candidates[0:len(references)]

        d_list = []
        for candidate, _ in candidates:
            # Best-matching reference for this candidate; on ties max() keeps
            # the first one, as the previous sort-and-take-first did.
            best = max(
                (scorer.score(reference, candidate)['rougeL'] for reference in references),
                key=lambda s: s.fmeasure,
            )
            d_list.append((best.precision, best.recall, best.fmeasure))

        all_list.append(_mean_triple(d_list))

    return _mean_triple(all_list)


# Run the evaluation and show precision, recall and F-measure in that order.
p, r, f = f_score()
print(p, r, f)