In [None]:
import boto3
# Shared boto3 clients used throughout the notebook: `lambda_client` invokes the
# corpus Lambda functions, `s3_client` lists and downloads objects from S3.
lambda_client = boto3.client("lambda")
s3_client = boto3.client("s3")

In [None]:
import json

# Name or ARN of the Lambda function invoked by create_corpus(). It is blank in
# this notebook and must be filled in before create_corpus() is called.
create_corpus_arn = ""

def create_corpus(name, s3_uri, sim_threshold="0.10"):
  """Invoke the create-corpus Lambda synchronously and return its decoded body.

  Args:
      name: Value sent as ``CorpusName``.
      s3_uri: Value sent as ``S3Uri``.
      sim_threshold: Value sent as ``SimThreshold``. Defaults to ``"0.10"``,
          the value that used to be hard-coded.

  Returns:
      The object decoded from the JSON string stored under ``body`` in the
      Lambda response payload. Callers in this notebook read the
      ``CorpusStateMachine`` and ``CorpusId`` keys from it.

  Raises:
      RuntimeError: If the invocation response carries ``FunctionError``,
          i.e. the Lambda itself failed. The raw payload is included in the
          message so the cause is visible.
  """
  request_body = {
    "CorpusName": name,
    "S3Uri": s3_uri,
    "SimThreshold": sim_threshold
  }

  # The request is wrapped as a JSON string under "body" (JSON inside JSON).
  payload = json.dumps({ "body": json.dumps(request_body) })

  response = lambda_client.invoke(
      FunctionName=create_corpus_arn,
      InvocationType='RequestResponse',
      Payload=payload
  )

  print(response)

  raw_payload = response['Payload'].read()

  # A failed function still returns a normal invoke response; without this check
  # the failure would only show up as an opaque KeyError on 'body' below.
  if response.get('FunctionError'):
    raise RuntimeError(
        f"create_corpus Lambda failed for {name!r} ({response['FunctionError']}): "
        f"{raw_payload.decode('utf-8', errors='replace')}"
    )

  json_obj = json.loads(raw_payload)
  data = json.loads(json_obj['body'])
  return data

In [None]:
import time

sfn_client = boto3.client('stepfunctions')


def wait_for_sfn_sm(sm_execution_arn):
    """Block until a Step Functions execution is no longer RUNNING.

    Polls ``describe_execution`` for ``sm_execution_arn`` and sleeps 15 seconds
    between polls.

    Returns:
        The first status observed that is not 'RUNNING'.
    """
    while True:
        description = sfn_client.describe_execution(executionArn=sm_execution_arn)
        status = description.get('status')
        if status != 'RUNNING':
            return status
        time.sleep(15)

In [None]:
def s3_bucket_keys(s3_client, bucket_name, bucket_prefix):
    """Yield every object key in a bucket that starts with the given prefix.

    Args:
        s3_client: Object exposing ``list_objects_v2(**kwargs)`` (a boto3 S3 client).
        bucket_name: Passed as ``Bucket``.
        bucket_prefix: Passed as ``Prefix``.

    Yields:
        The ``Key`` of each listed object, following ``NextContinuationToken``
        across pages. Yields nothing when no object matches the prefix.
    """
    kwargs = {'Bucket': bucket_name, 'Prefix': bucket_prefix}
    while True:
        resp = s3_client.list_objects_v2(**kwargs)

        # 'Contents' is absent from the response when the listing is empty,
        # so indexing it directly would raise KeyError.
        for obj in resp.get('Contents', []):
            yield obj['Key']

        token = resp.get('NextContinuationToken')
        if not token:
            break
        kwargs['ContinuationToken'] = token

In [None]:
# Name or ARN of the Lambda function invoked by delete_corpus(). It is blank in
# this notebook and must be filled in before delete_corpus() is called.
delete_corpus_arn = ""

def delete_corpus(corpus_id):
  """Invoke the Lambda named by ``delete_corpus_arn`` synchronously for one corpus.

  Args:
      corpus_id: Value sent as ``CorpusId``.

  The request payload and the raw invoke response are printed; nothing is
  returned.
  """
  # Serialise the inner body with json.dumps rather than string concatenation so
  # that quotes or backslashes in corpus_id cannot produce invalid JSON.
  payload = json.dumps({ "body": json.dumps({ "CorpusId": corpus_id }) })

  print(payload)
  response = lambda_client.invoke(
      FunctionName=delete_corpus_arn,
      InvocationType='RequestResponse',
      Payload=payload
  )

  print(response)

In [None]:
s3_uris = []

# TODO: set to the S3 bucket that holds the SemEval-2017 documents. An empty
# placeholder keeps the cell syntactically valid until the name is filled in.
universe_bucket = ""
bucket_prefix = "midas/semeval2017/documents/"

# Collect one s3:// URI per object found under the prefix.
for key in s3_bucket_keys(s3_client=s3_client, bucket_name=universe_bucket, bucket_prefix=bucket_prefix):
    s3_uris.append(f"s3://{universe_bucket}/{key}")


In [None]:
import re

# Start corpus creation for every document URI whose path has an `id=<...>` segment.
sms = []           # (name, state machine execution ARN, corpus id, s3 uri) per created corpus
filter_names = []  # when non-empty, only these corpus names are created
max_count = 1000   # stop once this many corpora have been started
count = 0

for s3_uri in s3_uris:
    m = re.match(r".+\/id=(\w+)\/.+", s3_uri)
    if m is not None:
        name = f"semeval2017-{m.group(1)}"
        if filter_names and name not in filter_names:
            continue
        response = create_corpus(name=name, s3_uri=s3_uri)
        sms.append((name, response['CorpusStateMachine'], response['CorpusId'], s3_uri))
        count = count + 1
        # Pause between invocations.
        time.sleep(2)
    if not count < max_count:
        break

print(f"Create Corpus State Machines running count: {len(sms)}")

In [None]:
# Wait for every started state machine; anything that does not end in SUCCEEDED
# is deleted and remembered as (name, s3_uri) for a later retry.
create_corpus_failed = []
for name, sm, corpus_id, s3_uri in sms:
    status = wait_for_sfn_sm(sm_execution_arn=sm)
    if status == "SUCCEEDED":
        continue
    delete_corpus(corpus_id=corpus_id)
    create_corpus_failed.append((name, s3_uri))

if len(create_corpus_failed) > 0:
    print(f"Create Corpus Failed: {create_corpus_failed}")

In [None]:

# Retry the corpora that failed. The number of rounds is capped so that a corpus
# which fails every time cannot keep this cell looping (and invoking the
# Lambda) forever.
max_retry_rounds = 10
retry_round = 0

while create_corpus_failed and retry_round < max_retry_rounds:
    retry_round += 1

    # Re-submit every failed corpus, spacing the invocations 60 seconds apart.
    sms.clear()
    for name, s3_uri in create_corpus_failed:
        response = create_corpus(name=name, s3_uri=s3_uri)
        sms.append( (name, response['CorpusStateMachine'],  response['CorpusId'], s3_uri) )
        time.sleep(60)

    # Wait for this round and rebuild the failure list from scratch.
    create_corpus_failed.clear()
    for name, sm, corpus_id, s3_uri in sms:
        status = wait_for_sfn_sm(sm_execution_arn=sm)
        if status != "SUCCEEDED":
            delete_corpus(corpus_id=corpus_id)
            create_corpus_failed.append((name, s3_uri))

    if create_corpus_failed:
        print(f"Create corpus Failed: {create_corpus_failed}")

if create_corpus_failed:
    print(f"Giving up after {retry_round} retry round(s); still failing: {create_corpus_failed}")

In [None]:
from tempfile import NamedTemporaryFile
import gzip
import json

corpora_bucket = ""

def get_candidates(name):
    """Collect the (keyphrase, phrase_piece) pairs stored for corpus ``name``.

    Every object under ``keyphrases/tag=<name>/`` in ``corpora_bucket`` is
    downloaded to a temporary file and read as gzip data holding one JSON
    object per line. A KeyError raised while listing or reading is printed,
    and whatever was gathered up to that point is kept.

    Returns:
        A list of ``(keyphrase, phrase_piece)`` tuples ordered by
        ``phrase_piece``, largest first.
    """
    prefix = f"keyphrases/tag={name}/"

    pairs = []
    try:
        object_keys = s3_bucket_keys(
            s3_client=s3_client, bucket_name=corpora_bucket, bucket_prefix=prefix
        )
        for key in object_keys:
            with NamedTemporaryFile(mode='w+b', delete=True) as tmp:
                s3_client.download_fileobj(corpora_bucket, key, tmp)
                # Rewind so gzip reads the download from its first byte.
                tmp.seek(0)

                with gzip.open(tmp, mode="rb") as gz:
                    for line in gz:
                        record = json.loads(line.decode('utf-8'))
                        pairs.append((record['keyphrase'], record['phrase_piece']))
    except KeyError as e:
        print(e)

    return sorted(pairs, key=lambda pair: pair[1], reverse=True)


In [None]:
import json
# TODO: set to the S3 bucket that holds the SemEval-2017 reference keyphrases.
# An empty placeholder keeps the cell syntactically valid until it is filled in.
universe_bucket = ""

def get_references(name):
    """Load the reference keyphrases stored for document id ``name``.

    Each object under ``midas/semeval2017/keyphrases/id=<name>/`` in
    ``universe_bucket`` is read as UTF-8 JSON and its ``keyphrases`` list is
    appended to the result. A KeyError raised while listing or reading is
    printed, and whatever was gathered up to that point is returned.
    """
    prefix = f"midas/semeval2017/keyphrases/id={name}/"

    gt = []
    try:
        object_keys = s3_bucket_keys(
            s3_client=s3_client, bucket_name=universe_bucket, bucket_prefix=prefix
        )
        for key in object_keys:
            body = s3_client.get_object(Bucket=universe_bucket, Key=key)['Body']
            document = json.loads(body.read().decode('utf-8'))
            gt.extend(document['keyphrases'])
    except KeyError as e:
        print(e)

    return gt

In [None]:
# Read every item of the DynamoDB table into `corpus_data`, following
# LastEvaluatedKey until the scan reports no further pages.
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('')

corpus_data = []
scan_kwargs = {}
while True:
    response = table.scan(**scan_kwargs)
    corpus_data.extend(response['Items'])
    if 'LastEvaluatedKey' not in response:
        break
    scan_kwargs['ExclusiveStartKey'] = response['LastEvaluatedKey']

In [None]:
import json

# Keep only the names of corpora created from the SemEval-2017 documents.
prefix = "semeval2017-"
semeval2017_corpora = [
    corpus_name
    for corpus_name in (item['corpus_name'] for item in corpus_data)
    if corpus_name.startswith(prefix)
]

semeval2017_corpora

In [None]:
!pip install rouge-score

In [None]:
from rouge_score import rouge_scorer

def _mean_prf(rows):
    """Column-wise mean of (precision, recall, fmeasure) triples; zeros if ``rows`` is empty."""
    if not rows:
        return 0.0, 0.0, 0.0
    count = len(rows)
    return (
        sum(row[0] for row in rows) / count,
        sum(row[1] for row in rows) / count,
        sum(row[2] for row in rows) / count,
    )


def f_score():
    """Macro-averaged ROUGE-L precision, recall and F-measure over ``semeval2017_corpora``.

    For each corpus, the candidates are ranked by the second element of their
    tuple (largest first) and cut to as many entries as there are references.
    Each kept candidate is scored against every reference and its best score
    (by F-measure) is retained. Scores are averaged per corpus, then across
    corpora.

    Edge cases, which previously raised ZeroDivisionError:
      * a corpus with no references is skipped (there is nothing to score against);
      * a corpus with references but no candidates contributes (0.0, 0.0, 0.0);
      * if no corpus could be scored, (0.0, 0.0, 0.0) is returned.

    Returns:
        A ``(precision, recall, fmeasure)`` tuple of floats.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    per_corpus = []

    for name in semeval2017_corpora:
        # References are looked up by the part of the corpus name after the last '-'.
        references = get_references(name.rsplit('-', 1)[1])
        if not references:
            print(f"Skipping {name}: no reference keyphrases found")
            continue

        candidates = get_candidates(name)
        candidates.sort(key=lambda x: x[1], reverse=True)
        candidates = candidates[:len(references)]

        best_scores = []
        for candidate, _ in candidates:
            # max() returns the first maximal element, matching what a stable
            # descending sort followed by [0] would pick.
            best = max(
                (scorer.score(reference, candidate)['rougeL'] for reference in references),
                key=lambda score: score.fmeasure,
            )
            best_scores.append((best.precision, best.recall, best.fmeasure))

        per_corpus.append(_mean_prf(best_scores))

    return _mean_prf(per_corpus)


# Run the evaluation and print precision, recall and F-measure, in that order.
p,r,f = f_score()
print(p,r,f)