In [None]:
import boto3
# Lambda client; the corpus helper functions below call its invoke() method.
lambda_client = boto3.client("lambda")
# S3 client; used below to list keys, download/read objects and upload the score file.
s3_client = boto3.client("s3")

In [None]:
import json

# Name or ARN of the Lambda function that creates a corpus; fill in before running.
create_corpus_arn = ""

def create_corpus(name, s3_uri):
  """Invoke the create-corpus Lambda synchronously and return its decoded body.

  Args:
    name: Value sent as "CorpusName".
    s3_uri: Value sent as "S3Uri".

  Returns:
    The object obtained by JSON-decoding the "body" field of the Lambda's
    response payload.

  Raises:
    RuntimeError: if the invocation response carries a "FunctionError" marker,
      i.e. the function itself failed.
  """
  json_data = {
    "CorpusName": name,
    "S3Uri": s3_uri,
    "NMax": 3
  }

  # The event carries the request as a JSON string under "body".
  payload = json.dumps({ "body": json.dumps(json_data) })

  response = lambda_client.invoke(
      # Use the module-level constant instead of a hard-coded empty string.
      FunctionName=create_corpus_arn,
      InvocationType='RequestResponse',
      Payload=payload
  )

  print(response)

  json_obj = json.loads(response['Payload'].read())
  # Surface a function-side failure explicitly rather than as KeyError('body').
  if response.get('FunctionError'):
    raise RuntimeError(f"create_corpus Lambda failed for {name!r}: {json_obj}")

  data = json.loads(json_obj['body'])
  return data

In [None]:
import time

# Step Functions client; wait_for_sfn_sm() polls execution status through it.
sfn_client=boto3.client('stepfunctions')
def wait_for_sfn_sm(sm_execution_arn, poll_interval=15):
    """Block until a Step Functions execution is no longer RUNNING.

    Args:
        sm_execution_arn: ARN of the execution passed to describe_execution.
        poll_interval: Seconds to sleep between polls while the execution is
            still RUNNING. Defaults to 15, the previously hard-coded value.

    Returns:
        The first status reported by describe_execution that is not 'RUNNING'
        (None if the response has no 'status' field).
    """
    while True:
        response = sfn_client.describe_execution(executionArn=sm_execution_arn)
        status = response.get('status')
        if status != 'RUNNING':
            return status
        # Still running: wait before asking again.
        time.sleep(poll_interval)

In [None]:
def s3_bucket_keys(s3_client, bucket_name, bucket_prefix):
    """Generator for listing S3 bucket keys matching prefix.

    Args:
        s3_client: Object exposing list_objects_v2 (a boto3 S3 client).
        bucket_name: Bucket to list.
        bucket_prefix: Only keys starting with this prefix are listed.

    Yields:
        The 'Key' of every listed object, following continuation tokens
        across result pages. Yields nothing when no key matches.
    """

    kwargs = {'Bucket': bucket_name, 'Prefix': bucket_prefix}
    while True:
        resp = s3_client.list_objects_v2(**kwargs)
        # 'Contents' is absent from the response when the page has no keys.
        for obj in resp.get('Contents', []):
            yield obj['Key']

        token = resp.get('NextContinuationToken')
        if not token:
            break
        kwargs['ContinuationToken'] = token

In [None]:
def delete_corpus(corpus_id):
  """Invoke the delete-corpus Lambda synchronously for one corpus.

  Args:
    corpus_id: Value sent as "CorpusId".

  The request payload and the raw invocation response are printed; nothing is
  returned.
  """
  # Serialize with json.dumps so quotes/backslashes in corpus_id are escaped
  # and the body is always valid JSON.
  payload = json.dumps({ "body": json.dumps({ "CorpusId": corpus_id }) })

  print(payload)
  response = lambda_client.invoke(
      # NOTE(review): FunctionName is an empty string here; presumably the
      # delete-corpus Lambda name/ARN has to be filled in — confirm.
      FunctionName='',
      InvocationType='RequestResponse',
      Payload=payload
  )

  print(response)

In [None]:
universe_bucket=""
bucket_prefix = "midas/inspec/documents/"

# Collect one "s3://<bucket>/<key>" URI for every key listed under the prefix.
s3_uris = []
s3_uris.extend(
    f"s3://{universe_bucket}/{key}"
    for key in s3_bucket_keys(
        s3_client=s3_client,
        bucket_name=universe_bucket,
        bucket_prefix=bucket_prefix,
    )
)


In [None]:
import re

# NOTE(review): unconditional one-hour pause before anything is submitted;
# the reason is not visible in this notebook — confirm it is still wanted.
time.sleep(3600)

sms = []
count = 0
max_count = 1000

# When non-empty, only corpus names listed here are submitted.
filter_names = []
id_pattern = re.compile(r".+\/id=(\w+)\/.+")
for s3_uri in s3_uris:
    m = id_pattern.match(s3_uri)
    if m:
        name = "inspec-" + m.group(1)
        if not filter_names or name in filter_names:
            response = create_corpus(name=name, s3_uri=s3_uri)
            # Remember (name, state machine execution, corpus id, source URI).
            sms.append((name, response['CorpusStateMachine'], response['CorpusId'], s3_uri))
            count = count + 1
            # Space submissions two minutes apart.
            time.sleep(120)
    if count >= max_count:
        break
print(f"Fast Corpus State Machines running count: {len(sms)}")

In [None]:
# Wait for every submitted execution; delete and record each corpus that did
# not finish with SUCCEEDED.
create_corpus_failed = []
for name, sm, corpus_id, s3_uri in sms:
    status = wait_for_sfn_sm(sm_execution_arn=sm)
    if status == "SUCCEEDED":
        continue
    delete_corpus(corpus_id=corpus_id)
    create_corpus_failed.append((name, s3_uri))

if len(create_corpus_failed) > 0:
    print(f"Fast Corpus Failed: {create_corpus_failed}")

In [None]:

# Re-submit the failed corpora, round after round, until none is left.
# NOTE(review): there is no upper bound on the number of rounds.
while create_corpus_failed:
    # Submission phase: one new execution per previously failed corpus.
    sms.clear()
    for name, s3_uri in create_corpus_failed:
        response = create_corpus(name=name, s3_uri=s3_uri)
        sms.append((name, response['CorpusStateMachine'], response['CorpusId'], s3_uri))
        time.sleep(60)

    # Collection phase: rebuild the failure list from this round's results.
    create_corpus_failed.clear()
    for name, sm, corpus_id, s3_uri in sms:
        status = wait_for_sfn_sm(sm_execution_arn=sm)
        if status == "SUCCEEDED":
            continue
        delete_corpus(corpus_id=corpus_id)
        create_corpus_failed.append((name, s3_uri))

    if len(create_corpus_failed) > 0:
        print(f"Fast corpus Failed: {create_corpus_failed}")

In [None]:
import boto3

dynamodb = boto3.resource('dynamodb')
# NOTE(review): the table name is an empty string here — fill in before running.
table = dynamodb.Table('')

# Full table scan: keep requesting pages while the previous page reports a
# LastEvaluatedKey, accumulating every page's items into `data`.
data = []
scan_kwargs = {}
while True:
    response = table.scan(**scan_kwargs)
    data.extend(response['Items'])
    if 'LastEvaluatedKey' not in response:
        break
    scan_kwargs = {'ExclusiveStartKey': response['LastEvaluatedKey']}

In [None]:
import json

# Name or ARN of the Lambda function that computes catchphrases; fill in before running.
compute_catchphrase_arn = ""

def compute_catchphrases(corpus_id, tag, top_k=100, affinity=0.10, n_distance=0.0, c_distance=0.0):
  """Invoke the compute-catchphrases Lambda synchronously and return its decoded body.

  Args:
    corpus_id: Value sent as "CorpusId".
    tag: Value sent as "Tag".
    top_k: Value sent as "TopK".
    affinity: Value sent as "AffinityThreshold".
    n_distance: Value sent as "NDistanceThreshold".
    c_distance: Value sent as "CDistanceThreshold".

  Returns:
    The object obtained by JSON-decoding the "body" field of the Lambda's
    response payload.

  Raises:
    RuntimeError: if the invocation response carries a "FunctionError" marker,
      i.e. the function itself failed.
  """
  json_data = {
    "CorpusId": corpus_id,
    "Tag": tag,
    "TopK": top_k,
    "AffinityThreshold": affinity,
    # Regular expression over part-of-speech tag names, sent verbatim.
    "PosPattern": "^(DET)?(((PROPN)|(NOUN))+|(((PROPN)|(NOUN))*((ADJ)|(VERB))+((PROPN)|(NOUN))+)|(((PROPN)|(NOUN)|(ADJ)|(VERB))+((CCONJ)|((ADP)(DET)?))((PROPN)|(NOUN))+))$",
    "NDistanceThreshold": n_distance,
    "CDistanceThreshold": c_distance,
  }

  # The event carries the request as a JSON string under "body".
  payload = json.dumps({ "body": json.dumps(json_data) })

  response = lambda_client.invoke(
      # Use the module-level constant instead of a hard-coded empty string.
      FunctionName=compute_catchphrase_arn,
      InvocationType='RequestResponse',
      Payload=payload
  )

  json_obj = json.loads(response['Payload'].read())
  # Surface a function-side failure explicitly rather than as KeyError('body').
  if response.get('FunctionError'):
    raise RuntimeError(f"compute_catchphrases Lambda failed for {corpus_id!r}: {json_obj}")

  data = json.loads(json_obj['body'])
  return data

In [None]:
compute_catchphrases_list = []
csms = []

# Submit a catchphrase computation for each scanned item whose corpus name has
# the "semeval2017-" prefix and whose state is READY.
for item in data:
  corpus_name = item['corpus_name']
  if not corpus_name.startswith("semeval2017-"):
     continue
  corpus_state = item['corpus_state']
  if corpus_state != 'READY':
    continue

  corpus_id = item['corpus_id']
  response = compute_catchphrases(corpus_id=corpus_id, tag=corpus_name)
  csms.append((corpus_name, response['CorpusStateMachine'], corpus_id))
  time.sleep(5)

# Wait for every execution; keep the ones that did not end with SUCCEEDED.
for name, sm, corpus_id in csms:
    status = wait_for_sfn_sm(sm_execution_arn=sm)
    if status == "SUCCEEDED":
        continue
    compute_catchphrases_list.append((name, corpus_id))

In [None]:
# Report the corpora whose catchphrase computation has not succeeded yet.
pending_count = len(compute_catchphrases_list)
if pending_count:
    print(f"Pending compute corpus: {pending_count} {compute_catchphrases_list}")

In [None]:
# Re-submit the catchphrase computation for every corpus still pending, round
# after round, until none is left.
# NOTE(review): there is no upper bound on the number of rounds.
while len(compute_catchphrases_list) > 0:
    csms.clear()
    for name, corpus_id in compute_catchphrases_list:
        # Use this entry's own name for both the tag and the bookkeeping tuple;
        # the previous code read `corpus_name`, a variable left over from an
        # earlier cell's loop, so every retry was tagged with the same name.
        response = compute_catchphrases(corpus_id=corpus_id, tag=name)
        csms.append( (name, response['CorpusStateMachine'], corpus_id) )
        time.sleep(60)

    # Rebuild the pending list from this round's results.
    compute_catchphrases_list.clear()
    for name, sm, corpus_id in csms:
        status = wait_for_sfn_sm(sm_execution_arn=sm)
        if status != "SUCCEEDED":
            compute_catchphrases_list.append( (name, corpus_id) )

    pending_count = len(compute_catchphrases_list)
    if pending_count > 0:
        print(f"Pending compute corpus: {pending_count} {compute_catchphrases_list}")

In [None]:
from tempfile import NamedTemporaryFile
import gzip


# Bucket holding the computed catchphrases; fill in before running.
corpora_bucket = ""

def get_extracted(name):
    """Load the (catchphrase, weight) pairs stored for a tag, heaviest first.

    Every object under "catchphrases/tag=<name>/" in `corpora_bucket` is
    downloaded to a temporary file and read as gzip-compressed text with one
    JSON document per line. A KeyError raised while listing or reading is
    printed and ends the loading early; pairs gathered so far are kept.

    Returns:
        List of (catchphrase, weight) tuples sorted by weight, descending.
    """
    prefix = f"catchphrases/tag={name}/"

    pairs = []
    try:
        keys = s3_bucket_keys(s3_client=s3_client, bucket_name=corpora_bucket, bucket_prefix=prefix)
        for key in keys:
            with NamedTemporaryFile(mode='w+b', delete=True) as tmp:
                s3_client.download_fileobj(corpora_bucket, key, tmp)
                # Rewind so the gzip reader starts at the beginning of the download.
                tmp.seek(0)

                with gzip.open(tmp, mode="rb") as reader:
                    for raw_line in reader:
                        record = json.loads(raw_line.decode('utf-8'))
                        pairs.append((record['catchphrase'], record['weight']))
    except KeyError as e:
        print(e)

    return sorted(pairs, key=lambda pair: pair[1], reverse=True)


In [None]:
import json
# Bucket holding the ground-truth keyphrase documents; fill in before running.
universe_bucket=""

def get_gt_keyphrases(name):
    """Collect the ground-truth keyphrases stored for one document id.

    Every object under "midas/semeval2017/keyphrases/id=<name>/" in
    `universe_bucket` is read as a UTF-8 JSON document and the entries of its
    'keyphrases' field are appended to the result. A KeyError raised while
    listing or reading is printed and ends the loading early; entries gathered
    so far are kept.

    Returns:
        List with the keyphrases of all objects read, in listing order.
    """
    prefix = f"midas/semeval2017/keyphrases/id={name}/"

    keyphrases = []
    try:
        for key in s3_bucket_keys(s3_client=s3_client, bucket_name=universe_bucket, bucket_prefix=prefix):
            body = s3_client.get_object(Bucket=universe_bucket, Key=key)['Body']
            document = json.loads(body.read().decode('utf-8'))
            keyphrases.extend(document['keyphrases'])
    except KeyError as e:
        print(e)

    return keyphrases

In [None]:
import string

def normalize_phrases(retrieved, gt):
    """Split retrieved phrases around the ground-truth phrases they contain.

    For each ground-truth phrase, the first retrieved phrase that contains it
    as a substring is replaced by up to three entries: the stripped text
    before the match, the ground-truth phrase itself, and the stripped text
    after it. A leftover is dropped when it is empty or is found inside
    string.punctuation. Only that first containing phrase is split per
    ground-truth phrase.

    Args:
        retrieved: List of phrase strings; not modified.
        gt: Iterable of ground-truth phrase strings.

    Returns:
        The resulting phrase list (`retrieved` itself when nothing matched).
    """
    phrases = retrieved

    for target in gt:
        for position, candidate in enumerate(phrases):
            start = candidate.find(target)
            if start < 0:
                continue

            before = candidate[:start].strip()
            after = candidate[start + len(target):].strip()

            pieces = []
            if before and before not in string.punctuation:
                pieces.append(before)
            pieces.append(target)
            if after and after not in string.punctuation:
                pieces.append(after)

            # Build a new list rather than editing in place, then stop scanning
            # for this ground-truth phrase.
            phrases = phrases[:position] + pieces + phrases[position + 1:]
            break

    return phrases

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def f_score():
    """Compute precision, recall, F1 and F2 over all "semeval2017-" corpora.

    For every item of the module-level `data` whose 'corpus_name' starts with
    "semeval2017-", the extracted catchphrases are lower-cased, normalized
    against the ground-truth keyphrases of the document id (the part of the
    name after the last '-'), and counted:

    * true positives: distinct ground-truth phrases found in the normalized list,
    * false positives: normalized list length minus the true positives,
    * false negatives: distinct ground-truth phrases not found.

    The counts are summed over all corpora before the scores are computed.
    Any score whose denominator is zero is reported as 0.0 instead of raising
    ZeroDivisionError.

    Returns:
        Tuple (precision, recall, F1, F2).
    """

    true_positive = []
    false_positive = []
    false_negative = []

    for item in data:
        name = item['corpus_name']
        if not name.startswith("semeval2017-"):
            continue
        extracted = get_extracted(name)

        # NOTE(review): only the retrieved phrases are lower-cased; the
        # ground-truth phrases are compared as stored — confirm they are
        # already lower-case.
        retrieved = [phrase.lower() for phrase, _ in extracted]

        print(f"{name}: Retrieved: {retrieved}")
        gt_keyphrases = get_gt_keyphrases(name.rsplit('-', 1)[1])
        print(f"{name}: Ground Truth: {gt_keyphrases}")
        normalized = normalize_phrases(retrieved=retrieved, gt=gt_keyphrases)
        print(f"{name}: Normalized: {normalized}")

        tp = set()
        fn = set()
        for keyphrase in gt_keyphrases:
            if keyphrase in normalized:
                tp.add(keyphrase)
            else:
                fn.add(keyphrase)

        true_positive.append(len(tp))
        false_positive.append(len(normalized) - len(tp))
        false_negative.append(len(fn))

    tp_sum = sum(true_positive)
    fp_sum = sum(false_positive)
    fn_sum = sum(false_negative)

    # Guard every division: nothing retrieved, no ground truth, or no true
    # positive would otherwise divide by zero.
    precision = _safe_ratio(tp_sum, tp_sum + fp_sum)
    recall = _safe_ratio(tp_sum, tp_sum + fn_sum)
    F1 = _safe_ratio(2*precision*recall, precision+recall)
    F2 = _safe_ratio(5*precision*recall, 4*precision+recall)

    return precision, recall, F1, F2
        

In [None]:
import time

precision, recall, F1, F2 = f_score()

key = "FScore/semeval2017.json"
# NOTE(review): topk / affinity / distance / max_length are literals written
# here, not values read back from the computation — confirm they describe the
# run that produced these scores.
json_obj = dict(
    dataset="semeval2017",
    topk=1000,
    affinity=0.01,
    distance=0.10,
    max_length=5,
    timestamp=int(time.time()),
    F1=F1,
    F2=F2,
    recall=recall,
    precision=precision,
)
print(json_obj)

# Store the scores as a JSON document in the corpora bucket.
s3_client.put_object(
    Bucket=corpora_bucket,
    Key=key,
    Body=json.dumps(json_obj),
)

    