## Create Kendra Index

---
## 1. Setup

In [None]:
import boto3
import os

# Build every client from one session so they all share the same credentials
# and region.
session = boto3.session.Session()
kendra = session.client("kendra")
iam_client = session.client("iam")

# Account id and region are interpolated into resource ARNs in later cells.
account_id = session.client("sts").get_caller_identity()["Account"]
region_name = session.region_name

# Bucket used for the thesaurus file and as the S3 data source.
s3_bucket = os.environ.get("S3_BUCKET_NAME")
if not s3_bucket:
    # Fail fast: otherwise the IAM policies created below would be written for
    # a bucket literally named "None" and the failure would surface much later.
    raise ValueError(
        "Set the S3_BUCKET_NAME environment variable before running this notebook."
    )

# Falls back to the developer edition when KENDRA_EDITION is unset or empty.
kendra_edition = os.environ.get("KENDRA_EDITION") or "DEVELOPER_EDITION" # DEVELOPER_EDITION | ENTERPRISE_EDITION


---
## 2. Create Kendra execution role

In [None]:
import json

# Create the IAM role that Amazon Kendra assumes for the index. Only the
# "role already exists" error is treated as recoverable (re-running the
# notebook); any other failure (permissions, throttling, ...) is surfaced.
try:
    role = iam_client.create_role(
        RoleName="HASearchIndexExecutionRole",
        AssumeRolePolicyDocument=json.dumps(
            {
                "Version": "2012-10-17",
                "Statement": [
                    {
                        "Effect": "Allow",
                        "Principal": {"Service": "kendra.amazonaws.com"},
                        "Action": "sts:AssumeRole",
                    }
                ],
            }
        ),
    )
except iam_client.exceptions.EntityAlreadyExistsException:
    role = iam_client.get_role(RoleName="HASearchIndexExecutionRole")

kendra_role_arn = role["Role"]["Arn"]
print(f"Index execution role ARN: {kendra_role_arn}")

# CloudWatch Logs ARNs must carry both the region and the account id
# (arn:aws:logs:<region>:<account-id>:log-group:...).
kendra_log_group_arn = (
    f"arn:aws:logs:{region_name}:{account_id}:log-group:/aws/kendra/*"
)

# put_role_policy overwrites the inline policy of the same name, so this call
# is safe to repeat.
iam_client.put_role_policy(
    RoleName="HASearchIndexExecutionRole",
    PolicyName="HASearchIndexExecutionPolicy",
    PolicyDocument=json.dumps(
        {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Action": ["cloudwatch:PutMetricData"],
                    "Resource": "*",
                    "Condition": {
                        "StringEquals": {"cloudwatch:namespace": "AWS/Kendra"}
                    },
                },
                {
                    "Effect": "Allow",
                    "Action": ["logs:DescribeLogGroups"],
                    "Resource": "*",
                },
                {
                    "Effect": "Allow",
                    "Action": ["logs:CreateLogGroup"],
                    "Resource": [kendra_log_group_arn],
                },
                {
                    "Effect": "Allow",
                    "Action": [
                        "logs:DescribeLogStreams",
                        "logs:CreateLogStream",
                        "logs:PutLogEvents",
                    ],
                    "Resource": [f"{kendra_log_group_arn}:log-stream:*"],
                },
                {
                    # This role is also passed to create_thesaurus, which reads
                    # the thesaurus file from the bucket.
                    "Effect": "Allow",
                    "Action": [
                        "s3:GetObject",
                    ],
                    "Resource": [
                        f"arn:aws:s3:::{s3_bucket}/*"
                    ],
                },
            ],
        }
    ),
)

---
## 3. Create Kendra index

In [None]:
import boto3
import time

# Reuses the `kendra` client created in the setup cell instead of building a
# second client outside the shared session.
index_name = "health-authority-search"
description = "Health authority document search"

index_response = kendra.create_index(
    Description=description,
    Name=index_name,
    Edition=kendra_edition,
    RoleArn=kendra_role_arn,
)

print(index_response)

index_id = index_response["Id"]

print("Wait for Amazon Kendra to create the index.")

while True:
    # Get the details of the index, such as the status
    index_description = kendra.describe_index(Id=index_id)
    status = index_description["Status"]
    print(" Creating index. Status: " + status)
    # Check before sleeping so no extra 30 seconds are spent once creation
    # has finished.
    if status != "CREATING":
        break
    time.sleep(30)

# Stop here on failure so the following cells do not run against a broken index.
if status == "FAILED":
    raise RuntimeError(
        f"Kendra index {index_id} failed to create: "
        f"{index_description.get('ErrorMessage', 'no error message returned')}"
    )

Update the index's document metadata configuration

In [None]:
# Every attribute is a searchable, displayable, non-sortable string field; the
# only per-attribute difference is whether it can be used as a facet.
# Each entry is (attribute name, facetable).
attribute_facets = [
    ("ApplicationNumber", False),
    ("BrandName", True),
    ("GenericName", True),
    ("ManufacturerName", True),
    ("DocumentId", False),
    ("Submission", True),
    ("_category", True),
]

response = kendra.update_index(
    Id=index_id,
    DocumentMetadataConfigurationUpdates=[
        {
            "Name": attribute_name,
            "Type": "STRING_VALUE",
            "Search": {
                "Facetable": is_facetable,
                "Searchable": True,
                "Displayable": True,
                "Sortable": False,
            },
        }
        for attribute_name, is_facetable in attribute_facets
    ],
)

response

Create thesaurus

In [None]:
from src.helpers import write_string_to_s3

# Brand name / generic name pairs; each pair becomes one comma-separated line
# of the thesaurus file.
thesaurus = [
    ("DARZALEX", "Daratumumab"),
    ("DUPIXENT", "Dupilumab"),
    ("ELIQUIS", "Apixaban"),
    ("EYLEA", "Aflibercept"),
    ("HUMIRA", "Adalimumab"),
    ("KEYTRUDA", "Pembrolizumab"),
    ("MOUNJARO", "Tirzepatide"),
    ("OPDIVO", "Nivolumab"),
    ("OZEMPIC", "Semaglutide"),
    ("TRULICITY", "Dulaglutide"),
    ("SKYRIZI", "Risankizumab"),
    ("STELARA", "Ustekinumab"),
    ("ZEPBOUND", "Tirzepatide"),
]

thesaurus_string = "\n".join(
    f"{brand_name},{generic_name}" for brand_name, generic_name in thesaurus
)

# Upload the file, then register it with the index under the same object key.
s3_uri = write_string_to_s3(thesaurus_string, s3_bucket, "thesaurus.txt")
print(s3_uri)

thesaurus_location = {"Bucket": s3_bucket, "Key": "thesaurus.txt"}
kendra.create_thesaurus(
    IndexId=index_id,
    Name="drug-name-thesaurus",
    Description="Brand name <> generic drug name thesaurus",
    RoleArn=kendra_role_arn,
    SourceS3Path=thesaurus_location,
)

---
## 4. Create Drugs@FDA Data Source

Create data source role

In [None]:
import json

# Create the IAM role that Amazon Kendra assumes to crawl the S3 data source.
# Only the "role already exists" error is treated as recoverable (re-running
# the notebook); any other failure is surfaced instead of being swallowed.
try:
    role = iam_client.create_role(
        RoleName="HASearchDataSourceRole",
        AssumeRolePolicyDocument=json.dumps(
            {
                "Version": "2012-10-17",
                "Statement": [
                    {
                        "Effect": "Allow",
                        "Principal": {"Service": "kendra.amazonaws.com"},
                        "Action": "sts:AssumeRole",
                    }
                ],
            }
        ),
    )
except iam_client.exceptions.EntityAlreadyExistsException:
    role = iam_client.get_role(RoleName="HASearchDataSourceRole")

data_source_role_arn = role["Role"]["Arn"]
print(f"Data source role ARN: {data_source_role_arn}")

# put_role_policy overwrites the inline policy of the same name, so this call
# is safe to repeat.
iam_client.put_role_policy(
    RoleName="HASearchDataSourceRole",
    PolicyName="HASearchDataSourcePolicy",
    PolicyDocument=json.dumps(
        {
            "Version": "2012-10-17",
            "Statement": [
                {
                    # Read the documents stored in the bucket.
                    "Action": ["s3:GetObject"],
                    "Resource": [f"arn:aws:s3:::{s3_bucket}/*"],
                    "Effect": "Allow",
                },
                {
                    # Listing is granted on the bucket itself, not on its objects.
                    "Action": ["s3:ListBucket"],
                    "Resource": [f"arn:aws:s3:::{s3_bucket}"],
                    "Effect": "Allow",
                },
                {
                    # Add documents to / remove documents from this one index only.
                    "Effect": "Allow",
                    "Action": ["kendra:BatchPutDocument", "kendra:BatchDeleteDocument"],
                    "Resource": f"arn:aws:kendra:{region_name}:{account_id}:index/{index_id}",
                },
            ],
        }
    ),
)

In [None]:
data_source_name = "drugs-at-fda"
data_source_description = "Drugs@FDA documents."
data_source_type = "S3"

# Only objects under the "drugs-at-fda" prefix of the bucket are included.
configuration = {
    "S3Configuration": {
        "BucketName": s3_bucket,
        "InclusionPrefixes": ["drugs-at-fda"],
    }
}

data_source_response = kendra.create_data_source(
    Name=data_source_name,
    # Pass the human-readable description, not the data source name.
    Description=data_source_description,
    RoleArn=data_source_role_arn,
    Type=data_source_type,
    Configuration=configuration,
    IndexId=index_id,
)

print(data_source_response)

data_source_id = data_source_response["Id"]

print("Wait for Amazon Kendra to create the data source.")

while True:
    # Get the details of the data source, such as the status. Stored under its
    # own name so the description string above is not overwritten.
    data_source_details = kendra.describe_data_source(
        Id=data_source_id, IndexId=index_id
    )
    status = data_source_details["Status"]
    print(" Creating data source. Status: " + status)
    # Check before sleeping so no extra 15 seconds are spent once creation
    # has finished.
    if status != "CREATING":
        break
    time.sleep(15)

# Stop here on failure so the sync cell does not run against a broken data source.
if status == "FAILED":
    raise RuntimeError(
        f"Kendra data source {data_source_id} failed to create: "
        f"{data_source_details.get('ErrorMessage', 'no error message returned')}"
    )


In [None]:
print("Synchronize the data source.")

# Start the sync job and pause for a few seconds before querying the job history.
sync_response = kendra.start_data_source_sync_job(
    IndexId=index_id,
    Id=data_source_id,
)
time.sleep(5)
print(sync_response)

print("Wait for the data source to sync with the index.")

# This reports the status once; it does not block until the sync finishes.
jobs = kendra.list_data_source_sync_jobs(
    IndexId=index_id,
    Id=data_source_id,
)

# Only the first history entry is inspected; a single job is expected here.
first_job = jobs["History"][0]
status = first_job["Status"]

print(" Syncing data source. Status: " + status)

In [None]:
# Re-run this cell to refresh the sync job history and display its first entry.
jobs = kendra.list_data_source_sync_jobs(
    IndexId=index_id,
    Id=data_source_id,
)
jobs["History"][0]