Skip to content

Commit

Permalink
feat: Added delete document functionality (#464)
Browse files Browse the repository at this point in the history
* feat: Added delete document functionality

Added delete functionality for all types of documents (Files, Texts, Q&A and Websites). The feature deletes the documents from S3 upload bucket, S3 processed bucket, DynamoDB documents table, OpenSearch index and also updates DynamoDB workspaces table.
Following are the major code changes:
1. Added delete button on UI for each row of the documents.
2. Added confirmation dialog via Modal so that user can Cancel/Delete the document from there.
3. Created AWS step function to use State Machines and delete document workflow. This way, the whole process is organised and is automatically rolled back if any of the operation in the step function fails.

Major components and their working is as below:
1. documents-tab.tsx has functionality related to delete button and handling of confirmation Modal.
2. documents-client.ts has function deleteDocument to hit the backend API.
3. delete_document function in lib/chatbot-api/functions/api-handler/routes/documents.py handles the API request
4. deleteDocumentWorkflow is created in lib/rag-engines/workspaces/index.ts
5. delete-document.ts has internal structure of Delete document workflow
6. The lambda function to handle the workflow is written in lib/rag-engines/workspaces/functions/delete-document-workflow/delete/index.py
7. The execution of state machine starts in delete_document function of lib/shared/layers/python-sdk/python/genai_core/documents.py
8. The actual deletion of documents happens in delete_open_search_document function of lib/shared/layers/python-sdk/python/genai_core/opensearch/delete.py

Request flow would be like documents-client -> documents.py (api handler) -> documents.py (genai_core) -> index.py (delete-document-workflow) -> delete.py (genai_core/opensearch)

As part of this change, also updated version of opensearch-py which was initially updated as calling direct http methods was not allowed in earlier version but later on calling http methods was not required. Kept this change for future perspective as it would have no impact.

* Added missing cursor commit for Aurora DB

---------

Co-authored-by: Bigad Soleiman <bigadsoleiman@gmail.com>
  • Loading branch information
azaylamba and bigadsoleiman committed Jun 10, 2024
1 parent 3cacec0 commit 06aed3b
Show file tree
Hide file tree
Showing 19 changed files with 711 additions and 53 deletions.
3 changes: 3 additions & 0 deletions lib/aws-genai-llm-chatbot-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,9 @@ export class AwsGenAILLMChatbotStack extends cdk.Stack {
`/${this.stackName}/RagEngines/Workspaces/DeleteWorkspace/DeleteWorkspaceFunction/ServiceRole/Resource`,
`/${this.stackName}/RagEngines/Workspaces/DeleteWorkspace/DeleteWorkspaceFunction/ServiceRole/DefaultPolicy/Resource`,
`/${this.stackName}/RagEngines/Workspaces/DeleteWorkspace/DeleteWorkspace/Role/DefaultPolicy/Resource`,
`/${this.stackName}/RagEngines/Workspaces/DeleteDocument/DeleteDocumentFunction/ServiceRole/Resource`,
`/${this.stackName}/RagEngines/Workspaces/DeleteDocument/DeleteDocumentFunction/ServiceRole/DefaultPolicy/Resource`,
`/${this.stackName}/RagEngines/Workspaces/DeleteDocument/DeleteDocument/Role/DefaultPolicy/Resource`,
`/${this.stackName}/RagEngines/DataImport/FileImportBatchJob/ManagedEc2EcsComputeEnvironment/InstanceProfileRole/Resource`,
`/${this.stackName}/RagEngines/DataImport/WebCrawlerBatchJob/WebCrawlerManagedEc2EcsComputeEnvironment/InstanceProfileRole/Resource`,
`/${this.stackName}/BucketNotificationsHandler050a0587b7544547bf325f094a3db834/Role/Resource`,
Expand Down
10 changes: 10 additions & 0 deletions lib/chatbot-api/functions/api-handler/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ class GetDocumentRequest(BaseModel):
workspaceId: str
documentId: str

class DeleteDocumentRequest(BaseModel):
workspaceId: str
documentId: str

class GetRssPostsRequest(BaseModel):
workspaceId: str
Expand Down Expand Up @@ -132,6 +135,13 @@ def get_documents(input: dict):
"lastDocumentId": result["last_document_id"],
}

@router.resolver(field_name="deleteDocument")
@tracer.capture_method
def delete_document(input: dict):
request = DeleteDocumentRequest(**input)
result = genai_core.documents.delete_document(request.workspaceId, request.documentId)

return result

@router.resolver(field_name="getDocument")
@tracer.capture_method
Expand Down
6 changes: 6 additions & 0 deletions lib/chatbot-api/rest-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ export class ApiResolvers extends Construct {
?.attrEndpointName ?? "",
DELETE_WORKSPACE_WORKFLOW_ARN:
props.ragEngines?.deleteWorkspaceWorkflow?.stateMachineArn ?? "",
DELETE_DOCUMENT_WORKFLOW_ARN:
props.ragEngines?.deleteDocumentWorkflow?.stateMachineArn ?? "",
CREATE_AURORA_WORKSPACE_WORKFLOW_ARN:
props.ragEngines?.auroraPgVector?.createAuroraWorkspaceWorkflow
?.stateMachineArn ?? "",
Expand Down Expand Up @@ -225,6 +227,10 @@ export class ApiResolvers extends Construct {
);
}

if (props.ragEngines?.deleteDocumentWorkflow) {
props.ragEngines.deleteDocumentWorkflow.grantStartExecution(apiHandler);
}

if (props.ragEngines?.sageMakerRagModels) {
apiHandler.addToRolePolicy(
new iam.PolicyStatement({
Expand Down
16 changes: 14 additions & 2 deletions lib/chatbot-api/schema/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ type DocumentResult @aws_cognito_user_pools {
status: String
}

type DeleteDocumentResult @aws_cognito_user_pools {
documentId: String!
deleted: Boolean!
}

type UserFeedbackResult @aws_cognito_user_pools {
feedback_id: String!
}
Expand Down Expand Up @@ -242,6 +247,11 @@ type SessionHistoryItem @aws_cognito_user_pools {
metadata: String
}

input DeleteDocumentInput {
workspaceId: String!
documentId: String!
}

input UserFeedbackInput {
sessionId: String!
key: Int!
Expand Down Expand Up @@ -311,9 +321,11 @@ type Mutation {
@aws_cognito_user_pools
startKendraDataSync(workspaceId: String!): Boolean @aws_cognito_user_pools
deleteWorkspace(workspaceId: String!): Boolean @aws_cognito_user_pools
deleteDocument(input: DeleteDocumentInput!): DeleteDocumentResult @aws_cognito_user_pools
addTextDocument(input: TextDocumentInput!): DocumentResult
@aws_cognito_user_pools
addUserFeedback(input: UserFeedbackInput!): UserFeedbackResult @aws_cognito_user_pools
addUserFeedback(input: UserFeedbackInput!): UserFeedbackResult
@aws_cognito_user_pools
addQnADocument(input: QnADocumentInput!): DocumentResult
@aws_cognito_user_pools
setDocumentSubscriptionStatus(
Expand Down Expand Up @@ -366,4 +378,4 @@ schema {
query: Query
mutation: Mutation
subscription: Subscription
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def process_record(record):
workspace_id=workspace_id,
document_type="file",
path=file_name,
title=file_name,
size_in_bytes=object_size,
)

Expand Down
2 changes: 2 additions & 0 deletions lib/rag-engines/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ export class RagEngines extends Construct {
public readonly fileImportWorkflow?: sfn.StateMachine;
public readonly websiteCrawlingWorkflow?: sfn.StateMachine;
public readonly deleteWorkspaceWorkflow?: sfn.StateMachine;
public readonly deleteDocumentWorkflow?: sfn.StateMachine;
public readonly dataImport: DataImport;

constructor(scope: Construct, id: string, props: RagEnginesProps) {
Expand Down Expand Up @@ -118,6 +119,7 @@ export class RagEngines extends Construct {
this.fileImportWorkflow = dataImport.fileImportWorkflow;
this.websiteCrawlingWorkflow = dataImport.websiteCrawlingWorkflow;
this.deleteWorkspaceWorkflow = workspaces.deleteWorkspaceWorkflow;
this.deleteDocumentWorkflow = workspaces.deleteDocumentWorkflow;
this.dataImport = dataImport;
}
}
177 changes: 177 additions & 0 deletions lib/rag-engines/workspaces/delete-document.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import * as cdk from "aws-cdk-lib";
import * as iam from "aws-cdk-lib/aws-iam";
import * as lambda from "aws-cdk-lib/aws-lambda";
import * as logs from "aws-cdk-lib/aws-logs";
import * as sfn from "aws-cdk-lib/aws-stepfunctions";
import * as tasks from "aws-cdk-lib/aws-stepfunctions-tasks";
import { Construct } from "constructs";
import * as path from "path";
import { Shared } from "../../shared";
import { SystemConfig } from "../../shared/types";
import { AuroraPgVector } from "../aurora-pgvector";
import { DataImport } from "../data-import";
import { KendraRetrieval } from "../kendra-retrieval";
import { OpenSearchVector } from "../opensearch-vector";
import { RagDynamoDBTables } from "../rag-dynamodb-tables";
import { RemovalPolicy } from "aws-cdk-lib";

export interface DeleteDocumentProps {
readonly config: SystemConfig;
readonly shared: Shared;
readonly dataImport: DataImport;
readonly ragDynamoDBTables: RagDynamoDBTables;
readonly auroraPgVector?: AuroraPgVector;
readonly openSearchVector?: OpenSearchVector;
readonly kendraRetrieval?: KendraRetrieval;
}

export class DeleteDocument extends Construct {
public readonly stateMachine?: sfn.StateMachine;

constructor(scope: Construct, id: string, props: DeleteDocumentProps) {
super(scope, id);

const deleteFunction = new lambda.Function(this, "DeleteDocumentFunction", {
vpc: props.shared.vpc,
code: props.shared.sharedCode.bundleWithLambdaAsset(
path.join(__dirname, "./functions/delete-document-workflow/delete")
),
runtime: props.shared.pythonRuntime,
architecture: props.shared.lambdaArchitecture,
handler: "index.lambda_handler",
layers: [props.shared.powerToolsLayer, props.shared.commonLayer],
timeout: cdk.Duration.minutes(15),
logRetention: logs.RetentionDays.ONE_WEEK,
environment: {
...props.shared.defaultEnvironmentVariables,
AURORA_DB_SECRET_ID: props.auroraPgVector?.database.secret
?.secretArn as string,
UPLOAD_BUCKET_NAME: props.dataImport.uploadBucket.bucketName,
PROCESSING_BUCKET_NAME: props.dataImport.processingBucket.bucketName,
WORKSPACES_TABLE_NAME:
props.ragDynamoDBTables.workspacesTable.tableName,
WORKSPACES_BY_OBJECT_TYPE_INDEX_NAME:
props.ragDynamoDBTables.workspacesByObjectTypeIndexName,
DOCUMENTS_TABLE_NAME:
props.ragDynamoDBTables?.documentsTable.tableName ?? "",
DOCUMENTS_BY_COMPOUND_KEY_INDEX_NAME:
props.ragDynamoDBTables?.documentsByCompoundKeyIndexName ?? "",
DEFAULT_KENDRA_S3_DATA_SOURCE_BUCKET_NAME:
props.kendraRetrieval?.kendraS3DataSourceBucket?.bucketName ?? "",
OPEN_SEARCH_COLLECTION_ENDPOINT:
props.openSearchVector?.openSearchCollectionEndpoint ?? "",
},
});

if (props.auroraPgVector) {
props.auroraPgVector.database.secret?.grantRead(deleteFunction);
props.auroraPgVector.database.connections.allowDefaultPortFrom(
deleteFunction
);
}

if (props.openSearchVector) {
deleteFunction.addToRolePolicy(
new iam.PolicyStatement({
actions: [
"aoss:APIAccessAll",
"aoss:DescribeIndex",
"aoss:UpdateIndex",
],
resources: [props.openSearchVector.openSearchCollection.attrArn],
})
);

props.openSearchVector.addToAccessPolicy(
"delete-document",
[deleteFunction.role?.roleArn],
[
"aoss:DescribeIndex",
"aoss:UpdateIndex",
"aoss:ReadDocument",
"aoss:WriteDocument",
]
);
}

props.dataImport.uploadBucket.grantReadWrite(deleteFunction);
props.dataImport.processingBucket.grantReadWrite(deleteFunction);
props.kendraRetrieval?.kendraS3DataSourceBucket?.grantReadWrite(
deleteFunction
);
props.ragDynamoDBTables.workspacesTable.grantReadWriteData(deleteFunction);
props.ragDynamoDBTables.documentsTable.grantReadWriteData(deleteFunction);

const handleError = new tasks.DynamoUpdateItem(this, "HandleError", {
table: props.ragDynamoDBTables.documentsTable,
key: {
workspace_id: tasks.DynamoAttributeValue.fromString(
sfn.JsonPath.stringAt("$.workspace_id")
),
document_id: tasks.DynamoAttributeValue.fromString(
sfn.JsonPath.stringAt("$.document_id")
),
},
updateExpression: "set #status = :error",
expressionAttributeNames: {
"#status": "status",
},
expressionAttributeValues: {
":error": tasks.DynamoAttributeValue.fromString("error"),
},
}).next(
new sfn.Fail(this, "Fail", {
cause: "Document deletion failed",
})
);

const setDeleting = new tasks.DynamoUpdateItem(this, "SetDeleting", {
table: props.ragDynamoDBTables.documentsTable,
key: {
workspace_id: tasks.DynamoAttributeValue.fromString(
sfn.JsonPath.stringAt("$.workspace_id")
),
document_id: tasks.DynamoAttributeValue.fromString(
sfn.JsonPath.stringAt("$.document_id")
),
},
updateExpression: "set #status=:statusValue",
expressionAttributeNames: {
"#status": "status",
},
expressionAttributeValues: {
":statusValue": tasks.DynamoAttributeValue.fromString("deleting"),
},
resultPath: sfn.JsonPath.DISCARD,
});

const deleteTask = new tasks.LambdaInvoke(this, "Delete", {
lambdaFunction: deleteFunction,
resultPath: "$.deleteResult",
}).addCatch(handleError, {
errors: ["States.ALL"],
resultPath: "$.deleteResult",
});

const workflow = setDeleting
.next(deleteTask)
.next(new sfn.Succeed(this, "Success"));

const logGroup = new logs.LogGroup(this, "DeleteDocumentSMLogGroup", {
removalPolicy: RemovalPolicy.DESTROY,
});

const stateMachine = new sfn.StateMachine(this, "DeleteDocument", {
definitionBody: sfn.DefinitionBody.fromChainable(workflow),
timeout: cdk.Duration.minutes(5),
comment: "Delete Document Workflow",
tracingEnabled: true,
logs: {
destination: logGroup,
level: sfn.LogLevel.ALL,
},
});

this.stateMachine = stateMachine;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import genai_core.types
import genai_core.workspaces
import genai_core.documents
import genai_core.aurora.delete
import genai_core.opensearch.delete
import genai_core.kendra.delete
from aws_lambda_powertools import Logger
from aws_lambda_powertools.utilities.typing import LambdaContext

logger = Logger()


@logger.inject_lambda_context(log_event=True)
def lambda_handler(event, context: LambdaContext):
workspace_id = event["workspace_id"]
document_id = event["document_id"]
workspace = genai_core.workspaces.get_workspace(workspace_id)
if workspace is None:
raise genai_core.types.CommonError("Workspace not found")

document = genai_core.documents.get_document(workspace_id, document_id)
if document is None:
raise genai_core.types.CommonError("Document not found")

if workspace["engine"] == "opensearch":
genai_core.opensearch.delete.delete_open_search_document(workspace_id, document)
elif workspace["engine"] == "aurora":
genai_core.aurora.delete.delete_aurora_document(workspace_id, document)
elif workspace["engine"] == "kendra":
genai_core.kendra.delete.delete_kendra_document(workspace_id, document)
else:
raise genai_core.types.CommonError("Workspace engine not supported")
21 changes: 19 additions & 2 deletions lib/rag-engines/workspaces/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { KendraRetrieval } from "../kendra-retrieval";
import { OpenSearchVector } from "../opensearch-vector";
import { RagDynamoDBTables } from "../rag-dynamodb-tables";
import { DeleteWorkspace } from "./delete-workspace";
import { DeleteDocument } from "./delete-document";

export interface WorkkspacesProps {
readonly config: SystemConfig;
Expand All @@ -21,11 +22,26 @@ export interface WorkkspacesProps {

export class Workspaces extends Construct {
public readonly deleteWorkspaceWorkflow?: sfn.StateMachine;
public readonly deleteDocumentWorkflow?: sfn.StateMachine;

constructor(scope: Construct, id: string, props: WorkkspacesProps) {
super(scope, id);

const workflow = new DeleteWorkspace(this, "DeleteWorkspace", {
const deleteWorkspaceWorkflow = new DeleteWorkspace(
this,
"DeleteWorkspace",
{
config: props.config,
shared: props.shared,
dataImport: props.dataImport,
ragDynamoDBTables: props.ragDynamoDBTables,
auroraPgVector: props.auroraPgVector,
openSearchVector: props.openSearchVector,
kendraRetrieval: props.kendraRetrieval,
}
);

const deleteDocumentWorkflow = new DeleteDocument(this, "DeleteDocument", {
config: props.config,
shared: props.shared,
dataImport: props.dataImport,
Expand All @@ -35,6 +51,7 @@ export class Workspaces extends Construct {
kendraRetrieval: props.kendraRetrieval,
});

this.deleteWorkspaceWorkflow = workflow.stateMachine;
this.deleteWorkspaceWorkflow = deleteWorkspaceWorkflow.stateMachine;
this.deleteDocumentWorkflow = deleteDocumentWorkflow.stateMachine;
}
}
2 changes: 1 addition & 1 deletion lib/shared/layers/common/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ aws_requests_auth==0.4.3
requests-aws4auth==1.2.3
langchain==0.1.17
langchain-community==0.0.36
opensearch-py==2.3.1
opensearch-py==2.4.2
psycopg2-binary==2.9.7
pgvector==0.2.2
pydantic==2.4.0
Expand Down
Loading

0 comments on commit 06aed3b

Please sign in to comment.