feat: enable web scraping to parse and save pdf content
Rob-Powell committed May 2, 2024
1 parent 47d7499 commit 24f77bd
Showing 13 changed files with 217 additions and 21 deletions.
8 changes: 8 additions & 0 deletions lib/chatbot-api/functions/api-handler/routes/documents.py
@@ -35,6 +35,7 @@ class WebsiteDocumentRequest(BaseModel):
address: str
followLinks: bool
limit: int
contentTypes: Optional[list]


class RssFeedDocumentRequest(BaseModel):
@@ -44,12 +45,14 @@ class RssFeedDocumentRequest(BaseModel):
limit: int
title: Optional[str]
followLinks: bool
contentTypes: Optional[list]


class RssFeedCrawlerUpdateRequest(BaseModel):
documentType: str
followLinks: bool
limit: int
contentTypes: Optional[str]


class ListDocumentsRequest(BaseModel):
@@ -237,6 +240,7 @@ def add_website(input: dict):
crawler_properties={
"follow_links": request.followLinks,
"limit": limit,
"content_types": request.contentTypes,
},
)

@@ -263,6 +267,7 @@ def add_rss_feed(
crawler_properties={
"follow_links": request.followLinks,
"limit": request.limit,
"content_types": request.contentTypes,
},
)

@@ -282,6 +287,7 @@ def update_rss_feed(input: dict):
document_type="rssfeed",
follow_links=request.followLinks,
limit=request.limit,
content_types=request.contentTypes,
)
return {
"workspaceId": result["workspace_id"],
@@ -295,6 +301,7 @@ def _convert_document(document: dict):
document["crawler_properties"] = {
"followLinks": document["crawler_properties"]["follow_links"],
"limit": document["crawler_properties"]["limit"],
"contentTypes": document["crawler_properties"]["content_types"],
}
return {
"id": document["document_id"],
@@ -315,6 +322,7 @@ def _convert_document(document: dict):
"crawlerProperties": {
"followLinks": document.get("crawler_properties").get("follow_links", None),
"limit": document.get("crawler_properties").get("limit", None),
"contentTypes": document.get("crawler_properties").get("content_types", None),
}
if document.get("crawler_properties", None) != None
else None,
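A minimal sketch of how the new field travels from the route into `crawler_properties`, with example values assumed. Note that `contentTypes` is optional on the request models, so it can arrive as `None`; the default is applied later in `batch_crawl_websites` (see below).

```python
# Hedged sketch of the camelCase -> snake_case mapping performed by add_website;
# the request values below are assumptions for illustration only.
request = {
    "address": "https://example.com/docs",
    "followLinks": True,
    "limit": 90,
    "contentTypes": ["text/html", "application/pdf"],
}

crawler_properties = {
    "follow_links": request["followLinks"],
    "limit": request["limit"],
    # May be None when the client omits contentTypes; the batch crawl job
    # falls back to ["text/html"] in that case.
    "content_types": request.get("contentTypes"),
}
```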
3 changes: 3 additions & 0 deletions lib/chatbot-api/schema/schema.graphql
@@ -47,6 +47,7 @@ input CalculateEmbeddingsInput {
type CrawlerProperties @aws_cognito_user_pools {
followLinks: Boolean
limit: Int
contentTypes: [String!]!
}

type CrossEncoderData @aws_cognito_user_pools {
@@ -190,6 +191,7 @@ input RssFeedInput {
limit: Int!
title: String
followLinks: Boolean!
contentTypes: [String!]!
}

input SemanticSearchInput {
@@ -261,6 +263,7 @@ input WebsiteInput {
address: String!
followLinks: Boolean!
limit: Int!
contentTypes: [String!]!
}

type Workspace @aws_cognito_user_pools {
2 changes: 1 addition & 1 deletion lib/shared/file-import-batch-job/requirements.txt
@@ -8,7 +8,7 @@ langchain==0.1.5
opensearch-py==2.3.1
psycopg2-binary==2.9.7
pgvector==0.2.2
pydantic==2.3.0
pydantic==2.4.0
urllib3<2
openai==0.28.0
beautifulsoup4==4.12.2
7 changes: 7 additions & 0 deletions lib/shared/layers/python-sdk/python/genai_core/documents.py
@@ -359,6 +359,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k
if "limit" in kwargs and "follow_links" in kwargs:
follow_links = kwargs["follow_links"]
limit = kwargs["limit"]
content_types = kwargs["content_types"]
response = documents_table.update_item(
Key={"workspace_id": workspace_id, "document_id": document_id},
UpdateExpression="SET #crawler_properties=:crawler_properties, updated_at=:timestampValue",
@@ -367,6 +368,7 @@
":crawler_properties": {
"follow_links": follow_links,
"limit": limit,
"content_types": content_types,
},
":timestampValue": timestamp,
},
@@ -479,6 +481,7 @@ def _process_document(
crawler_properties = kwargs["crawler_properties"]
follow_links = crawler_properties["follow_links"]
limit = crawler_properties["limit"]
content_types = crawler_properties["content_types"]

if document_sub_type == "sitemap":
follow_links = False
@@ -514,6 +517,7 @@
"processed_urls": [],
"follow_links": follow_links,
"limit": limit,
"content_types": content_types,
"done": False,
},
cls=genai_core.utils.json.CustomEncoder,
@@ -712,6 +716,9 @@ def batch_crawl_websites():
"limit": int(post["crawler_properties"]["M"]["limit"]["N"])
if "crawler_properties" in post
else 250,
"content_types": post["crawler_properties"]["M"]["content_types"]["L"]
if "crawler_properties" in post and "content_types" in post["crawler_properties"]["M"]
else ["text/html"],
},
)
set_status(workspace_id, document_id, "processed")
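Documents stored before this change have no `content_types` in their crawler properties, so the guard in `batch_crawl_websites` above defaults them to `["text/html"]` and existing crawlers keep their HTML-only behaviour. A standalone sketch of that fallback, with the raw DynamoDB item shape assumed:

```python
# Sketch of the content_types fallback; the sample DynamoDB item is an
# assumption (items are read in attribute-value form, hence the "M"/"N"/"L" keys).
post = {
    "crawler_properties": {
        "M": {
            "limit": {"N": "250"},
            # no "content_types": item predates this change
        }
    }
}

crawler_m = post.get("crawler_properties", {}).get("M", {})
content_types = (
    crawler_m["content_types"]["L"]
    if "content_types" in crawler_m
    else ["text/html"]  # preserves the previous HTML-only behaviour
)
print(content_types)  # ['text/html']
```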
43 changes: 33 additions & 10 deletions lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py
@@ -5,6 +5,8 @@
import requests
import genai_core.chunks
import genai_core.documents
import pdfplumber
import io
from typing import List
from bs4 import BeautifulSoup
from urllib.parse import urlparse
@@ -21,6 +23,7 @@ def crawl_urls(
processed_urls: List[str],
follow_links: bool,
limit: int,
content_types: List[str],
):
workspace_id = workspace["workspace_id"]
document_id = document["document_id"]
@@ -47,7 +50,7 @@
print(f"Processing url {document_sub_id}: {current_url}")

try:
content, local_links, _ = parse_url(current_url)
content, local_links, _ = parse_url(current_url, content_types)
except:
print(f"Failed to parse url: {current_url}")
continue
@@ -96,7 +99,7 @@
}


def parse_url(url: str):
def parse_url(url: str, content_types_supported: list):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
@@ -105,14 +108,34 @@
base_url = f"{root_url_parse.scheme}://{root_url_parse.netloc}"

response = requests.get(url, headers=headers, timeout=20)
if "text/html" not in response.headers["Content-Type"]:
raise Exception(
f"Invalid content type {response.headers['Content-Type']}")
soup = BeautifulSoup(response.content, "html.parser")
content = soup.get_text(separator=' ')
content = re.sub(r"[ \n]+", " ", content)

links = list(set([a["href"] for a in soup.find_all("a", href=True)]))
content_type = response.headers["Content-Type"]
links = []

if ("text/html" in content_type) and ("text/html" in content_types_supported):
soup = BeautifulSoup(response.content, "html.parser")
content = soup.get_text(separator=' ')
content = re.sub(r"[ \n]+", " ", content)
links = [a["href"] for a in soup.find_all("a", href=True)]

elif ("application/pdf" in content_type) and ("application/pdf" in content_types_supported):
pdf_bytes = response.content # Get the bytes content of the response
pdf_stream = io.BytesIO(pdf_bytes) # Create a BytesIO stream from the bytes
with pdfplumber.open(pdf_stream) as pdf:
content = []
for page in pdf.pages:
if page.extract_text():
content.append(page.extract_text().replace('\n', ' '))

# Extract links from annotations
annotations = page.annots
if annotations:
for annot in annotations:
if annot['uri']:
links.append(annot['uri'])
content = ' '.join(content)
else:
raise Exception(f"Unsupported content type {content_type} found at: {url}")

local_links = []
external_links = []

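The core of the change is the new `application/pdf` branch in `parse_url`. Below is a self-contained sketch that mirrors that branch and can be run on its own; the sample URL is an assumption, not something from this commit.

```python
# Standalone sketch mirroring the PDF branch of parse_url(): fetch a PDF,
# extract its text with pdfplumber, and collect URIs from link annotations.
import io

import pdfplumber
import requests


def fetch_pdf_text_and_links(url: str) -> tuple[str, list[str]]:
    response = requests.get(url, timeout=20)
    pages, links = [], []
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                pages.append(text.replace("\n", " "))
            for annot in page.annots or []:  # link annotations carry a "uri" key
                if annot.get("uri"):
                    links.append(annot["uri"])
    return " ".join(pages), links


if __name__ == "__main__":
    content, links = fetch_pdf_text_and_links("https://example.com/sample.pdf")
    print(content[:200], links)
```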
2 changes: 2 additions & 0 deletions lib/shared/web-crawler-batch-job/index.py
@@ -23,6 +23,7 @@ def main():
processed_urls = data["processed_urls"]
follow_links = data["follow_links"]
limit = data["limit"]
content_types = data["content_types"]

return genai_core.websites.crawler.crawl_urls(
workspace=workspace,
@@ -31,6 +32,7 @@
processed_urls=processed_urls,
follow_links=follow_links,
limit=limit,
content_types=content_types,
)

if __name__ == "__main__":
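For reference, `main()` reads these parameters from the crawl-state JSON written by `_process_document`. A sketch of the relevant slice of that payload, limited to keys visible in this commit, with assumed values:

```python
# Slice of the crawl-state object consumed by the batch job; only keys that
# appear in this commit are shown, and the values are assumptions.
data = {
    "processed_urls": [],
    "follow_links": True,
    "limit": 250,
    "content_types": ["text/html", "application/pdf"],
    "done": False,
}

content_types = data["content_types"]  # threaded through to crawl_urls()
```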
5 changes: 3 additions & 2 deletions lib/shared/web-crawler-batch-job/requirements.txt
@@ -8,12 +8,13 @@ langchain==0.1.5
opensearch-py==2.3.1
psycopg2-binary==2.9.7
pgvector==0.2.2
pydantic==2.3.0
pydantic==2.4.0
urllib3<2
openai==0.28.0
beautifulsoup4==4.12.2
requests==2.31.0
attrs==23.1.0
feedparser==6.0.10
aws_xray_sdk==2.12.1
defusedxml==0.7.1
defusedxml==0.7.1
pdfplumber==0.11.0
@@ -119,7 +119,8 @@ export class DocumentsClient {
sitemap: boolean,
address: string,
followLinks: boolean,
limit: number
limit: number,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<AddWebsiteMutation>>> {
const result = API.graphql<GraphQLQuery<AddWebsiteMutation>>({
query: addWebsite,
@@ -130,6 +131,7 @@
address,
followLinks,
limit,
contentTypes,
},
},
});
@@ -141,7 +143,8 @@
address: string,
title: string,
limit: number,
followLinks: boolean
followLinks: boolean,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<AddRssFeedMutation>>> {
const result = API.graphql<GraphQLQuery<AddRssFeedMutation>>({
query: addRssFeed,
Expand All @@ -152,6 +155,7 @@ export class DocumentsClient {
title,
limit,
followLinks,
contentTypes,
},
},
});
@@ -222,7 +226,8 @@
workspaceId: string,
feedId: string,
followLinks: boolean,
limit: number
limit: number,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<UpdateRssFeedMutation>>> {
const result = API.graphql<GraphQLQuery<UpdateRssFeedMutation>>({
query: addRssFeed,
@@ -232,6 +237,7 @@
documentId: feedId,
followLinks,
limit,
contentTypes,
},
},
});
@@ -8,8 +8,10 @@ import {
Input,
SpaceBetween,
Toggle,
Multiselect,
} from "@cloudscape-design/components";
import { AddDataData } from "./types";
import { AddDataData, SelectOption, multiselectOptions } from "./types";
import { generateSelectedOptions } from "./utils";
import { useForm } from "../../../common/hooks/use-form";
import { useContext, useState } from "react";
import { AppContext } from "../../../common/app-context";
@@ -31,6 +33,7 @@ interface AddRssSubscriptionData {
rssFeedTitle: string;
linkLimit: number;
followLinks: boolean;
contentTypes: (string | undefined)[];
}

export default function AddRssSubscription(props: AddRssSubscriptionProps) {
@@ -46,6 +49,7 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) {
rssFeedTitle: "",
linkLimit: 250,
followLinks: true,
contentTypes: ["text/html"],
};
},
validate: (form) => {
@@ -77,13 +81,15 @@
setGlobalError(undefined);

const apiClient = new ApiClient(appContext);
const contentTypesToUse = data.contentTypes.filter((ct): ct is string => ct !== undefined);
try {
await apiClient.documents.addRssFeedSubscription(
props.data.workspace.value,
data.rssFeedUrl,
data.rssFeedTitle,
data.linkLimit,
data.followLinks
data.followLinks,
contentTypesToUse
);

setFlashbarItem({
@@ -109,6 +115,20 @@
props.setSubmitting(false);
};

const handleContentTypeChange = (selectedOptions: ReadonlyArray<SelectOption>) => {
const options: SelectOption[] = selectedOptions.map(option => {
if (option.value === undefined) {
throw new Error(`Option value cannot be undefined`);
}
return {
label: option.label,
value: option.value,
description: option.description
};
});
onChange({ contentTypes: options.map(option => option.value) });
};

const hasReadyWorkspace =
typeof props.data.workspace?.value !== "undefined" &&
typeof props.selectedWorkspace !== "undefined" &&
@@ -191,6 +211,18 @@
}
/>
</FormField>
<FormField
label="Enabled Content Types"
errorText={errors.contentTypes}
description="Content Types to Enable for crawlingl"
>
<Multiselect
disabled={props.submitting}
selectedOptions={generateSelectedOptions(data.contentTypes)}
options={multiselectOptions}
onChange={({ detail }) => handleContentTypeChange(detail.selectedOptions)}
/>
</FormField>
</SpaceBetween>
</Container>
{flashbarItem !== null && <Flashbar items={[flashbarItem]} />}