From 599c59e0b02607370f565db8d50ebbbe4b627872 Mon Sep 17 00:00:00 2001 From: Rob-Powell <7034920+Rob-Powell@users.noreply.github.com> Date: Thu, 2 May 2024 03:03:19 +0000 Subject: [PATCH] feat: enable web scraping to parse and save pdf content --- .../functions/api-handler/routes/documents.py | 8 +++ lib/chatbot-api/schema/schema.graphql | 3 ++ .../file-import-batch-job/requirements.txt | 2 +- .../python-sdk/python/genai_core/documents.py | 7 +++ .../python/genai_core/websites/crawler.py | 43 ++++++++++++---- lib/shared/web-crawler-batch-job/index.py | 2 + .../web-crawler-batch-job/requirements.txt | 5 +- .../src/common/api-client/documents-client.ts | 12 +++-- .../rag/add-data/add-rss-subscription.tsx | 36 ++++++++++++- .../src/pages/rag/add-data/crawl-website.tsx | 40 ++++++++++++++- .../react-app/src/pages/rag/add-data/types.ts | 21 ++++++++ .../react-app/src/pages/rag/add-data/utils.ts | 9 ++++ .../src/pages/rag/workspace/rss-feed.tsx | 50 ++++++++++++++++++- 13 files changed, 217 insertions(+), 21 deletions(-) create mode 100644 lib/user-interface/react-app/src/pages/rag/add-data/utils.ts diff --git a/lib/chatbot-api/functions/api-handler/routes/documents.py b/lib/chatbot-api/functions/api-handler/routes/documents.py index 26c61fd0..9e2c5254 100644 --- a/lib/chatbot-api/functions/api-handler/routes/documents.py +++ b/lib/chatbot-api/functions/api-handler/routes/documents.py @@ -35,6 +35,7 @@ class WebsiteDocumentRequest(BaseModel): address: str followLinks: bool limit: int + contentTypes: Optional[list] class RssFeedDocumentRequest(BaseModel): @@ -44,12 +45,14 @@ class RssFeedDocumentRequest(BaseModel): limit: int title: Optional[str] followLinks: bool + contentTypes: Optional[list] class RssFeedCrawlerUpdateRequest(BaseModel): documentType: str followLinks: bool limit: int + contentTypes: Optional[str] class ListDocumentsRequest(BaseModel): @@ -237,6 +240,7 @@ def add_website(input: dict): crawler_properties={ "follow_links": request.followLinks, "limit": limit, + "content_types": request.contentTypes, }, ) @@ -263,6 +267,7 @@ def add_rss_feed( crawler_properties={ "follow_links": request.followLinks, "limit": request.limit, + "content_types": request.contentTypes, }, ) @@ -282,6 +287,7 @@ def update_rss_feed(input: dict): document_type="rssfeed", follow_links=request.followLinks, limit=request.limit, + content_types=request.contentTypes, ) return { "workspaceId": result["workspace_id"], @@ -295,6 +301,7 @@ def _convert_document(document: dict): document["crawler_properties"] = { "followLinks": document["crawler_properties"]["follow_links"], "limit": document["crawler_properties"]["limit"], + "contentTypes": document["crawler_properties"]["content_types"], } return { "id": document["document_id"], @@ -315,6 +322,7 @@ def _convert_document(document: dict): "crawlerProperties": { "followLinks": document.get("crawler_properties").get("follow_links", None), "limit": document.get("crawler_properties").get("limit", None), + "contentTypes": document.get("crawler_properties").get("content_types", None), } if document.get("crawler_properties", None) != None else None, diff --git a/lib/chatbot-api/schema/schema.graphql b/lib/chatbot-api/schema/schema.graphql index f844063a..a484c5ea 100644 --- a/lib/chatbot-api/schema/schema.graphql +++ b/lib/chatbot-api/schema/schema.graphql @@ -47,6 +47,7 @@ input CalculateEmbeddingsInput { type CrawlerProperties @aws_cognito_user_pools { followLinks: Boolean limit: Int + contentTypes: [String!]! 
} type CrossEncoderData @aws_cognito_user_pools { @@ -190,6 +191,7 @@ input RssFeedInput { limit: Int! title: String followLinks: Boolean! + contentTypes: [String!]! } input SemanticSearchInput { @@ -261,6 +263,7 @@ input WebsiteInput { address: String! followLinks: Boolean! limit: Int! + contentTypes: [String!]! } type Workspace @aws_cognito_user_pools { diff --git a/lib/shared/file-import-batch-job/requirements.txt b/lib/shared/file-import-batch-job/requirements.txt index fa606e51..e74adc2b 100644 --- a/lib/shared/file-import-batch-job/requirements.txt +++ b/lib/shared/file-import-batch-job/requirements.txt @@ -8,7 +8,7 @@ langchain==0.1.5 opensearch-py==2.3.1 psycopg2-binary==2.9.7 pgvector==0.2.2 -pydantic==2.3.0 +pydantic==2.4.0 urllib3<2 openai==0.28.0 beautifulsoup4==4.12.2 diff --git a/lib/shared/layers/python-sdk/python/genai_core/documents.py b/lib/shared/layers/python-sdk/python/genai_core/documents.py index 8f2f96e1..e7c153d1 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/documents.py +++ b/lib/shared/layers/python-sdk/python/genai_core/documents.py @@ -359,6 +359,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k if "limit" in kwargs and "follow_links" in kwargs: follow_links = kwargs["follow_links"] limit = kwargs["limit"] + content_types = kwargs["content_types"] response = documents_table.update_item( Key={"workspace_id": workspace_id, "document_id": document_id}, UpdateExpression="SET #crawler_properties=:crawler_properties, updated_at=:timestampValue", @@ -367,6 +368,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k ":crawler_properties": { "follow_links": follow_links, "limit": limit, + "content_types": content_types, }, ":timestampValue": timestamp, }, @@ -479,6 +481,7 @@ def _process_document( crawler_properties = kwargs["crawler_properties"] follow_links = crawler_properties["follow_links"] limit = crawler_properties["limit"] + content_types = crawler_properties["content_types"] if document_sub_type == "sitemap": follow_links = False @@ -514,6 +517,7 @@ def _process_document( "processed_urls": [], "follow_links": follow_links, "limit": limit, + "content_types": content_types, "done": False, }, cls=genai_core.utils.json.CustomEncoder, @@ -712,6 +716,9 @@ def batch_crawl_websites(): "limit": int(post["crawler_properties"]["M"]["limit"]["N"]) if "crawler_properties" in post else 250, + "content_types": post["crawler_properties"]["M"]["content_types"]["L"] + if "crawler_properties" in post and "content_types" in post["crawler_properties"]["M"] + else ["text/html"], }, ) set_status(workspace_id, document_id, "processed") diff --git a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py index d0a94b4f..eceb2371 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py +++ b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py @@ -5,6 +5,8 @@ import requests import genai_core.chunks import genai_core.documents +import pdfplumber +import io from typing import List from bs4 import BeautifulSoup from urllib.parse import urlparse @@ -21,6 +23,7 @@ def crawl_urls( processed_urls: List[str], follow_links: bool, limit: int, + content_types: List[str], ): workspace_id = workspace["workspace_id"] document_id = document["document_id"] @@ -47,7 +50,7 @@ def crawl_urls( print(f"Processing url {document_sub_id}: {current_url}") try: - content, local_links, _ = parse_url(current_url) + 
content, local_links, _ = parse_url(current_url, content_types) except: print(f"Failed to parse url: {current_url}") continue @@ -96,7 +99,7 @@ def crawl_urls( } -def parse_url(url: str): +def parse_url(url: str, content_types_supported: list): headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" } @@ -105,14 +108,34 @@ def parse_url(url: str): base_url = f"{root_url_parse.scheme}://{root_url_parse.netloc}" response = requests.get(url, headers=headers, timeout=20) - if "text/html" not in response.headers["Content-Type"]: - raise Exception( - f"Invalid content type {response.headers['Content-Type']}") - soup = BeautifulSoup(response.content, "html.parser") - content = soup.get_text(separator=' ') - content = re.sub(r"[ \n]+", " ", content) - - links = list(set([a["href"] for a in soup.find_all("a", href=True)])) + content_type = response.headers["Content-Type"] + links = [] + + if ("text/html" in content_type) and ("text/html" in content_types_supported): + soup = BeautifulSoup(response.content, "html.parser") + content = soup.get_text(separator=' ') + content = re.sub(r"[ \n]+", " ", content) + links = [a["href"] for a in soup.find_all("a", href=True)] + + elif ("application/pdf" in content_type) and ("application/pdf" in content_types_supported): + pdf_bytes = response.content # Get the bytes content of the response + pdf_stream = io.BytesIO(pdf_bytes) # Create a BytesIO stream from the bytes + with pdfplumber.open(pdf_stream) as pdf: + content = [] + for page in pdf.pages: + if page.extract_text(): + content.append(page.extract_text().replace('\n', ' ')) + + # Extract links from annotations + annotations = page.annots + if annotations: + for annot in annotations: + if annot['uri']: + links.append(annot['uri']) + content = ' '.join(content) + else: + raise Exception(f"Unsupported content type {content_type} found at: {url}") + local_links = [] external_links = [] diff --git a/lib/shared/web-crawler-batch-job/index.py b/lib/shared/web-crawler-batch-job/index.py index 8d322de4..7d3ad324 100644 --- a/lib/shared/web-crawler-batch-job/index.py +++ b/lib/shared/web-crawler-batch-job/index.py @@ -23,6 +23,7 @@ def main(): processed_urls = data["processed_urls"] follow_links = data["follow_links"] limit = data["limit"] + content_types = data["content_types"] return genai_core.websites.crawler.crawl_urls( workspace=workspace, @@ -31,6 +32,7 @@ def main(): processed_urls=processed_urls, follow_links=follow_links, limit=limit, + content_types=content_types, ) if __name__ == "__main__": diff --git a/lib/shared/web-crawler-batch-job/requirements.txt b/lib/shared/web-crawler-batch-job/requirements.txt index 5af0bf18..d513c491 100644 --- a/lib/shared/web-crawler-batch-job/requirements.txt +++ b/lib/shared/web-crawler-batch-job/requirements.txt @@ -8,7 +8,7 @@ langchain==0.1.5 opensearch-py==2.3.1 psycopg2-binary==2.9.7 pgvector==0.2.2 -pydantic==2.3.0 +pydantic==2.4.0 urllib3<2 openai==0.28.0 beautifulsoup4==4.12.2 @@ -16,4 +16,5 @@ requests==2.31.0 attrs==23.1.0 feedparser==6.0.10 aws_xray_sdk==2.12.1 -defusedxml==0.7.1 \ No newline at end of file +defusedxml==0.7.1 +pdfplumber==0.11.0 \ No newline at end of file diff --git a/lib/user-interface/react-app/src/common/api-client/documents-client.ts b/lib/user-interface/react-app/src/common/api-client/documents-client.ts index ac0b440f..59d3c050 100644 --- a/lib/user-interface/react-app/src/common/api-client/documents-client.ts +++ 
b/lib/user-interface/react-app/src/common/api-client/documents-client.ts @@ -119,7 +119,8 @@ export class DocumentsClient { sitemap: boolean, address: string, followLinks: boolean, - limit: number + limit: number, + contentTypes: string[] ): Promise>> { const result = API.graphql>({ query: addWebsite, @@ -130,6 +131,7 @@ export class DocumentsClient { address, followLinks, limit, + contentTypes, }, }, }); @@ -141,7 +143,8 @@ export class DocumentsClient { address: string, title: string, limit: number, - followLinks: boolean + followLinks: boolean, + contentTypes: string[] ): Promise>> { const result = API.graphql>({ query: addRssFeed, @@ -152,6 +155,7 @@ export class DocumentsClient { title, limit, followLinks, + contentTypes, }, }, }); @@ -222,7 +226,8 @@ export class DocumentsClient { workspaceId: string, feedId: string, followLinks: boolean, - limit: number + limit: number, + contentTypes: string[] ): Promise>> { const result = API.graphql>({ query: addRssFeed, @@ -232,6 +237,7 @@ export class DocumentsClient { documentId: feedId, followLinks, limit, + contentTypes, }, }, }); diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx b/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx index ebf3c7bf..8b0a91e6 100644 --- a/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx +++ b/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx @@ -8,8 +8,10 @@ import { Input, SpaceBetween, Toggle, + Multiselect, } from "@cloudscape-design/components"; -import { AddDataData } from "./types"; +import { AddDataData, SelectOption, multiselectOptions } from "./types"; +import { generateSelectedOptions } from "./utils"; import { useForm } from "../../../common/hooks/use-form"; import { useContext, useState } from "react"; import { AppContext } from "../../../common/app-context"; @@ -31,6 +33,7 @@ interface AddRssSubscriptionData { rssFeedTitle: string; linkLimit: number; followLinks: boolean; + contentTypes: (string | undefined)[]; } export default function AddRssSubscription(props: AddRssSubscriptionProps) { @@ -46,6 +49,7 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) { rssFeedTitle: "", linkLimit: 250, followLinks: true, + contentTypes: ["text/html"], }; }, validate: (form) => { @@ -77,13 +81,15 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) { setGlobalError(undefined); const apiClient = new ApiClient(appContext); + const contentTypesToUse = data.contentTypes.filter((ct): ct is string => ct !== undefined); try { await apiClient.documents.addRssFeedSubscription( props.data.workspace.value, data.rssFeedUrl, data.rssFeedTitle, data.linkLimit, - data.followLinks + data.followLinks, + contentTypesToUse ); setFlashbarItem({ @@ -109,6 +115,20 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) { props.setSubmitting(false); }; + const handleContentTypeChange = (selectedOptions: ReadonlyArray) => { + const options: SelectOption[] = selectedOptions.map(option => { + if (option.value === undefined) { + throw new Error(`Option value cannot be undefined`); + } + return { + label: option.label, + value: option.value, + description: option.description + }; + }); + onChange({ contentTypes: options.map(option => option.value) }); + }; + const hasReadyWorkspace = typeof props.data.workspace?.value !== "undefined" && typeof props.selectedWorkspace !== "undefined" && @@ -191,6 +211,18 @@ export default function 
AddRssSubscription(props: AddRssSubscriptionProps) { } /> + + handleContentTypeChange(detail.selectedOptions)} + /> + {flashbarItem !== null && } diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx b/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx index 3547be90..b86a1a7d 100644 --- a/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx +++ b/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx @@ -9,8 +9,10 @@ import { SegmentedControl, SpaceBetween, Toggle, + Multiselect, } from "@cloudscape-design/components"; -import { AddDataData } from "./types"; +import { AddDataData, SelectOption, multiselectOptions } from "./types"; +import { generateSelectedOptions } from "./utils"; import { useForm } from "../../../common/hooks/use-form"; import { useContext, useState } from "react"; import { AppContext } from "../../../common/app-context"; @@ -33,6 +35,7 @@ interface CrawlWebisteData { sitemapUrl: string; followLinks: boolean; limit: number; + contentTypes: (string | undefined)[]; } export default function CrawlWebsite(props: CrawlWebsiteProps) { @@ -49,6 +52,7 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { sitemapUrl: "", followLinks: true, limit: 250, + contentTypes: ["text/html"], }; }, validate: (form) => { @@ -73,6 +77,10 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { if (form.limit < 1 || form.limit > 1000) { errors.limit = "Page limit should be between 1 and 1000"; } + + if (form.contentTypes.length === 0) { + errors.contentTypes = "At least one content type must be selected."; + } return errors; }, @@ -91,13 +99,15 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { const apiClient = new ApiClient(appContext); const isSitemap = data.urlType === "sitemap"; + const contentTypesToUse = data.contentTypes.filter((ct): ct is string => ct !== undefined); try { await apiClient.documents.addWebsiteDocument( props.data.workspace.value, isSitemap, isSitemap ? 
data.sitemapUrl : data.websiteUrl, data.followLinks, - data.limit + data.limit, + contentTypesToUse ); setFlashbarItem({ @@ -121,6 +131,20 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { props.setSubmitting(false); }; + + const handleContentTypeChange = (selectedOptions: ReadonlyArray) => { + const options: SelectOption[] = selectedOptions.map(option => { + if (option.value === undefined) { + throw new Error(`Option value cannot be undefined`); + } + return { + label: option.label, + value: option.value, + description: option.description + }; + }); + onChange({ contentTypes: options.map(option => option.value) }); + }; const hasReadyWorkspace = typeof props.data.workspace?.value !== "undefined" && @@ -220,6 +244,18 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { } /> + + handleContentTypeChange(detail.selectedOptions)} + /> + {flashbarItem !== null && } diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/types.ts b/lib/user-interface/react-app/src/pages/rag/add-data/types.ts index 95d4ede6..5aa535c1 100644 --- a/lib/user-interface/react-app/src/pages/rag/add-data/types.ts +++ b/lib/user-interface/react-app/src/pages/rag/add-data/types.ts @@ -4,3 +4,24 @@ export interface AddDataData { workspace: SelectProps.Option | null; query: string; } + +export interface SelectOption { + label?: string; + value?: string | undefined; + description?: string; +} + +export const ContentTypesOptionsConfig: { [key: string]: { description: string } } = { + "text/html": { + description: "Crawl Websites" + }, + "application/pdf": { + description: "Crawl PDFs" + } +}; + +export const multiselectOptions: SelectOption[] = Object.entries(ContentTypesOptionsConfig).map(([value, { description }]) => ({ + label: value, + value: value, + description: description +})); diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/utils.ts b/lib/user-interface/react-app/src/pages/rag/add-data/utils.ts new file mode 100644 index 00000000..70966e74 --- /dev/null +++ b/lib/user-interface/react-app/src/pages/rag/add-data/utils.ts @@ -0,0 +1,9 @@ +import { SelectOption, ContentTypesOptionsConfig } from "./types"; + +export function generateSelectedOptions(contentTypes: (string | undefined)[]): SelectOption[] { + return contentTypes.map((ct): SelectOption => ({ + label: ct || "text/html", + value: ct || "text/html", + description: ContentTypesOptionsConfig[ct as keyof typeof ContentTypesOptionsConfig]?.description || "Default Description" + })); +} diff --git a/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx b/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx index 1504f2be..39c2b36c 100644 --- a/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx +++ b/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx @@ -17,10 +17,13 @@ import { StatusIndicator, Table, Toggle, + Multiselect, } from "@cloudscape-design/components"; import useOnFollow from "../../../common/hooks/use-on-follow"; import BaseAppLayout from "../../../components/base-app-layout"; import { useNavigate, useParams } from "react-router-dom"; +import { multiselectOptions, SelectOption } from "../add-data/types"; +import { generateSelectedOptions } from "../add-data/utils"; import { useCallback, useContext, useEffect, useState } from "react"; import { DocumentSubscriptionStatus } from "../../../common/types"; import { AppContext } from "../../../common/app-context"; @@ -385,6 +388,7 @@ export default function RssFeed() { data={{ followLinks: 
rssCrawlerFollowLinks, limit: rssCrawlerLimit, + contentTypes: ["text/html"], }} documentId={rssSubscription?.id ?? ""} workspaceId={workspace?.id ?? ""} @@ -538,6 +542,7 @@ export function RssFeedPostUrlPopover(props: RssFeedPostUrlPopoverProps) { const item = props.item; const followLinks = item.crawlerProperties?.followLinks; const limit = item.crawlerProperties?.limit; + const contentTypes = item.crawlerProperties?.contentTypes; return ( )} +
+          <div>
+            <Box variant="awsui-key-label">Content Types supported</Box>
+            <div>{contentTypes}</div>
+          </div>
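
For reference, the core of this change is the new content-type-aware branch in parse_url. The sketch below restates that logic as a standalone function so it can be tried outside the crawler; the standalone function name and the defensive annot.get("uri") lookup are illustrative assumptions rather than part of the diff, and it relies on requests, beautifulsoup4 and pdfplumber==0.11.0 as pinned in the updated requirements.

    # Minimal standalone sketch of the content-type-aware fetch/parse step.
    import io
    import re

    import pdfplumber
    import requests
    from bs4 import BeautifulSoup

    def parse_url_sketch(url: str, content_types_supported: list):
        response = requests.get(url, timeout=20)
        content_type = response.headers.get("Content-Type", "")
        links = []

        if "text/html" in content_type and "text/html" in content_types_supported:
            # HTML: strip markup, collapse whitespace, collect hrefs for further crawling.
            soup = BeautifulSoup(response.content, "html.parser")
            content = re.sub(r"[ \n]+", " ", soup.get_text(separator=" "))
            links = [a["href"] for a in soup.find_all("a", href=True)]
        elif "application/pdf" in content_type and "application/pdf" in content_types_supported:
            # PDF: extract page text with pdfplumber and pull link targets
            # out of the page annotations.
            pages = []
            with pdfplumber.open(io.BytesIO(response.content)) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        pages.append(text.replace("\n", " "))
                    for annot in page.annots or []:
                        if annot.get("uri"):  # hedge: the patch indexes annot["uri"] directly
                            links.append(annot["uri"])
            content = " ".join(pages)
        else:
            raise Exception(f"Unsupported content type {content_type} found at: {url}")

        return content, links

As in the patch, unsupported content types raise rather than being skipped silently, so crawl_urls can log the failed URL and continue, and documents created before this change fall back to ["text/html"] in batch_crawl_websites.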