From 599c59e0b02607370f565db8d50ebbbe4b627872 Mon Sep 17 00:00:00 2001 From: Rob-Powell <7034920+Rob-Powell@users.noreply.github.com> Date: Thu, 2 May 2024 03:03:19 +0000 Subject: [PATCH] feat: enable web scraping to parse and save pdf content --- .../functions/api-handler/routes/documents.py | 8 +++ lib/chatbot-api/schema/schema.graphql | 3 ++ .../file-import-batch-job/requirements.txt | 2 +- .../python-sdk/python/genai_core/documents.py | 7 +++ .../python/genai_core/websites/crawler.py | 43 ++++++++++++---- lib/shared/web-crawler-batch-job/index.py | 2 + .../web-crawler-batch-job/requirements.txt | 5 +- .../src/common/api-client/documents-client.ts | 12 +++-- .../rag/add-data/add-rss-subscription.tsx | 36 ++++++++++++- .../src/pages/rag/add-data/crawl-website.tsx | 40 ++++++++++++++- .../react-app/src/pages/rag/add-data/types.ts | 21 ++++++++ .../react-app/src/pages/rag/add-data/utils.ts | 9 ++++ .../src/pages/rag/workspace/rss-feed.tsx | 50 ++++++++++++++++++- 13 files changed, 217 insertions(+), 21 deletions(-) create mode 100644 lib/user-interface/react-app/src/pages/rag/add-data/utils.ts diff --git a/lib/chatbot-api/functions/api-handler/routes/documents.py b/lib/chatbot-api/functions/api-handler/routes/documents.py index 26c61fd0..9e2c5254 100644 --- a/lib/chatbot-api/functions/api-handler/routes/documents.py +++ b/lib/chatbot-api/functions/api-handler/routes/documents.py @@ -35,6 +35,7 @@ class WebsiteDocumentRequest(BaseModel): address: str followLinks: bool limit: int + contentTypes: Optional[list] class RssFeedDocumentRequest(BaseModel): @@ -44,12 +45,14 @@ class RssFeedDocumentRequest(BaseModel): limit: int title: Optional[str] followLinks: bool + contentTypes: Optional[list] class RssFeedCrawlerUpdateRequest(BaseModel): documentType: str followLinks: bool limit: int + contentTypes: Optional[str] class ListDocumentsRequest(BaseModel): @@ -237,6 +240,7 @@ def add_website(input: dict): crawler_properties={ "follow_links": request.followLinks, "limit": limit, + "content_types": request.contentTypes, }, ) @@ -263,6 +267,7 @@ def add_rss_feed( crawler_properties={ "follow_links": request.followLinks, "limit": request.limit, + "content_types": request.contentTypes, }, ) @@ -282,6 +287,7 @@ def update_rss_feed(input: dict): document_type="rssfeed", follow_links=request.followLinks, limit=request.limit, + content_types=request.contentTypes, ) return { "workspaceId": result["workspace_id"], @@ -295,6 +301,7 @@ def _convert_document(document: dict): document["crawler_properties"] = { "followLinks": document["crawler_properties"]["follow_links"], "limit": document["crawler_properties"]["limit"], + "contentTypes": document["crawler_properties"]["content_types"], } return { "id": document["document_id"], @@ -315,6 +322,7 @@ def _convert_document(document: dict): "crawlerProperties": { "followLinks": document.get("crawler_properties").get("follow_links", None), "limit": document.get("crawler_properties").get("limit", None), + "contentTypes": document.get("crawler_properties").get("content_types", None), } if document.get("crawler_properties", None) != None else None, diff --git a/lib/chatbot-api/schema/schema.graphql b/lib/chatbot-api/schema/schema.graphql index f844063a..a484c5ea 100644 --- a/lib/chatbot-api/schema/schema.graphql +++ b/lib/chatbot-api/schema/schema.graphql @@ -47,6 +47,7 @@ input CalculateEmbeddingsInput { type CrawlerProperties @aws_cognito_user_pools { followLinks: Boolean limit: Int + contentTypes: [String!]! 
} type CrossEncoderData @aws_cognito_user_pools { @@ -190,6 +191,7 @@ input RssFeedInput { limit: Int! title: String followLinks: Boolean! + contentTypes: [String!]! } input SemanticSearchInput { @@ -261,6 +263,7 @@ input WebsiteInput { address: String! followLinks: Boolean! limit: Int! + contentTypes: [String!]! } type Workspace @aws_cognito_user_pools { diff --git a/lib/shared/file-import-batch-job/requirements.txt b/lib/shared/file-import-batch-job/requirements.txt index fa606e51..e74adc2b 100644 --- a/lib/shared/file-import-batch-job/requirements.txt +++ b/lib/shared/file-import-batch-job/requirements.txt @@ -8,7 +8,7 @@ langchain==0.1.5 opensearch-py==2.3.1 psycopg2-binary==2.9.7 pgvector==0.2.2 -pydantic==2.3.0 +pydantic==2.4.0 urllib3<2 openai==0.28.0 beautifulsoup4==4.12.2 diff --git a/lib/shared/layers/python-sdk/python/genai_core/documents.py b/lib/shared/layers/python-sdk/python/genai_core/documents.py index 8f2f96e1..e7c153d1 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/documents.py +++ b/lib/shared/layers/python-sdk/python/genai_core/documents.py @@ -359,6 +359,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k if "limit" in kwargs and "follow_links" in kwargs: follow_links = kwargs["follow_links"] limit = kwargs["limit"] + content_types = kwargs["content_types"] response = documents_table.update_item( Key={"workspace_id": workspace_id, "document_id": document_id}, UpdateExpression="SET #crawler_properties=:crawler_properties, updated_at=:timestampValue", @@ -367,6 +368,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k ":crawler_properties": { "follow_links": follow_links, "limit": limit, + "content_types": content_types, }, ":timestampValue": timestamp, }, @@ -479,6 +481,7 @@ def _process_document( crawler_properties = kwargs["crawler_properties"] follow_links = crawler_properties["follow_links"] limit = crawler_properties["limit"] + content_types = crawler_properties["content_types"] if document_sub_type == "sitemap": follow_links = False @@ -514,6 +517,7 @@ def _process_document( "processed_urls": [], "follow_links": follow_links, "limit": limit, + "content_types": content_types, "done": False, }, cls=genai_core.utils.json.CustomEncoder, @@ -712,6 +716,9 @@ def batch_crawl_websites(): "limit": int(post["crawler_properties"]["M"]["limit"]["N"]) if "crawler_properties" in post else 250, + "content_types": post["crawler_properties"]["M"]["content_types"]["L"] + if "crawler_properties" in post and "content_types" in post["crawler_properties"]["M"] + else ["text/html"], }, ) set_status(workspace_id, document_id, "processed") diff --git a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py index d0a94b4f..eceb2371 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py +++ b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py @@ -5,6 +5,8 @@ import requests import genai_core.chunks import genai_core.documents +import pdfplumber +import io from typing import List from bs4 import BeautifulSoup from urllib.parse import urlparse @@ -21,6 +23,7 @@ def crawl_urls( processed_urls: List[str], follow_links: bool, limit: int, + content_types: List[str], ): workspace_id = workspace["workspace_id"] document_id = document["document_id"] @@ -47,7 +50,7 @@ def crawl_urls( print(f"Processing url {document_sub_id}: {current_url}") try: - content, local_links, _ = parse_url(current_url) + 
content, local_links, _ = parse_url(current_url, content_types) except: print(f"Failed to parse url: {current_url}") continue @@ -96,7 +99,7 @@ def crawl_urls( } -def parse_url(url: str): +def parse_url(url: str, content_types_supported: list): headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" } @@ -105,14 +108,34 @@ def parse_url(url: str): base_url = f"{root_url_parse.scheme}://{root_url_parse.netloc}" response = requests.get(url, headers=headers, timeout=20) - if "text/html" not in response.headers["Content-Type"]: - raise Exception( - f"Invalid content type {response.headers['Content-Type']}") - soup = BeautifulSoup(response.content, "html.parser") - content = soup.get_text(separator=' ') - content = re.sub(r"[ \n]+", " ", content) - - links = list(set([a["href"] for a in soup.find_all("a", href=True)])) + content_type = response.headers["Content-Type"] + links = [] + + if ("text/html" in content_type) and ("text/html" in content_types_supported): + soup = BeautifulSoup(response.content, "html.parser") + content = soup.get_text(separator=' ') + content = re.sub(r"[ \n]+", " ", content) + links = [a["href"] for a in soup.find_all("a", href=True)] + + elif ("application/pdf" in content_type) and ("application/pdf" in content_types_supported): + pdf_bytes = response.content # Get the bytes content of the response + pdf_stream = io.BytesIO(pdf_bytes) # Create a BytesIO stream from the bytes + with pdfplumber.open(pdf_stream) as pdf: + content = [] + for page in pdf.pages: + if page.extract_text(): + content.append(page.extract_text().replace('\n', ' ')) + + # Extract links from annotations + annotations = page.annots + if annotations: + for annot in annotations: + if annot['uri']: + links.append(annot['uri']) + content = ' '.join(content) + else: + raise Exception(f"Unsupported content type {content_type} found at: {url}") + local_links = [] external_links = [] diff --git a/lib/shared/web-crawler-batch-job/index.py b/lib/shared/web-crawler-batch-job/index.py index 8d322de4..7d3ad324 100644 --- a/lib/shared/web-crawler-batch-job/index.py +++ b/lib/shared/web-crawler-batch-job/index.py @@ -23,6 +23,7 @@ def main(): processed_urls = data["processed_urls"] follow_links = data["follow_links"] limit = data["limit"] + content_types = data["content_types"] return genai_core.websites.crawler.crawl_urls( workspace=workspace, @@ -31,6 +32,7 @@ def main(): processed_urls=processed_urls, follow_links=follow_links, limit=limit, + content_types=content_types, ) if __name__ == "__main__": diff --git a/lib/shared/web-crawler-batch-job/requirements.txt b/lib/shared/web-crawler-batch-job/requirements.txt index 5af0bf18..d513c491 100644 --- a/lib/shared/web-crawler-batch-job/requirements.txt +++ b/lib/shared/web-crawler-batch-job/requirements.txt @@ -8,7 +8,7 @@ langchain==0.1.5 opensearch-py==2.3.1 psycopg2-binary==2.9.7 pgvector==0.2.2 -pydantic==2.3.0 +pydantic==2.4.0 urllib3<2 openai==0.28.0 beautifulsoup4==4.12.2 @@ -16,4 +16,5 @@ requests==2.31.0 attrs==23.1.0 feedparser==6.0.10 aws_xray_sdk==2.12.1 -defusedxml==0.7.1 \ No newline at end of file +defusedxml==0.7.1 +pdfplumber==0.11.0 \ No newline at end of file diff --git a/lib/user-interface/react-app/src/common/api-client/documents-client.ts b/lib/user-interface/react-app/src/common/api-client/documents-client.ts index ac0b440f..59d3c050 100644 --- a/lib/user-interface/react-app/src/common/api-client/documents-client.ts +++ 
b/lib/user-interface/react-app/src/common/api-client/documents-client.ts @@ -119,7 +119,8 @@ export class DocumentsClient { sitemap: boolean, address: string, followLinks: boolean, - limit: number + limit: number, + contentTypes: string[] ): Promise>> { const result = API.graphql>({ query: addWebsite, @@ -130,6 +131,7 @@ export class DocumentsClient { address, followLinks, limit, + contentTypes, }, }, }); @@ -141,7 +143,8 @@ export class DocumentsClient { address: string, title: string, limit: number, - followLinks: boolean + followLinks: boolean, + contentTypes: string[] ): Promise>> { const result = API.graphql>({ query: addRssFeed, @@ -152,6 +155,7 @@ export class DocumentsClient { title, limit, followLinks, + contentTypes, }, }, }); @@ -222,7 +226,8 @@ export class DocumentsClient { workspaceId: string, feedId: string, followLinks: boolean, - limit: number + limit: number, + contentTypes: string[] ): Promise>> { const result = API.graphql>({ query: addRssFeed, @@ -232,6 +237,7 @@ export class DocumentsClient { documentId: feedId, followLinks, limit, + contentTypes, }, }, }); diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx b/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx index ebf3c7bf..8b0a91e6 100644 --- a/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx +++ b/lib/user-interface/react-app/src/pages/rag/add-data/add-rss-subscription.tsx @@ -8,8 +8,10 @@ import { Input, SpaceBetween, Toggle, + Multiselect, } from "@cloudscape-design/components"; -import { AddDataData } from "./types"; +import { AddDataData, SelectOption, multiselectOptions } from "./types"; +import { generateSelectedOptions } from "./utils"; import { useForm } from "../../../common/hooks/use-form"; import { useContext, useState } from "react"; import { AppContext } from "../../../common/app-context"; @@ -31,6 +33,7 @@ interface AddRssSubscriptionData { rssFeedTitle: string; linkLimit: number; followLinks: boolean; + contentTypes: (string | undefined)[]; } export default function AddRssSubscription(props: AddRssSubscriptionProps) { @@ -46,6 +49,7 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) { rssFeedTitle: "", linkLimit: 250, followLinks: true, + contentTypes: ["text/html"], }; }, validate: (form) => { @@ -77,13 +81,15 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) { setGlobalError(undefined); const apiClient = new ApiClient(appContext); + const contentTypesToUse = data.contentTypes.filter((ct): ct is string => ct !== undefined); try { await apiClient.documents.addRssFeedSubscription( props.data.workspace.value, data.rssFeedUrl, data.rssFeedTitle, data.linkLimit, - data.followLinks + data.followLinks, + contentTypesToUse ); setFlashbarItem({ @@ -109,6 +115,20 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) { props.setSubmitting(false); }; + const handleContentTypeChange = (selectedOptions: ReadonlyArray) => { + const options: SelectOption[] = selectedOptions.map(option => { + if (option.value === undefined) { + throw new Error(`Option value cannot be undefined`); + } + return { + label: option.label, + value: option.value, + description: option.description + }; + }); + onChange({ contentTypes: options.map(option => option.value) }); + }; + const hasReadyWorkspace = typeof props.data.workspace?.value !== "undefined" && typeof props.selectedWorkspace !== "undefined" && @@ -191,6 +211,18 @@ export default function 
AddRssSubscription(props: AddRssSubscriptionProps) { } /> + + handleContentTypeChange(detail.selectedOptions)} + /> + {flashbarItem !== null && } diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx b/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx index 3547be90..b86a1a7d 100644 --- a/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx +++ b/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx @@ -9,8 +9,10 @@ import { SegmentedControl, SpaceBetween, Toggle, + Multiselect, } from "@cloudscape-design/components"; -import { AddDataData } from "./types"; +import { AddDataData, SelectOption, multiselectOptions } from "./types"; +import { generateSelectedOptions } from "./utils"; import { useForm } from "../../../common/hooks/use-form"; import { useContext, useState } from "react"; import { AppContext } from "../../../common/app-context"; @@ -33,6 +35,7 @@ interface CrawlWebisteData { sitemapUrl: string; followLinks: boolean; limit: number; + contentTypes: (string | undefined)[]; } export default function CrawlWebsite(props: CrawlWebsiteProps) { @@ -49,6 +52,7 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { sitemapUrl: "", followLinks: true, limit: 250, + contentTypes: ["text/html"], }; }, validate: (form) => { @@ -73,6 +77,10 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { if (form.limit < 1 || form.limit > 1000) { errors.limit = "Page limit should be between 1 and 1000"; } + + if (form.contentTypes.length === 0) { + errors.contentTypes = "At least one content type must be selected."; + } return errors; }, @@ -91,13 +99,15 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { const apiClient = new ApiClient(appContext); const isSitemap = data.urlType === "sitemap"; + const contentTypesToUse = data.contentTypes.filter((ct): ct is string => ct !== undefined); try { await apiClient.documents.addWebsiteDocument( props.data.workspace.value, isSitemap, isSitemap ? 
data.sitemapUrl : data.websiteUrl, data.followLinks, - data.limit + data.limit, + contentTypesToUse ); setFlashbarItem({ @@ -121,6 +131,20 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { props.setSubmitting(false); }; + + const handleContentTypeChange = (selectedOptions: ReadonlyArray) => { + const options: SelectOption[] = selectedOptions.map(option => { + if (option.value === undefined) { + throw new Error(`Option value cannot be undefined`); + } + return { + label: option.label, + value: option.value, + description: option.description + }; + }); + onChange({ contentTypes: options.map(option => option.value) }); + }; const hasReadyWorkspace = typeof props.data.workspace?.value !== "undefined" && @@ -220,6 +244,18 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { } /> + + handleContentTypeChange(detail.selectedOptions)} + /> + {flashbarItem !== null && } diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/types.ts b/lib/user-interface/react-app/src/pages/rag/add-data/types.ts index 95d4ede6..5aa535c1 100644 --- a/lib/user-interface/react-app/src/pages/rag/add-data/types.ts +++ b/lib/user-interface/react-app/src/pages/rag/add-data/types.ts @@ -4,3 +4,24 @@ export interface AddDataData { workspace: SelectProps.Option | null; query: string; } + +export interface SelectOption { + label?: string; + value?: string | undefined; + description?: string; +} + +export const ContentTypesOptionsConfig: { [key: string]: { description: string } } = { + "text/html": { + description: "Crawl Websites" + }, + "application/pdf": { + description: "Crawl PDFs" + } +}; + +export const multiselectOptions: SelectOption[] = Object.entries(ContentTypesOptionsConfig).map(([value, { description }]) => ({ + label: value, + value: value, + description: description +})); diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/utils.ts b/lib/user-interface/react-app/src/pages/rag/add-data/utils.ts new file mode 100644 index 00000000..70966e74 --- /dev/null +++ b/lib/user-interface/react-app/src/pages/rag/add-data/utils.ts @@ -0,0 +1,9 @@ +import { SelectOption, ContentTypesOptionsConfig } from "./types"; + +export function generateSelectedOptions(contentTypes: (string | undefined)[]): SelectOption[] { + return contentTypes.map((ct): SelectOption => ({ + label: ct || "text/html", + value: ct || "text/html", + description: ContentTypesOptionsConfig[ct as keyof typeof ContentTypesOptionsConfig]?.description || "Default Description" + })); +} diff --git a/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx b/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx index 1504f2be..39c2b36c 100644 --- a/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx +++ b/lib/user-interface/react-app/src/pages/rag/workspace/rss-feed.tsx @@ -17,10 +17,13 @@ import { StatusIndicator, Table, Toggle, + Multiselect, } from "@cloudscape-design/components"; import useOnFollow from "../../../common/hooks/use-on-follow"; import BaseAppLayout from "../../../components/base-app-layout"; import { useNavigate, useParams } from "react-router-dom"; +import { multiselectOptions, SelectOption } from "../add-data/types"; +import { generateSelectedOptions } from "../add-data/utils"; import { useCallback, useContext, useEffect, useState } from "react"; import { DocumentSubscriptionStatus } from "../../../common/types"; import { AppContext } from "../../../common/app-context"; @@ -385,6 +388,7 @@ export default function RssFeed() { data={{ followLinks: 
rssCrawlerFollowLinks, limit: rssCrawlerLimit, + contentTypes: ["text/html"], }} documentId={rssSubscription?.id ?? ""} workspaceId={workspace?.id ?? ""} @@ -538,6 +542,7 @@ export function RssFeedPostUrlPopover(props: RssFeedPostUrlPopoverProps) { const item = props.item; const followLinks = item.crawlerProperties?.followLinks; const limit = item.crawlerProperties?.limit; + const contentTypes = item.crawlerProperties?.contentTypes; return ( )} +
+          <div>
+            <Box variant="awsui-key-label">Content Types supported</Box>
+            <div>{contentTypes}</div>
+          </div>
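
For reference, the core of this change is the new content-type-aware branch in parse_url. The sketch below restates that logic as a standalone function so it can be tried outside the crawler; the standalone function name and the defensive annot.get("uri") lookup are illustrative assumptions rather than part of the diff, and it relies on requests, beautifulsoup4 and pdfplumber==0.11.0 as pinned in the updated requirements.

    # Minimal standalone sketch of the content-type-aware fetch/parse step.
    import io
    import re

    import pdfplumber
    import requests
    from bs4 import BeautifulSoup

    def parse_url_sketch(url: str, content_types_supported: list):
        response = requests.get(url, timeout=20)
        content_type = response.headers.get("Content-Type", "")
        links = []

        if "text/html" in content_type and "text/html" in content_types_supported:
            # HTML: strip markup, collapse whitespace, collect hrefs for further crawling.
            soup = BeautifulSoup(response.content, "html.parser")
            content = re.sub(r"[ \n]+", " ", soup.get_text(separator=" "))
            links = [a["href"] for a in soup.find_all("a", href=True)]
        elif "application/pdf" in content_type and "application/pdf" in content_types_supported:
            # PDF: extract page text with pdfplumber and pull link targets
            # out of the page annotations.
            pages = []
            with pdfplumber.open(io.BytesIO(response.content)) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        pages.append(text.replace("\n", " "))
                    for annot in page.annots or []:
                        if annot.get("uri"):  # hedge: the patch indexes annot["uri"] directly
                            links.append(annot["uri"])
            content = " ".join(pages)
        else:
            raise Exception(f"Unsupported content type {content_type} found at: {url}")

        return content, links

As in the patch, unsupported content types raise rather than being skipped silently, so crawl_urls can log the failed URL and continue, and documents created before this change fall back to ["text/html"] in batch_crawl_websites.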