From a960047e47f09531979c47aafef63b2f3c4321a6 Mon Sep 17 00:00:00 2001 From: Jacob Smith Date: Fri, 27 Feb 2026 10:55:54 -0800 Subject: [PATCH 1/5] AB#32008 Add Office document text extraction support (Word/Excel) # Conflicts: # applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs # applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj --- .../AI/TextExtractionService.cs | 167 ++++++++++++++++-- .../Unity.GrantManager.Application.csproj | 1 + 2 files changed, 152 insertions(+), 16 deletions(-) diff --git a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs index 3c2b3f2b3..8e7f3d41b 100644 --- a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs +++ b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs @@ -1,6 +1,10 @@ using Microsoft.Extensions.Logging; +using NPOI.SS.UserModel; +using NPOI.XWPF.UserModel; using System; +using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text; using System.Text.RegularExpressions; using System.Threading.Tasks; @@ -12,6 +16,12 @@ namespace Unity.GrantManager.AI public class TextExtractionService : ITextExtractionService, ITransientDependency { private const int MaxExtractedTextLength = 50000; + private const int MaxExcelSheets = 10; + private const int MaxExcelRowsPerSheet = 2000; + private const int MaxExcelCellsPerRow = 50; + private const int MaxDocxParagraphs = 2000; + private const int MaxDocxTableRows = 2000; + private const int MaxDocxTableCellsPerRow = 50; private readonly ILogger _logger; public TextExtractionService(ILogger logger) @@ -29,13 +39,11 @@ public async Task ExtractTextAsync(string fileName, byte[] fileContent, try { - // Normalize content type var normalizedContentType = contentType?.ToLowerInvariant() ?? string.Empty; var extension = Path.GetExtension(fileName)?.ToLowerInvariant() ?? string.Empty; string rawText; - // Handle text-based files if (normalizedContentType.Contains("text/") || extension == ".txt" || extension == ".csv" || @@ -46,37 +54,37 @@ public async Task ExtractTextAsync(string fileName, byte[] fileContent, return NormalizeAndLimitText(rawText, fileName); } - // Handle PDF files if (normalizedContentType.Contains("pdf") || extension == ".pdf") { - rawText = await Task.FromResult(ExtractTextFromPdfFile(fileName, fileContent)); + rawText = ExtractTextFromPdfFile(fileName, fileContent); return NormalizeAndLimitText(rawText, fileName); } - // Handle Word documents if (normalizedContentType.Contains("word") || normalizedContentType.Contains("msword") || normalizedContentType.Contains("officedocument.wordprocessingml") || extension == ".doc" || extension == ".docx") { - // For now, return empty string - can be enhanced with Word parsing library - _logger.LogDebug("Word document text extraction not yet implemented for {FileName}", fileName); + if (extension == ".docx" || normalizedContentType.Contains("officedocument.wordprocessingml")) + { + rawText = ExtractTextFromWordDocx(fileContent); + return NormalizeAndLimitText(rawText, fileName); + } + + _logger.LogDebug("Legacy .doc extraction is not supported for {FileName}", fileName); return string.Empty; } - // Handle Excel files if (normalizedContentType.Contains("excel") || normalizedContentType.Contains("spreadsheet") || extension == ".xls" || extension == ".xlsx") { - // For now, return empty string - can be enhanced with Excel parsing library - _logger.LogDebug("Excel text extraction not yet implemented for {FileName}", fileName); - return string.Empty; + rawText = ExtractTextFromExcelFile(fileName, fileContent); + return NormalizeAndLimitText(rawText, fileName); } - // For other file types, return empty string _logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}", contentType, extension); return string.Empty; @@ -92,17 +100,13 @@ private async Task ExtractTextFromTextFileAsync(byte[] fileContent) { try { - // Try UTF-8 first var text = Encoding.UTF8.GetString(fileContent); - // Check if the decoded text contains replacement characters (indicates encoding issue) if (text.Contains('\uFFFD')) { - // Try other encodings text = Encoding.ASCII.GetString(fileContent); } - // Limit the extracted text to a reasonable size. if (text.Length > MaxExtractedTextLength) { text = text.Substring(0, MaxExtractedTextLength); @@ -154,6 +158,137 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent) } } + private string ExtractTextFromWordDocx(byte[] fileContent) + { + try + { + using var stream = new MemoryStream(fileContent, writable: false); + using var document = new XWPFDocument(stream); + var parts = new List(); + + foreach (var paragraph in document.Paragraphs.Take(MaxDocxParagraphs)) + { + if (!string.IsNullOrWhiteSpace(paragraph.ParagraphText)) + { + parts.Add(paragraph.ParagraphText); + } + } + + foreach (var table in document.Tables) + { + foreach (var row in table.Rows.Take(MaxDocxTableRows)) + { + foreach (var cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow)) + { + var text = cell.GetText(); + if (!string.IsNullOrWhiteSpace(text)) + { + parts.Add(text); + } + } + } + } + + var combined = string.Join(Environment.NewLine, parts); + if (combined.Length > MaxExtractedTextLength) + { + combined = combined.Substring(0, MaxExtractedTextLength); + } + + return combined; + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Word (.docx) text extraction failed"); + return string.Empty; + } + } + + private string ExtractTextFromExcelFile(string fileName, byte[] fileContent) + { + try + { + using var stream = new MemoryStream(fileContent, writable: false); + using var workbook = WorkbookFactory.Create(stream); + var rows = new List(); + var totalLength = 0; + var sheetCount = Math.Min(workbook.NumberOfSheets, MaxExcelSheets); + + for (var sheetIndex = 0; sheetIndex < sheetCount; sheetIndex++) + { + var sheet = workbook.GetSheetAt(sheetIndex); + if (sheet == null) + { + continue; + } + + var processedRows = 0; + foreach (IRow row in sheet) + { + if (processedRows >= MaxExcelRowsPerSheet || totalLength >= MaxExtractedTextLength) + { + break; + } + + var cellTexts = row.Cells + .Take(MaxExcelCellsPerRow) + .Select(GetCellText) + .Where(value => !string.IsNullOrWhiteSpace(value)) + .ToList(); + + processedRows++; + + if (cellTexts.Count == 0) + { + continue; + } + + var rowText = string.Join(" | ", cellTexts); + rows.Add(rowText); + totalLength += rowText.Length; + } + + if (totalLength >= MaxExtractedTextLength) + { + break; + } + } + + var combined = string.Join(Environment.NewLine, rows); + if (combined.Length > MaxExtractedTextLength) + { + combined = combined.Substring(0, MaxExtractedTextLength); + } + + return combined; + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Excel text extraction failed for {FileName}", fileName); + return string.Empty; + } + } + + private static string GetCellText(ICell cell) + { + if (cell == null) + { + return string.Empty; + } + + return (cell.CellType switch + { + CellType.String => cell.StringCellValue ?? string.Empty, + CellType.Numeric => DateUtil.IsCellDateFormatted(cell) + ? cell.DateCellValue.ToString() + : cell.NumericCellValue.ToString(), + CellType.Boolean => cell.BooleanCellValue ? "true" : "false", + CellType.Formula => cell.ToString(), + CellType.Blank => string.Empty, + _ => cell.ToString() ?? string.Empty + }) ?? string.Empty; + } + private string NormalizeAndLimitText(string text, string fileName) { var normalized = NormalizeExtractedText(text); diff --git a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj index 8ec3e53bc..ff57bfd94 100644 --- a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj +++ b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj @@ -33,6 +33,7 @@ + From 7ed4d3f3a94e2700c9df5892b134203912b8b4cd Mon Sep 17 00:00:00 2001 From: Jacob Smith Date: Fri, 27 Feb 2026 12:01:21 -0800 Subject: [PATCH 2/5] AB#32008 Resolve ICell specificity error --- .../Unity.GrantManager.Application/AI/TextExtractionService.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs index 8e7f3d41b..8de6b180e 100644 --- a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs +++ b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs @@ -269,7 +269,7 @@ private string ExtractTextFromExcelFile(string fileName, byte[] fileContent) } } - private static string GetCellText(ICell cell) + private static string GetCellText(NPOI.SS.UserModel.ICell cell) { if (cell == null) { From ce760cb67827dc4a9693dc0499af77ad5d1aa0d6 Mon Sep 17 00:00:00 2001 From: Jacob Smith Date: Fri, 27 Feb 2026 16:25:11 -0800 Subject: [PATCH 3/5] AB#32008 Sonar fix simplify docx paragraph extraction loop --- .../AI/TextExtractionService.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs index 8de6b180e..0f55a62e3 100644 --- a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs +++ b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs @@ -166,11 +166,11 @@ private string ExtractTextFromWordDocx(byte[] fileContent) using var document = new XWPFDocument(stream); var parts = new List(); - foreach (var paragraph in document.Paragraphs.Take(MaxDocxParagraphs)) + foreach (var paragraphText in document.Paragraphs.Take(MaxDocxParagraphs).Select(paragraph => paragraph.ParagraphText)) { - if (!string.IsNullOrWhiteSpace(paragraph.ParagraphText)) + if (!string.IsNullOrWhiteSpace(paragraphText)) { - parts.Add(paragraph.ParagraphText); + parts.Add(paragraphText); } } From 5d02ba95714fd89f72a7473688051633bc06f2b8 Mon Sep 17 00:00:00 2001 From: Jacob Smith Date: Mon, 2 Mar 2026 10:05:41 -0800 Subject: [PATCH 4/5] AB#32008 Simplify text extraction async flow and stale comments --- .../AI/TextExtractionService.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs index 0f55a62e3..88080d6a3 100644 --- a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs +++ b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs @@ -29,12 +29,12 @@ public TextExtractionService(ILogger logger) _logger = logger; } - public async Task ExtractTextAsync(string fileName, byte[] fileContent, string contentType) + public Task ExtractTextAsync(string fileName, byte[] fileContent, string contentType) { if (fileContent == null || fileContent.Length == 0) { _logger.LogDebug("File content is empty for {FileName}", fileName); - return string.Empty; + return Task.FromResult(string.Empty); } try @@ -50,14 +50,14 @@ public async Task ExtractTextAsync(string fileName, byte[] fileContent, extension == ".json" || extension == ".xml") { - rawText = await ExtractTextFromTextFileAsync(fileContent); - return NormalizeAndLimitText(rawText, fileName); + rawText = ExtractTextFromTextFile(fileContent); + return Task.FromResult(NormalizeAndLimitText(rawText, fileName)); } if (normalizedContentType.Contains("pdf") || extension == ".pdf") { rawText = ExtractTextFromPdfFile(fileName, fileContent); - return NormalizeAndLimitText(rawText, fileName); + return Task.FromResult(NormalizeAndLimitText(rawText, fileName)); } if (normalizedContentType.Contains("word") || @@ -69,11 +69,11 @@ public async Task ExtractTextAsync(string fileName, byte[] fileContent, if (extension == ".docx" || normalizedContentType.Contains("officedocument.wordprocessingml")) { rawText = ExtractTextFromWordDocx(fileContent); - return NormalizeAndLimitText(rawText, fileName); + return Task.FromResult(NormalizeAndLimitText(rawText, fileName)); } _logger.LogDebug("Legacy .doc extraction is not supported for {FileName}", fileName); - return string.Empty; + return Task.FromResult(string.Empty); } if (normalizedContentType.Contains("excel") || @@ -82,21 +82,21 @@ public async Task ExtractTextAsync(string fileName, byte[] fileContent, extension == ".xlsx") { rawText = ExtractTextFromExcelFile(fileName, fileContent); - return NormalizeAndLimitText(rawText, fileName); + return Task.FromResult(NormalizeAndLimitText(rawText, fileName)); } _logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}", contentType, extension); - return string.Empty; + return Task.FromResult(string.Empty); } catch (Exception ex) { _logger.LogError(ex, "Error extracting text from {FileName}", fileName); - return string.Empty; + return Task.FromResult(string.Empty); } } - private async Task ExtractTextFromTextFileAsync(byte[] fileContent) + private string ExtractTextFromTextFile(byte[] fileContent) { try { @@ -113,7 +113,7 @@ private async Task ExtractTextFromTextFileAsync(byte[] fileContent) _logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength); } - return await Task.FromResult(text); + return text; } catch (Exception ex) { From b16d19db495edeb16c17c9f540aed39f7caa9e0e Mon Sep 17 00:00:00 2001 From: Jacob Smith Date: Wed, 4 Mar 2026 11:47:31 -0800 Subject: [PATCH 5/5] AB#32008 Optimize Office text extraction memory usage and limits --- .../AI/TextExtractionService.cs | 141 ++++++++++++------ 1 file changed, 95 insertions(+), 46 deletions(-) diff --git a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs index 88080d6a3..3b6f81b42 100644 --- a/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs +++ b/applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs @@ -68,7 +68,7 @@ public Task ExtractTextAsync(string fileName, byte[] fileContent, string { if (extension == ".docx" || normalizedContentType.Contains("officedocument.wordprocessingml")) { - rawText = ExtractTextFromWordDocx(fileContent); + rawText = ExtractTextFromWordDocx(fileName, fileContent); return Task.FromResult(NormalizeAndLimitText(rawText, fileName)); } @@ -158,48 +158,56 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent) } } - private string ExtractTextFromWordDocx(byte[] fileContent) + private string ExtractTextFromWordDocx(string fileName, byte[] fileContent) { try { using var stream = new MemoryStream(fileContent, writable: false); using var document = new XWPFDocument(stream); - var parts = new List(); + var builder = new StringBuilder(); foreach (var paragraphText in document.Paragraphs.Take(MaxDocxParagraphs).Select(paragraph => paragraph.ParagraphText)) { - if (!string.IsNullOrWhiteSpace(paragraphText)) + var limitReached = AppendWithLimit(builder, paragraphText, MaxExtractedTextLength, Environment.NewLine); + if (limitReached) { - parts.Add(paragraphText); + break; } } - foreach (var table in document.Tables) + if (builder.Length < MaxExtractedTextLength) { - foreach (var row in table.Rows.Take(MaxDocxTableRows)) + foreach (var table in document.Tables) { - foreach (var cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow)) + foreach (var row in table.Rows.Take(MaxDocxTableRows)) { - var text = cell.GetText(); - if (!string.IsNullOrWhiteSpace(text)) + foreach (var cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow)) { - parts.Add(text); + var limitReached = AppendWithLimit(builder, cell.GetText(), MaxExtractedTextLength, Environment.NewLine); + if (limitReached) + { + break; + } + } + + if (builder.Length >= MaxExtractedTextLength) + { + break; } } - } - } - var combined = string.Join(Environment.NewLine, parts); - if (combined.Length > MaxExtractedTextLength) - { - combined = combined.Substring(0, MaxExtractedTextLength); + if (builder.Length >= MaxExtractedTextLength) + { + break; + } + } } - return combined; + return builder.ToString(); } catch (Exception ex) { - _logger.LogWarning(ex, "Word (.docx) text extraction failed"); + _logger.LogWarning(ex, "Word (.docx) text extraction failed for {FileName}", fileName); return string.Empty; } } @@ -210,12 +218,17 @@ private string ExtractTextFromExcelFile(string fileName, byte[] fileContent) { using var stream = new MemoryStream(fileContent, writable: false); using var workbook = WorkbookFactory.Create(stream); - var rows = new List(); - var totalLength = 0; + var builder = new StringBuilder(); var sheetCount = Math.Min(workbook.NumberOfSheets, MaxExcelSheets); + var limitReached = false; for (var sheetIndex = 0; sheetIndex < sheetCount; sheetIndex++) { + if (limitReached || builder.Length >= MaxExtractedTextLength) + { + break; + } + var sheet = workbook.GetSheetAt(sheetIndex); if (sheet == null) { @@ -225,42 +238,38 @@ private string ExtractTextFromExcelFile(string fileName, byte[] fileContent) var processedRows = 0; foreach (IRow row in sheet) { - if (processedRows >= MaxExcelRowsPerSheet || totalLength >= MaxExtractedTextLength) + if (processedRows >= MaxExcelRowsPerSheet || builder.Length >= MaxExtractedTextLength) { break; } - var cellTexts = row.Cells - .Take(MaxExcelCellsPerRow) - .Select(GetCellText) - .Where(value => !string.IsNullOrWhiteSpace(value)) - .ToList(); + var rowHasValue = false; + foreach (var cell in row.Cells.Take(MaxExcelCellsPerRow)) + { + var value = GetCellText(cell); + if (string.IsNullOrWhiteSpace(value)) + { + continue; + } - processedRows++; + var separator = rowHasValue ? " | " : (builder.Length > 0 ? Environment.NewLine : null); + limitReached = AppendWithLimit(builder, value, MaxExtractedTextLength, separator); + rowHasValue = true; + if (limitReached) + { + break; + } + } - if (cellTexts.Count == 0) + processedRows++; + if (limitReached) { - continue; + break; } - - var rowText = string.Join(" | ", cellTexts); - rows.Add(rowText); - totalLength += rowText.Length; - } - - if (totalLength >= MaxExtractedTextLength) - { - break; } } - var combined = string.Join(Environment.NewLine, rows); - if (combined.Length > MaxExtractedTextLength) - { - combined = combined.Substring(0, MaxExtractedTextLength); - } - - return combined; + return builder.ToString(); } catch (Exception ex) { @@ -269,6 +278,46 @@ private string ExtractTextFromExcelFile(string fileName, byte[] fileContent) } } + private static bool AppendWithLimit(StringBuilder builder, string? value, int maxLength, string? separator = null) + { + if (string.IsNullOrWhiteSpace(value)) + { + return builder.Length >= maxLength; + } + + if (builder.Length >= maxLength) + { + return true; + } + + var remaining = maxLength - builder.Length; + if (remaining <= 0) + { + return true; + } + + if (!string.IsNullOrEmpty(separator) && builder.Length > 0) + { + if (separator.Length >= remaining) + { + builder.Append(separator.AsSpan(0, remaining)); + return true; + } + + builder.Append(separator); + remaining -= separator.Length; + } + + if (value.Length >= remaining) + { + builder.Append(value.AsSpan(0, remaining)); + return true; + } + + builder.Append(value); + return false; + } + private static string GetCellText(NPOI.SS.UserModel.ICell cell) { if (cell == null)