Feature/ab#32008 add office extraction support#2048
Conversation
# Conflicts: # applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs # applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj
AndreGAot
left a comment
There was a problem hiding this comment.
At some point would be nice to make these max rows, table rows etc.. configurable
|
Looks good you can merge these - just runing a co pilot review I would suggest using string builder based on the co-pilot review. |
There was a problem hiding this comment.
Pull request overview
Adds Office document text extraction to the AI attachment summarization pipeline so .docx, .xls, and .xlsx attachments can contribute real content (after normalization/length limiting) instead of falling back to metadata-only summaries.
Changes:
- Added NPOI dependency to support Word/Excel parsing.
- Implemented
.docxextraction viaXWPFDocument(paragraph + table text). - Implemented
.xls/.xlsxextraction viaWorkbookFactorywith basic bounds and normalization.
Reviewed changes
Copilot reviewed 2 out of 2 changed files in this pull request and generated 3 comments.
| File | Description |
|---|---|
| applications/Unity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj | Adds the NPOI package reference required for Office parsing. |
| applications/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs | Adds Word/Excel extraction logic and routes Office files through normalization/limits before AI prompt usage. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
You can also share your feedback on Copilot code review. Take the survey.
| var parts = new List<string>(); | ||
|
|
||
| foreach (var paragraphText in document.Paragraphs.Take(MaxDocxParagraphs).Select(paragraph => paragraph.ParagraphText)) | ||
| { | ||
| if (!string.IsNullOrWhiteSpace(paragraphText)) | ||
| { | ||
| parts.Add(paragraphText); | ||
| } | ||
| } | ||
|
|
||
| foreach (var table in document.Tables) | ||
| { | ||
| foreach (var row in table.Rows.Take(MaxDocxTableRows)) | ||
| { | ||
| foreach (var cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow)) | ||
| { | ||
| var text = cell.GetText(); | ||
| if (!string.IsNullOrWhiteSpace(text)) | ||
| { | ||
| parts.Add(text); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| var combined = string.Join(Environment.NewLine, parts); | ||
| if (combined.Length > MaxExtractedTextLength) | ||
| { | ||
| combined = combined.Substring(0, MaxExtractedTextLength); | ||
| } | ||
|
|
||
| return combined; |
There was a problem hiding this comment.
ExtractTextFromWordDocx builds up parts from paragraphs/tables without short-circuiting once the overall extracted text budget is reached. For large docs/tables this can do substantial unnecessary work and allocations (especially the final string.Join) even though the result is truncated to MaxExtractedTextLength. Track a running character count and break out of the paragraph/table loops when the limit is hit (and consider appending to a StringBuilder instead of collecting all parts).
| var parts = new List<string>(); | |
| foreach (var paragraphText in document.Paragraphs.Take(MaxDocxParagraphs).Select(paragraph => paragraph.ParagraphText)) | |
| { | |
| if (!string.IsNullOrWhiteSpace(paragraphText)) | |
| { | |
| parts.Add(paragraphText); | |
| } | |
| } | |
| foreach (var table in document.Tables) | |
| { | |
| foreach (var row in table.Rows.Take(MaxDocxTableRows)) | |
| { | |
| foreach (var cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow)) | |
| { | |
| var text = cell.GetText(); | |
| if (!string.IsNullOrWhiteSpace(text)) | |
| { | |
| parts.Add(text); | |
| } | |
| } | |
| } | |
| } | |
| var combined = string.Join(Environment.NewLine, parts); | |
| if (combined.Length > MaxExtractedTextLength) | |
| { | |
| combined = combined.Substring(0, MaxExtractedTextLength); | |
| } | |
| return combined; | |
| var builder = new StringBuilder(); | |
| var limitReached = false; | |
| var firstPart = true; | |
| void AppendWithLimit(string text) | |
| { | |
| if (limitReached || string.IsNullOrWhiteSpace(text)) | |
| { | |
| return; | |
| } | |
| var remaining = MaxExtractedTextLength - builder.Length; | |
| if (remaining <= 0) | |
| { | |
| limitReached = true; | |
| return; | |
| } | |
| // Append separator between parts, counted against the limit | |
| if (!firstPart) | |
| { | |
| var separator = Environment.NewLine; | |
| if (separator.Length <= remaining) | |
| { | |
| builder.Append(separator); | |
| remaining -= separator.Length; | |
| } | |
| else | |
| { | |
| builder.Append(separator.Substring(0, remaining)); | |
| limitReached = true; | |
| return; | |
| } | |
| } | |
| firstPart = false; | |
| remaining = MaxExtractedTextLength - builder.Length; | |
| if (remaining <= 0) | |
| { | |
| limitReached = true; | |
| return; | |
| } | |
| if (text.Length <= remaining) | |
| { | |
| builder.Append(text); | |
| } | |
| else | |
| { | |
| builder.Append(text.Substring(0, remaining)); | |
| limitReached = true; | |
| } | |
| } | |
| foreach (var paragraphText in document.Paragraphs.Take(MaxDocxParagraphs).Select(paragraph => paragraph.ParagraphText)) | |
| { | |
| AppendWithLimit(paragraphText); | |
| if (limitReached) | |
| { | |
| break; | |
| } | |
| } | |
| if (!limitReached) | |
| { | |
| foreach (var table in document.Tables) | |
| { | |
| foreach (var row in table.Rows.Take(MaxDocxTableRows)) | |
| { | |
| foreach (var cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow)) | |
| { | |
| var text = cell.GetText(); | |
| AppendWithLimit(text); | |
| if (limitReached) | |
| { | |
| break; | |
| } | |
| } | |
| if (limitReached) | |
| { | |
| break; | |
| } | |
| } | |
| if (limitReached) | |
| { | |
| break; | |
| } | |
| } | |
| } | |
| return builder.ToString(); |
| var cellTexts = row.Cells | ||
| .Take(MaxExcelCellsPerRow) | ||
| .Select(GetCellText) | ||
| .Where(value => !string.IsNullOrWhiteSpace(value)) | ||
| .ToList(); | ||
|
|
||
| processedRows++; | ||
|
|
||
| if (cellTexts.Count == 0) | ||
| { | ||
| continue; | ||
| } | ||
|
|
||
| var rowText = string.Join(" | ", cellTexts); |
There was a problem hiding this comment.
In the Excel path, a single cell/row can still be extremely large (e.g., a cell with a huge text blob). rowText is fully materialized and added before checking/truncating, which can defeat the intent of bounded extraction and cause large allocations. Consider truncating individual cell values and/or truncating rowText to the remaining budget before adding it to rows/incrementing totalLength.
| var cellTexts = row.Cells | |
| .Take(MaxExcelCellsPerRow) | |
| .Select(GetCellText) | |
| .Where(value => !string.IsNullOrWhiteSpace(value)) | |
| .ToList(); | |
| processedRows++; | |
| if (cellTexts.Count == 0) | |
| { | |
| continue; | |
| } | |
| var rowText = string.Join(" | ", cellTexts); | |
| // Build row text with awareness of the remaining global budget to avoid large allocations. | |
| var remainingForRow = MaxExtractedTextLength - totalLength; | |
| if (remainingForRow <= 0) | |
| { | |
| break; | |
| } | |
| var rowBuilder = new StringBuilder(); | |
| var cells = row.Cells.Take(MaxExcelCellsPerRow); | |
| foreach (var cell in cells) | |
| { | |
| if (remainingForRow <= 0) | |
| { | |
| break; | |
| } | |
| var value = GetCellText(cell); | |
| if (string.IsNullOrWhiteSpace(value)) | |
| { | |
| continue; | |
| } | |
| // Add separator if this is not the first value in the row. | |
| if (rowBuilder.Length > 0) | |
| { | |
| const string separator = " | "; | |
| if (separator.Length > remainingForRow) | |
| { | |
| // No space left even for the separator; stop adding to this row. | |
| break; | |
| } | |
| rowBuilder.Append(separator); | |
| remainingForRow -= separator.Length; | |
| } | |
| if (value.Length > remainingForRow) | |
| { | |
| value = value.Substring(0, remainingForRow); | |
| } | |
| rowBuilder.Append(value); | |
| remainingForRow -= value.Length; | |
| if (remainingForRow <= 0) | |
| { | |
| break; | |
| } | |
| } | |
| processedRows++; | |
| if (rowBuilder.Length == 0) | |
| { | |
| continue; | |
| } | |
| var rowText = rowBuilder.ToString(); |
There was a problem hiding this comment.
NPOI loads entire workbooks into memory.
If you process:
Large Excel files
Multiple concurrent uploads
In a web API
You may hit memory pressure.
| catch (Exception ex) | ||
| { | ||
| _logger.LogWarning(ex, "Word (.docx) text extraction failed"); | ||
| return string.Empty; |
There was a problem hiding this comment.
The warning log in the .docx extraction catch block doesn't include the file name, which makes production troubleshooting harder when multiple attachments are processed. Consider passing fileName into ExtractTextFromWordDocx (or logging from the caller) so the log message includes {FileName} like the PDF/Excel paths do.
JamesPasta
left a comment
There was a problem hiding this comment.
Please review co-pilot suggestions
| var cellTexts = row.Cells | ||
| .Take(MaxExcelCellsPerRow) | ||
| .Select(GetCellText) | ||
| .Where(value => !string.IsNullOrWhiteSpace(value)) | ||
| .ToList(); | ||
|
|
||
| processedRows++; | ||
|
|
||
| if (cellTexts.Count == 0) | ||
| { | ||
| continue; | ||
| } | ||
|
|
||
| var rowText = string.Join(" | ", cellTexts); |
There was a problem hiding this comment.
NPOI loads entire workbooks into memory.
If you process:
Large Excel files
Multiple concurrent uploads
In a web API
You may hit memory pressure.
|
Added bounded StringBuilder accumulation with early-stop limits to reduce temporary allocations during DOCX/Excel text extraction. |
Pull request overview
This pull request adds Office document text extraction support for AI attachment summarization.
It enables
.docx,.xls, and.xlsxcontent to be extracted and normalized before it is sent into AI attachment analysis, improving summary quality and reducing metadata-only fallbacks when readable file content exists.Changes:
TextExtractionServicefor.docx,.xls, and.xlsxNPOIpackage reference to the application project