bcgov · jacobwillsmith · Mar 4, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 28, 2026
diff --git a/...cations/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs b/...cations/Unity.GrantManager/src/Unity.GrantManager.Application/AI/TextExtractionService.cs
@@ -1,6 +1,10 @@
 using Microsoft.Extensions.Logging;
+using NPOI.SS.UserModel;
+using NPOI.XWPF.UserModel;
 using System;
+using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Threading.Tasks;
@@ -12,104 +16,104 @@ namespace Unity.GrantManager.AI
     public class TextExtractionService : ITextExtractionService, ITransientDependency
     {
         private const int MaxExtractedTextLength = 50000;
+        private const int MaxExcelSheets = 10;
+        private const int MaxExcelRowsPerSheet = 2000;
+        private const int MaxExcelCellsPerRow = 50;
+        private const int MaxDocxParagraphs = 2000;
+        private const int MaxDocxTableRows = 2000;
+        private const int MaxDocxTableCellsPerRow = 50;
         private readonly ILogger<TextExtractionService> _logger;
 
         public TextExtractionService(ILogger<TextExtractionService> logger)
         {
             _logger = logger;
         }
 
-        public async Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
+        public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
         {
             if (fileContent == null || fileContent.Length == 0)
             {
                 _logger.LogDebug("File content is empty for {FileName}", fileName);
-                return string.Empty;
+                return Task.FromResult(string.Empty);
             }
 
             try
             {
-                // Normalize content type
                 var normalizedContentType = contentType?.ToLowerInvariant() ?? string.Empty;
                 var extension = Path.GetExtension(fileName)?.ToLowerInvariant() ?? string.Empty;
 
                 string rawText;
 
-                // Handle text-based files
                 if (normalizedContentType.Contains("text/") ||
                     extension == ".txt" ||
                     extension == ".csv" ||
                     extension == ".json" ||
                     extension == ".xml")
                 {
-                    rawText = await ExtractTextFromTextFileAsync(fileContent);
-                    return NormalizeAndLimitText(rawText, fileName);
+                    rawText = ExtractTextFromTextFile(fileContent);
+                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
                 }
 
-                // Handle PDF files
                 if (normalizedContentType.Contains("pdf") || extension == ".pdf")
                 {
-                    rawText = await Task.FromResult(ExtractTextFromPdfFile(fileName, fileContent));
-                    return NormalizeAndLimitText(rawText, fileName);
+                    rawText = ExtractTextFromPdfFile(fileName, fileContent);
+                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
                 }
 
-                // Handle Word documents
                 if (normalizedContentType.Contains("word") ||
                     normalizedContentType.Contains("msword") ||
                     normalizedContentType.Contains("officedocument.wordprocessingml") ||
                     extension == ".doc" ||
                     extension == ".docx")
                 {
-                    // For now, return empty string - can be enhanced with Word parsing library
-                    _logger.LogDebug("Word document text extraction not yet implemented for {FileName}", fileName);
-                    return string.Empty;
+                    if (extension == ".docx" || normalizedContentType.Contains("officedocument.wordprocessingml"))
+                    {
+                        rawText = ExtractTextFromWordDocx(fileName, fileContent);
+                        return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
+                    }
+
+                    _logger.LogDebug("Legacy .doc extraction is not supported for {FileName}", fileName);
+                    return Task.FromResult(string.Empty);
                 }
 
-                // Handle Excel files
                 if (normalizedContentType.Contains("excel") ||
                     normalizedContentType.Contains("spreadsheet") ||
                     extension == ".xls" ||
                     extension == ".xlsx")
                 {
-                    // For now, return empty string - can be enhanced with Excel parsing library
-                    _logger.LogDebug("Excel text extraction not yet implemented for {FileName}", fileName);
-                    return string.Empty;
+                    rawText = ExtractTextFromExcelFile(fileName, fileContent);
+                    return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
                 }
 
-                // For other file types, return empty string
                 _logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}",
                     contentType, extension);
-                return string.Empty;
+                return Task.FromResult(string.Empty);
             }
             catch (Exception ex)
             {
                 _logger.LogError(ex, "Error extracting text from {FileName}", fileName);
-                return string.Empty;
+                return Task.FromResult(string.Empty);
             }
         }
 
-        private async Task<string> ExtractTextFromTextFileAsync(byte[] fileContent)
+        private string ExtractTextFromTextFile(byte[] fileContent)
         {
             try
             {
-                // Try UTF-8 first
                 var text = Encoding.UTF8.GetString(fileContent);
 
-                // Check if the decoded text contains replacement characters (indicates encoding issue)
                 if (text.Contains('\uFFFD'))
                 {
-                    // Try other encodings
                     text = Encoding.ASCII.GetString(fileContent);
                 }
 
-                // Limit the extracted text to a reasonable size.
                 if (text.Length > MaxExtractedTextLength)
                 {
                     text = text.Substring(0, MaxExtractedTextLength);
                     _logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength);
                 }
 
-                return await Task.FromResult(text);
+                return text;
             }
             catch (Exception ex)
             {
@@ -154,6 +158,186 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
             }
         }
 
+        /// <summary>
+        /// Reads the body paragraphs of a .docx file, then the cells of its tables, and joins the
+        /// non-blank pieces with line breaks until <see cref="MaxExtractedTextLength"/> is reached.
+        /// </summary>
+        /// <remarks>
+        /// At most <see cref="MaxDocxParagraphs"/> paragraphs are read and, per table, at most
+        /// <see cref="MaxDocxTableRows"/> rows of <see cref="MaxDocxTableCellsPerRow"/> cells each.
+        /// </remarks>
+        /// <param name="fileName">Name reported in the warning logged when extraction fails.</param>
+        /// <param name="fileContent">Bytes of the document, opened read-only from memory.</param>
+        /// <returns>The collected text, or an empty string when reading the document throws.</returns>
+        private string ExtractTextFromWordDocx(string fileName, byte[] fileContent)
+        {
+            // Lazily yields paragraph texts first and table cell texts afterwards, applying the caps.
+            static IEnumerable<string> EnumerateFragments(XWPFDocument source)
+            {
+                foreach (var paragraph in source.Paragraphs.Take(MaxDocxParagraphs))
+                {
+                    yield return paragraph.ParagraphText;
+                }
+
+                var cells = source.Tables
+                    .SelectMany(table => table.Rows.Take(MaxDocxTableRows))
+                    .SelectMany(row => row.GetTableCells().Take(MaxDocxTableCellsPerRow));
+                foreach (var cell in cells)
+                {
+                    yield return cell.GetText();
+                }
+            }
+
+            try
+            {
+                using var input = new MemoryStream(fileContent, writable: false);
+                using var docx = new XWPFDocument(input);
+                var text = new StringBuilder();
+
+                foreach (var fragment in EnumerateFragments(docx))
+                {
+                    // A true result means the character budget is used up, so nothing further is read.
+                    if (AppendWithLimit(text, fragment, MaxExtractedTextLength, Environment.NewLine))
+                    {
+                        break;
+                    }
+                }
+
+                return text.ToString();
+            }
+            catch (Exception ex)
+            {
+                _logger.LogWarning(ex, "Word (.docx) text extraction failed for {FileName}", fileName);
+                return string.Empty;
+            }
+        }
+
+        /// <summary>
+        /// Flattens a workbook into text: each row becomes a line whose non-blank cell values are
+        /// separated by " | ", and output stops once <see cref="MaxExtractedTextLength"/> is reached.
+        /// </summary>
+        /// <param name="fileName">Name reported in the warning logged when extraction fails.</param>
+        /// <param name="fileContent">Bytes of the workbook, opened read-only from memory.</param>
+        /// <returns>The collected text, or an empty string when reading the workbook throws.</returns>
+        private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
+        {
+            try
+            {
+                using var input = new MemoryStream(fileContent, writable: false);
+                using var book = WorkbookFactory.Create(input);
+                var text = new StringBuilder();
+                var sheetsToRead = Math.Min(book.NumberOfSheets, MaxExcelSheets);
+                var full = false;
+
+                for (var index = 0; index < sheetsToRead && !full && text.Length < MaxExtractedTextLength; index++)
+                {
+                    var sheet = book.GetSheetAt(index);
+                    if (sheet == null)
+                    {
+                        continue;
+                    }
+
+                    var rowsRead = 0;
+                    foreach (IRow row in sheet)
+                    {
+                        if (rowsRead >= MaxExcelRowsPerSheet || text.Length >= MaxExtractedTextLength)
+                        {
+                            break;
+                        }
+
+                        rowsRead++;
+                        var nonBlank = row.Cells.Take(MaxExcelCellsPerRow)
+                            .Select(cell => GetCellText(cell))
+                            .Where(candidate => !string.IsNullOrWhiteSpace(candidate));
+                        var firstInRow = true;
+                        foreach (var cellText in nonBlank)
+                        {
+                            // A row's first value starts a new line (unless nothing was written yet).
+                            var glue = firstInRow ? (text.Length > 0 ? Environment.NewLine : null) : " | ";
+                            firstInRow = false;
+                            full = AppendWithLimit(text, cellText, MaxExtractedTextLength, glue);
+                            if (full)
+                            {
+                                break;
+                            }
+                        }
+
+                        if (full)
+                        {
+                            break;
+                        }
+                    }
+                }
+
+                return text.ToString();
+            }
+            catch (Exception ex)
+            {
+                _logger.LogWarning(ex, "Excel text extraction failed for {FileName}", fileName);
+                return string.Empty;
+            }
+        }
+
+        /// <summary>
+        /// Appends <paramref name="value"/> (preceded by <paramref name="separator"/> when the builder
+        /// already holds text), cutting both so the builder never grows past <paramref name="maxLength"/>.
+        /// </summary>
+        /// <param name="builder">Buffer that receives the text.</param>
+        /// <param name="value">Text to add; null or whitespace-only values are skipped.</param>
+        /// <param name="maxLength">Upper bound on the builder's length, in characters.</param>
+        /// <param name="separator">Optional text inserted before the value; may be cut to fit.</param>
+        /// <returns>True when the builder is at (or already beyond) the limit; false while room remains.</returns>
+        private static bool AppendWithLimit(StringBuilder builder, string? value, int maxLength, string? separator = null)
+        {
+            if (builder.Length >= maxLength)
+            {
+                return true;
+            }
+
+            if (string.IsNullOrWhiteSpace(value))
+            {
+                return false;
+            }
+
+            // Strictly positive here because of the first guard, so no further range check is needed.
+            var remaining = maxLength - builder.Length;
+            if (!string.IsNullOrEmpty(separator) && builder.Length > 0)
+            {
+                var separatorLength = Math.Min(separator.Length, remaining);
+                builder.Append(separator, 0, separatorLength);
+                remaining -= separatorLength;
+                if (remaining == 0)
+                {
+                    return true;
+                }
+            }
+
+            var valueLength = Math.Min(value.Length, remaining);
+            builder.Append(value, 0, valueLength);
+            // The limit is hit exactly when the value consumed all of the remaining room.
+            return valueLength == remaining;
+        }
+
+        // Formula cells yield their cached result; numbers and dates use the invariant culture.
+        private static string GetCellText(NPOI.SS.UserModel.ICell cell)
+        {
+            if (cell == null)
+            {
+                return string.Empty;
+            }
+            var effectiveType = cell.CellType == CellType.Formula ? cell.CachedFormulaResultType : cell.CellType;
+            return (effectiveType switch
+            {
+                CellType.String => cell.StringCellValue,
+                CellType.Numeric => DateUtil.IsCellDateFormatted(cell)
+                    ? Convert.ToString(cell.DateCellValue, System.Globalization.CultureInfo.InvariantCulture)
+                    : cell.NumericCellValue.ToString(System.Globalization.CultureInfo.InvariantCulture),
+                CellType.Boolean => cell.BooleanCellValue ? "true" : "false",
+                CellType.Blank => string.Empty,
+                _ => cell.ToString() // errors and unknown types: keep NPOI's own rendering
+            }) ?? string.Empty;
+        }
+
         private string NormalizeAndLimitText(string text, string fileName)
         {
             var normalized = NormalizeExtractedText(text);

diff --git a/...ity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj b/...ity.GrantManager/src/Unity.GrantManager.Application/Unity.GrantManager.Application.csproj
@@ -33,6 +33,7 @@
     <PackageReference Include="Quartz.Serialization.Json" Version="3.14.0" />
     <PackageReference Include="RestSharp" Version="112.1.0" />
     <PackageReference Include="PdfPig" Version="0.1.13" />
+    <PackageReference Include="NPOI" Version="2.7.5" />
     <PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
     <PackageReference Include="Volo.Abp.BackgroundWorkers.Quartz" Version="9.1.3" />
     <PackageReference Include="Volo.Abp.BlobStoring" Version="9.1.3" />