Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
using Microsoft.Extensions.Logging;
using NPOI.SS.UserModel;
using NPOI.XWPF.UserModel;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
Expand All @@ -12,104 +16,104 @@ namespace Unity.GrantManager.AI
public class TextExtractionService : ITextExtractionService, ITransientDependency
{
// Upper bound, in characters, on the text collected from a single file.
private const int MaxExtractedTextLength = 50000;
// Only the first MaxExcelSheets sheets of a workbook are read.
private const int MaxExcelSheets = 10;
// Per-sheet cap on the number of rows visited (rows without any value count too).
private const int MaxExcelRowsPerSheet = 2000;
// Per-row cap on the number of cells inspected.
private const int MaxExcelCellsPerRow = 50;
// Only the first MaxDocxParagraphs body paragraphs of a .docx are read.
private const int MaxDocxParagraphs = 2000;
// Per-table cap on the number of rows read from a .docx table.
private const int MaxDocxTableRows = 2000;
// Per-row cap on the number of cells read from a .docx table row.
private const int MaxDocxTableCellsPerRow = 50;
// Logger used for debug/warning/error diagnostics during extraction.
private readonly ILogger<TextExtractionService> _logger;

/// <summary>
/// Creates the service and stores the logger used for extraction diagnostics.
/// </summary>
/// <param name="logger">Logger assigned to the <c>_logger</c> field.</param>
public TextExtractionService(ILogger<TextExtractionService> logger) => _logger = logger;

/// <summary>
/// Chooses an extraction routine from the content type and file extension and returns the extracted text.
/// An empty string is returned when the content is null/empty, when the file type has no extractor,
/// or when extraction throws (the exception is logged, not propagated).
/// </summary>
/// <param name="fileName">File name; its extension takes part in type detection and it is included in log messages.</param>
/// <param name="fileContent">Raw bytes of the file.</param>
/// <param name="contentType">MIME content type; may be null, in which case only the extension is used.</param>
/// <returns>A task whose result is the extracted text, or an empty string.</returns>
// NOTE(review): this listing is a diff view that shows both the removed (async/await) lines and the
// added (Task.FromResult) lines next to each other, so several statements appear twice in two forms.
public async Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
public Task<string> ExtractTextAsync(string fileName, byte[] fileContent, string contentType)
{
// Nothing to extract from a missing or zero-length payload.
if (fileContent == null || fileContent.Length == 0)
{
_logger.LogDebug("File content is empty for {FileName}", fileName);
return string.Empty;
return Task.FromResult(string.Empty);
}

try
{
// Normalize content type
// Both values are lower-cased so the Contains/equality checks below are case-insensitive.
var normalizedContentType = contentType?.ToLowerInvariant() ?? string.Empty;
var extension = Path.GetExtension(fileName)?.ToLowerInvariant() ?? string.Empty;

string rawText;

// Handle text-based files
if (normalizedContentType.Contains("text/") ||
extension == ".txt" ||
extension == ".csv" ||
extension == ".json" ||
extension == ".xml")
{
rawText = await ExtractTextFromTextFileAsync(fileContent);
return NormalizeAndLimitText(rawText, fileName);
rawText = ExtractTextFromTextFile(fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

// Handle PDF files
if (normalizedContentType.Contains("pdf") || extension == ".pdf")
{
rawText = await Task.FromResult(ExtractTextFromPdfFile(fileName, fileContent));
return NormalizeAndLimitText(rawText, fileName);
rawText = ExtractTextFromPdfFile(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

// Handle Word documents
if (normalizedContentType.Contains("word") ||
normalizedContentType.Contains("msword") ||
normalizedContentType.Contains("officedocument.wordprocessingml") ||
extension == ".doc" ||
extension == ".docx")
{
// For now, return empty string - can be enhanced with Word parsing library
_logger.LogDebug("Word document text extraction not yet implemented for {FileName}", fileName);
return string.Empty;
// Only the OOXML flavour (.docx extension or wordprocessingml content type) is parsed.
if (extension == ".docx" || normalizedContentType.Contains("officedocument.wordprocessingml"))
{
rawText = ExtractTextFromWordDocx(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

// Anything else that matched the Word checks is treated as legacy .doc and yields no text.
_logger.LogDebug("Legacy .doc extraction is not supported for {FileName}", fileName);
return Task.FromResult(string.Empty);
}

// Handle Excel files
if (normalizedContentType.Contains("excel") ||
normalizedContentType.Contains("spreadsheet") ||
extension == ".xls" ||
extension == ".xlsx")
{
// For now, return empty string - can be enhanced with Excel parsing library
_logger.LogDebug("Excel text extraction not yet implemented for {FileName}", fileName);
return string.Empty;
rawText = ExtractTextFromExcelFile(fileName, fileContent);
return Task.FromResult(NormalizeAndLimitText(rawText, fileName));
}

// For other file types, return empty string
_logger.LogDebug("No text extraction available for content type {ContentType} with extension {Extension}",
contentType, extension);
return string.Empty;
return Task.FromResult(string.Empty);
}
catch (Exception ex)
{
// Best effort: any failure is logged and reported to the caller as "no text".
_logger.LogError(ex, "Error extracting text from {FileName}", fileName);
return string.Empty;
return Task.FromResult(string.Empty);
}
}

private async Task<string> ExtractTextFromTextFileAsync(byte[] fileContent)
private string ExtractTextFromTextFile(byte[] fileContent)
{
try
{
// Try UTF-8 first
var text = Encoding.UTF8.GetString(fileContent);

// Check if the decoded text contains replacement characters (indicates encoding issue)
if (text.Contains('\uFFFD'))
{
// Try other encodings
text = Encoding.ASCII.GetString(fileContent);
}

// Limit the extracted text to a reasonable size.
if (text.Length > MaxExtractedTextLength)
{
text = text.Substring(0, MaxExtractedTextLength);
_logger.LogDebug("Truncated text content to {MaxLength} characters", MaxExtractedTextLength);
}

return await Task.FromResult(text);
return text;
}
catch (Exception ex)
{
Expand Down Expand Up @@ -154,6 +158,186 @@ private string ExtractTextFromPdfFile(string fileName, byte[] fileContent)
}
}

/// <summary>
/// Extracts text from a .docx document: first the document's paragraphs, then the cells of its tables,
/// each piece separated by a new line, stopping once MaxExtractedTextLength characters are collected.
/// </summary>
/// <param name="fileName">File name, used only in the warning log on failure.</param>
/// <param name="fileContent">Raw bytes of the .docx file.</param>
/// <returns>The collected text, or an empty string if the document cannot be read.</returns>
private string ExtractTextFromWordDocx(string fileName, byte[] fileContent)
{
try
{
// Read-only stream over the caller's buffer; the document is disposed with the method scope.
using var stream = new MemoryStream(fileContent, writable: false);
using var document = new XWPFDocument(stream);
var builder = new StringBuilder();

// Pass 1: paragraphs, capped at MaxDocxParagraphs.
foreach (var paragraphText in document.Paragraphs.Take(MaxDocxParagraphs).Select(paragraph => paragraph.ParagraphText))
{
var limitReached = AppendWithLimit(builder, paragraphText, MaxExtractedTextLength, Environment.NewLine);
if (limitReached)
{
break;
}
}

// Pass 2: table cells, only if the paragraphs left room in the character budget.
if (builder.Length < MaxExtractedTextLength)
{
foreach (var table in document.Tables)
{
foreach (var row in table.Rows.Take(MaxDocxTableRows))
{
foreach (var cell in row.GetTableCells().Take(MaxDocxTableCellsPerRow))
{
var limitReached = AppendWithLimit(builder, cell.GetText(), MaxExtractedTextLength, Environment.NewLine);
if (limitReached)
{
break;
}
}

// The inner break only leaves the cell loop; re-check the budget to leave the row loop too.
if (builder.Length >= MaxExtractedTextLength)
{
break;
}
}

// Same re-check to leave the table loop.
if (builder.Length >= MaxExtractedTextLength)
{
break;
}
}
}

return builder.ToString();
}
catch (Exception ex)
{
// Failure to parse is not fatal: log a warning and report "no text".
_logger.LogWarning(ex, "Word (.docx) text extraction failed for {FileName}", fileName);
return string.Empty;
// NOTE(review): the lines below are review-thread text captured from the pull-request page, not code.
// The warning above already includes {FileName}, so the remark appears to predate this revision.
Comment on lines +208 to +211
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The warning log in the .docx extraction catch block doesn't include the file name, which makes production troubleshooting harder when multiple attachments are processed. Consider passing fileName into ExtractTextFromWordDocx (or logging from the caller) so the log message includes {FileName} like the PDF/Excel paths do.

Copilot uses AI. Check for mistakes.
}
}

/// <summary>
/// Extracts text from a spreadsheet workbook. Non-blank cells of a row are joined with " | ",
/// rows start on a new line, and reading stops once MaxExtractedTextLength characters are collected.
/// At most MaxExcelSheets sheets, MaxExcelRowsPerSheet rows per sheet and MaxExcelCellsPerRow cells
/// per row are visited.
/// </summary>
/// <param name="fileName">File name, used only in the warning log on failure.</param>
/// <param name="fileContent">Raw bytes of the workbook.</param>
/// <returns>The collected text, or an empty string if the workbook cannot be read.</returns>
private string ExtractTextFromExcelFile(string fileName, byte[] fileContent)
{
    try
    {
        using var input = new MemoryStream(fileContent, writable: false);
        using var book = WorkbookFactory.Create(input);
        var output = new StringBuilder();
        var sheetsToRead = Math.Min(book.NumberOfSheets, MaxExcelSheets);
        var budgetSpent = false;

        // The loop condition doubles as the "stop when full" check before every sheet.
        for (var index = 0;
             index < sheetsToRead && !budgetSpent && output.Length < MaxExtractedTextLength;
             index++)
        {
            var currentSheet = book.GetSheetAt(index);
            if (currentSheet == null)
            {
                continue;
            }

            var rowsVisited = 0;
            foreach (IRow currentRow in currentSheet)
            {
                if (rowsVisited >= MaxExcelRowsPerSheet || output.Length >= MaxExtractedTextLength)
                {
                    break;
                }

                // Every visited row counts against the cap, whether or not it contributes text.
                rowsVisited++;

                var wroteCellInRow = false;
                foreach (var currentCell in currentRow.Cells.Take(MaxExcelCellsPerRow))
                {
                    var cellText = GetCellText(currentCell);
                    if (!string.IsNullOrWhiteSpace(cellText))
                    {
                        // First value of a row starts a new line (unless the output is still empty);
                        // later values in the same row are pipe-separated.
                        string? joiner;
                        if (wroteCellInRow)
                        {
                            joiner = " | ";
                        }
                        else if (output.Length > 0)
                        {
                            joiner = Environment.NewLine;
                        }
                        else
                        {
                            joiner = null;
                        }

                        budgetSpent = AppendWithLimit(output, cellText, MaxExtractedTextLength, joiner);
                        wroteCellInRow = true;
                        if (budgetSpent)
                        {
                            break;
                        }
                    }
                }

                if (budgetSpent)
                {
                    break;
                }
            }
        }

        return output.ToString();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Excel text extraction failed for {FileName}", fileName);
        return string.Empty;
    }
}

/// <summary>
/// Appends <paramref name="value"/> (preceded by <paramref name="separator"/> when the builder already
/// holds text) without letting the builder grow beyond <paramref name="maxLength"/> characters.
/// </summary>
/// <param name="builder">Destination buffer.</param>
/// <param name="value">Text to append; null, empty or whitespace-only values are skipped.</param>
/// <param name="maxLength">Maximum number of characters the builder may hold.</param>
/// <param name="separator">Optional text inserted before the value when the builder is not empty.</param>
/// <returns>
/// <c>true</c> when the character budget is exhausted (the builder was already full, or this call had to
/// truncate or exactly filled the remaining space); <c>false</c> when there is still room left.
/// </returns>
private static bool AppendWithLimit(StringBuilder builder, string? value, int maxLength, string? separator = null)
{
    // A full builder reports "limit reached" regardless of what was asked to be appended.
    if (builder.Length >= maxLength)
    {
        return true;
    }

    // Blank values contribute nothing, not even their separator.
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    // Strictly positive here because of the first guard.
    var remaining = maxLength - builder.Length;

    if (!string.IsNullOrEmpty(separator) && builder.Length > 0)
    {
        if (separator.Length >= remaining)
        {
            builder.Append(separator, 0, SafeCutLength(separator, remaining));
            return true;
        }

        builder.Append(separator);
        remaining -= separator.Length;
    }

    if (value.Length >= remaining)
    {
        builder.Append(value, 0, SafeCutLength(value, remaining));
        return true;
    }

    builder.Append(value);
    return false;

    // Number of leading chars of text to keep so that a cut never lands between the two halves of a
    // UTF-16 surrogate pair; a lone high surrogate at the end would make the output invalid Unicode.
    static int SafeCutLength(string text, int count)
    {
        if (count > 0 &&
            count < text.Length &&
            char.IsHighSurrogate(text[count - 1]) &&
            char.IsLowSurrogate(text[count]))
        {
            return count - 1;
        }

        return count;
    }
}

/// <summary>
/// Converts one spreadsheet cell to text for the extracted output.
/// </summary>
/// <remarks>
/// Numbers and dates are formatted with the invariant culture so the result does not depend on the
/// locale of the machine running the extraction. Formula cells are rendered from their cached result
/// type, because <c>ToString()</c> on a formula cell yields the formula expression rather than the
/// value it evaluated to (per NPOI's cell implementations).
/// </remarks>
/// <param name="cell">The cell to read; may be null.</param>
/// <returns>The cell's text, or an empty string for null or blank cells.</returns>
private static string GetCellText(NPOI.SS.UserModel.ICell cell)
{
    if (cell == null)
    {
        return string.Empty;
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // For formulas, dispatch on the type of the last computed result instead of the formula itself.
    var effectiveType = cell.CellType == CellType.Formula
        ? cell.CachedFormulaResultType
        : cell.CellType;

    return (effectiveType switch
    {
        CellType.String => cell.StringCellValue,
        // Convert.ToString copes with both a DateTime and a nullable DateTime cell value.
        CellType.Numeric => DateUtil.IsCellDateFormatted(cell)
            ? Convert.ToString(cell.DateCellValue, invariant)
            : cell.NumericCellValue.ToString(invariant),
        CellType.Boolean => cell.BooleanCellValue ? "true" : "false",
        CellType.Blank => string.Empty,
        // Error cells and anything unrecognised fall back to the library's own rendering.
        _ => cell.ToString()
    }) ?? string.Empty;
}

private string NormalizeAndLimitText(string text, string fileName)
{
var normalized = NormalizeExtractedText(text);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<PackageReference Include="Quartz.Serialization.Json" Version="3.14.0" />
<PackageReference Include="RestSharp" Version="112.1.0" />
<PackageReference Include="PdfPig" Version="0.1.13" />
<PackageReference Include="NPOI" Version="2.7.5" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="Volo.Abp.BackgroundWorkers.Quartz" Version="9.1.3" />
<PackageReference Include="Volo.Abp.BlobStoring" Version="9.1.3" />
Expand Down
Loading