In [1]:
# Workflow:
# For each document:
# 1. Preprocess the text and extract features using regular expressions and spaCy.
# 2. Split the text into overlapping chunks.
# 3. Retrieve the most relevant chunks using MemWalker.
# 4. Concatenate the retrieved chunks.
# 5. If the concatenated text is too long, shorten it by summarising it chunk by chunk.
# 6. Use the text and the extracted features to obtain a financial mismanagement score.

In [None]:
import os
import re
import spacy
from google import genai
from google.genai import types
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Optional
from dataclasses import dataclass, field

# Gemini API key. Read from the GEMINI_API_KEY environment variable so the real
# key never has to be written into the notebook; falls back to the placeholder.
KEY = os.environ.get("GEMINI_API_KEY", "YOUR_KEY")

def preprocess(text):
    """
    Lowercase, remove non-ASCII characters (except the pound sign), normalize whitespace.

    The pound sign is preserved because extract_financial_features locates
    monetary amounts by their leading pound sign and is run on the output of
    this function; stripping it would make every amount pattern unmatchable.

    Args:
        text (str): Raw document text.

    Returns:
        str: Lowercased text in which every other non-ASCII character has been
            replaced by a space and runs of whitespace collapsed to one space.
    """
    text = text.lower()
    # replace non-ASCII characters with a space, keeping the pound sign
    text = re.sub(r'[^\x00-\x7f£]', ' ', text)
    # normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the small English spaCy pipeline once at module level; it is reused by
# extract_financial_features for tokens, lemmas, sentences and named entities.
# (Run `python -m spacy download en_core_web_sm` first if the model is missing.)
nlp = spacy.load("en_core_web_sm")

def extract_financial_features(text):
    """
    Extract financial features from a single NHS trust audit report.

    Args:
        text (str): The audit report text to analyze. Monetary amounts are
            only recognised when they are prefixed with a pound sign.

    Returns:
        dict: A dictionary of extracted features:
            - 'total_amount_mentioned', 'deficit_amount', 'savings_amount',
              'overspend_amount': floats in millions of pounds.
            - 'num_amount_mentions', 'risk_count', 'governance_count',
              'kpi_mentions', 'negative_count', 'deficit_sentences',
              'money_entities': integer counts.
            - 'audit_opinion': the matched opinion phrase, or 'unqualified'
              when none of the searched phrases is found.
            - 'risk_density', 'negative_density': count divided by the
              number of alphabetic tokens (0 when there are none).
    """
    doc = nlp(text)
    features = {}

    # Multipliers converting "<number> <unit>" into millions of pounds.
    unit_to_millions = {
        'thousand': 1e-3, 'k': 1e-3,
        'million': 1.0, 'm': 1.0,
        'billion': 1e3, 'bn': 1e3,
    }
    # A number with optional thousands separators and decimals ...
    number_re = r'\d[\d,]*(?:\.\d+)?'
    # ... optionally followed by a unit. The lookahead stops "m"/"k" from
    # matching the first letter of an ordinary word (e.g. "£5 more").
    unit_re = r'(?:thousand|million|billion|bn|m|k)(?![a-z])'
    amount_re = number_re + r'(?:\s*' + unit_re + r')?'
    parse_re = re.compile(
        r'(' + number_re + r')(?:\s*(' + unit_re + r'))?', re.IGNORECASE
    )

    # Helper function to normalize amounts to millions
    def normalize_amount(amount_str):
        if not amount_str:
            return 0.0
        parsed = parse_re.search(amount_str)
        if parsed is None:
            return 0.0
        amount = float(parsed.group(1).replace(',', ''))
        unit = parsed.group(2)
        if unit is None:
            # No unit: the figure is in plain pounds.
            return amount / 1e6
        return amount * unit_to_millions[unit.lower()]

    # 1. Financial amounts (number and unit are captured together so the
    #    unit is available to normalize_amount)
    amounts = re.findall(r'£\s*(' + amount_re + r')', text, re.IGNORECASE)
    normalized_amounts = [normalize_amount(amt) for amt in amounts]
    features['total_amount_mentioned'] = sum(normalized_amounts)
    features['num_amount_mentions'] = len(normalized_amounts)

    # 2. Context-specific amounts: first amount appearing within 20 non-digit
    #    characters after the keyword
    context_patterns = {
        'deficit_amount': r'(?:deficit|shortfall)\D{1,20}£\s*(' + amount_re + r')',
        'savings_amount': r'(?:savings|efficiency)\D{1,20}£\s*(' + amount_re + r')',
        'overspend_amount': r'(?:overspend|exceed)\D{1,20}£\s*(' + amount_re + r')'
    }
    for name, pattern in context_patterns.items():
        matches = re.findall(pattern, text, re.IGNORECASE)
        features[name] = normalize_amount(matches[0]) if matches else 0.0

    # Lowercased lemmas, computed once and reused by the term counts below.
    lemmas = [token.lemma_.lower() for token in doc]

    # 3. Risk indicators
    risk_terms = {'risk', 'concern', 'challenge', 'weakness', 'vulnerability'}
    features['risk_count'] = sum(1 for lemma in lemmas if lemma in risk_terms)

    # 4. Governance indicators
    governance_terms = {'governance', 'control', 'oversight', 'compliance'}
    features['governance_count'] = sum(1 for lemma in lemmas if lemma in governance_terms)

    # 5. Performance metrics
    features['kpi_mentions'] = len(re.findall(r'\b(?:performance\s+)?(target|metric|indicator|kpi|benchmark)\b', text, re.IGNORECASE))

    # 6. Negative sentiment
    negative_terms = {'fail', 'inadequate', 'poor', 'weak', 'insufficient'}
    features['negative_count'] = sum(1 for lemma in lemmas if lemma in negative_terms)

    # 7. Audit opinion
    opinion_match = re.search(r'\b(qualified\s+opinion|adverse\s+opinion|disclaimer|emphasis\s+of\s+matter)\b', text, re.IGNORECASE)
    features['audit_opinion'] = opinion_match.group(0) if opinion_match else 'unqualified'

    # 8. Sentence-level analysis
    deficit_sents = [sent.text for sent in doc.sents if any(t.lemma_.lower() in {'deficit', 'shortfall'} for t in sent)]
    features['deficit_sentences'] = len(deficit_sents)

    # 9. Named entities
    money_ents = [ent.text for ent in doc.ents if ent.label_ == 'MONEY']
    features['money_entities'] = len(money_ents)

    # 10. Normalized scores
    total_words = len([t for t in doc if t.is_alpha])
    features['risk_density'] = features['risk_count'] / total_words if total_words > 0 else 0
    features['negative_density'] = features['negative_count'] / total_words if total_words > 0 else 0

    return features

def preprocess_and_extract_features(text):
    """
    Clean `text` with `preprocess`, then run `extract_financial_features`
    on the cleaned text.

    Args:
        text (str): Raw document text.

    Returns:
        tuple: (cleaned_text, features) where `features` is the dict returned
            by `extract_financial_features`.
    """
    cleaned_text = preprocess(text)
    return cleaned_text, extract_financial_features(cleaned_text)

def split_overlapping_chunks(text, chunk_size=2500, overlap=50):
    """
    Cut `text` into consecutive windows that share `overlap` characters.

    Some reports might need a smaller `chunk_size` to work.

    Args:
        text (str): The input string to split.
        chunk_size (int): Maximum length of each chunk (must be > 0).
        overlap (int): Characters shared by neighbouring chunks
            (0 <= overlap < chunk_size).

    Returns:
        List[str]: The chunks in document order; empty when `text` is empty.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be > 0")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    stride = chunk_size - overlap
    total = len(text)
    pieces = []
    position = 0
    while position < total:
        window_end = position + chunk_size
        pieces.append(text[position:window_end])
        # Stop as soon as a window reaches the end of the text so that no
        # trailing chunk made purely of overlap is emitted.
        if window_end >= total:
            break
        position += stride
    return pieces

# Memwalker code
@dataclass
class Node:
    # Summary text for internal nodes; None by default.
    summary: Optional[str] = None
    # Raw passage for leaf nodes; None by default.
    text: Optional[str] = None
    # Child nodes; each instance gets its own fresh list.
    children: List["Node"] = field(default_factory=list)

    @property
    def is_leaf(self) -> bool:
        """Return True when this node has no children."""
        if self.children:
            return False
        return True

def build_memory_tree(text,
                      client,
                      branching = 4):
    """
    Build a summary tree over `text`.

    1. Chunk long text into leaves.
    2. Recursively group `branching` children and ask the LLM to
       summarize them, building parent nodes until a single root remains.

    Args:
        text (str): The document text.
        client: LLM client passed through to `summarize_chunk`.
        branching (int): Maximum number of children per parent (must be >= 2;
            with 1 a level would never shrink and the loop would not end).

    Returns:
        Node: The root of the tree. For text that fits in one chunk the root
            is that single leaf; for empty text it is a leaf with empty text.

    Raises:
        ValueError: If `branching` is less than 2.
    """
    if branching < 2:
        raise ValueError("branching must be >= 2")

    # ---- 1-a. Create leaf level -----------------
    leaves = [Node(text=chunk) for chunk in
              split_overlapping_chunks(text)]
    if not leaves:
        # Empty input yields no chunks; return an empty leaf instead of
        # failing on level[0] below.
        return Node(text="")
    level = leaves

    # ---- 1-b. Bottom-up summarization -----------
    def _summarize(chunks):
        joined = "\n\n".join(chunks)
        return summarize_chunk(joined, client)

    while len(level) > 1:
        parents = []
        for i in range(0, len(level), branching):
            group = level[i:i + branching]
            # Leaves contribute their raw text, internal nodes their summary.
            parent_summary = _summarize(
                [c.text if c.is_leaf else c.summary for c in group]
            )
            parents.append(Node(summary=parent_summary,
                                children=group))
        level = parents                      # climb one level
    return level[0]                          # root


# Navigation prompt formatted by choose_children_via_llm. Placeholders:
#   {parent_summary}    - summary of the node currently being inspected
#   {numbered_children} - one "<1-based index>. <preview>" line per child
# The model is asked to answer with comma-separated 1-based indices or "none".
DECISION_TEMPLATE = """You are an auditor searching a document tree for
evidence of "financial mismanagement". Below is the current node’s summary followed by its
child summaries.

Current summary:
{parent_summary}

Child summaries (numbered):
{numbered_children}

Which child(ren) should be opened next? Reply with *comma-separated indices*
(e.g. "2" or "1,3") or "none" if none seem relevant.
"""

def _preview(node: Node, n_chars: int = 200) -> str:
    """
    Build the short snippet shown for a node in the navigation prompt.

    The node's summary is used when it is not None; otherwise the node's raw
    text is used. The snippet is cut to `n_chars` characters and an ellipsis
    is always appended, whether or not anything was cut.
    """
    if node.summary is not None:
        source = node.summary
    else:
        source = node.text
    if not source:
        source = ""
    return source[:n_chars] + "…"

def choose_children_via_llm(parent, client):
    """
    Ask the LLM which children of `parent` should be opened next.

    Args:
        parent: Node whose children are offered to the model.
        client: LLM client exposing `models.generate_content`.

    Returns:
        List[int]: 0-based indices into `parent.children`, in the order the
            model gave them, without duplicates. Indices outside the valid
            range are discarded. Empty when the reply contains "none", has
            no usable index, or the model returned no text.
    """
    numbered = "\n".join(
        f"{idx}. {_preview(child)}"
        for idx, child in enumerate(parent.children, 1)
    )
    prompt = DECISION_TEMPLATE.format(
        parent_summary=parent.summary,
        numbered_children=numbered
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=prompt,
        config=types.GenerateContentConfig(temperature=0)
    )
    # `.text` can be None when the response carries no text parts.
    reply = (response.text or "").lower().strip()
    if "none" in reply:
        return []

    # Pull out every integer so replies such as "1, 3." or "2 and 4" still
    # parse, then convert 1-based -> 0-based and keep only real children.
    # Without the range check a "0" would become -1 (the last child) and a
    # too-large index would raise IndexError in the caller.
    n_children = len(parent.children)
    chosen = []
    for token in re.findall(r'\d+', reply):
        idx = int(token) - 1
        if 0 <= idx < n_children and idx not in chosen:
            chosen.append(idx)
    return chosen

def walk_memory_tree(root,
                     client,
                     max_hops = 10):
    """
    Traverse the tree depth-first, letting the LLM pick which children to open.

    The walk stops when there is nothing left to open or when `max_hops`
    leaves have been collected.

    Returns:
        list: The leaf nodes reached, in visiting order.
    """
    pending = [root]           # LIFO stack of nodes still to inspect
    reached = []

    while pending and len(reached) < max_hops:
        current = pending.pop()
        if not current.is_leaf:
            # Internal node: ask which branches to open. Pushing them in
            # reverse makes the first choice the next one popped.
            picks = choose_children_via_llm(current, client)
            pending.extend(current.children[i] for i in reversed(picks))
        else:
            reached.append(current)
    return reached

def memwalker_retrieve(text,
                       client,
                       branching   = 4,
                       max_hops    = 10):
    """
    Convenience wrapper around the MemWalker steps.

    Builds the memory tree for `text` with `build_memory_tree`, walks it with
    `walk_memory_tree`, and returns the raw text of every leaf visited, in
    visiting order.
    """
    tree_root = build_memory_tree(text, client, branching=branching)
    visited_leaves = walk_memory_tree(tree_root, client, max_hops=max_hops)
    passages = []
    for leaf in visited_leaves:
        passages.append(leaf.text)
    return passages

# Not used in final model
def rank_chunks_tfidf(chunks, query="financial mismanagement", top_k=10):
    """
    Ranks text chunks by their relevance to a query using TF-IDF + cosine similarity.

    Parameters:
    - chunks: sequence of str, the document chunks.
    - query: str, the concept to rank against.
    - top_k: int, the number of top chunks to return.

    Returns:
    - List of 0-based indices of the top_k most relevant chunks, most similar
      first. Empty when `chunks` is empty.
    """
    # Accept any sequence (tuple, generator, ...) and avoid calling
    # cosine_similarity with zero chunk rows.
    chunks = list(chunks)
    if not chunks:
        return []

    # Fit TF-IDF on both chunks and the query
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(chunks + [query])

    # Separate chunk vectors and the query vector (the query is the last row)
    chunk_vectors = tfidf_matrix[:-1]
    query_vector = tfidf_matrix[-1]

    # Compute cosine similarities
    similarities = cosine_similarity(query_vector, chunk_vectors)[0]

    # Select top_k chunk indices, highest similarity first
    top_indices = similarities.argsort()[::-1][:top_k]
    return top_indices.tolist()

def summarize_chunk(chunk, client):
    """
    Ask the LLM for a summary of `chunk`.

    Args:
        chunk (str): Text to summarize.
        client: LLM client exposing `models.generate_content`.

    Returns:
        str: The stripped summary, or an empty string when the model
            returns no text.
    """
    prompt = (
        "Please summarize the following section:\n\n"
        f"{chunk}"
    )
    resp = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=prompt,
        config=types.GenerateContentConfig(temperature=0)).text
    # `.text` can be None when the response carries no text parts.
    return (resp or "").strip()

#  Synthesize final summary 
def synthesize_summaries(summaries, client):
    """
    Merge several section summaries into one executive summary via the LLM.

    Args:
        summaries: Iterable of summary strings; each becomes a "- " bullet
            in the prompt.
        client: LLM client exposing `models.generate_content`.

    Returns:
        str: The stripped executive summary, or an empty string when the
            model returns no text.
    """
    combined = "\n\n".join(f"- {s}" for s in summaries)
    prompt = (
        "Here are summaries of the relevant sections:\n\n"
        f"{combined}\n\n"
        "Please produce a cohesive executive summary highlighting the key points."
    )
    resp = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=prompt,
        config=types.GenerateContentConfig(temperature=0)
    ).text
    # `.text` can be None when the response carries no text parts.
    return (resp or "").strip()

def summary_to_score(summary, features_1, features_2, client):
    """
    Ask the LLM for a financial mismanagement score and parse it.

    The summary and both feature objects are interpolated into the prompt;
    the model is asked to answer with a JSON object holding
    "financial_mismanagement_score".

    Args:
        summary (str): Report text or summary to evaluate.
        features_1: First set of features, inserted into the prompt as text.
        features_2: Second set of features, inserted into the prompt as text.
        client: LLM client exposing `models.generate_content`.

    Returns:
        float: The score, clamped to the range [0.0, 1.0].

    Raises:
        ValueError: If the model returns no text, no JSON object can be
            found or decoded, or the score is not numeric.
        KeyError: If the JSON object lacks "financial_mismanagement_score".
    """
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents= f"""
    You are an auditor evaluating the attached summary of an NHS Trust audit report. Also consider some extracted features from the report. 
    Output only a single JSON object with:
    - "financial_mismanagement_score": a numeric value (0.0 to 1.0) for overall likelihood of financial mismanagement or fraud (assume a lower risk if much is redacted/unconfirmed).

    Do not add any commentary or explanation beyond this JSON structure. Do this in a step-by-step manner.
    Here is the report: {summary} and the features {features_1}, {features_2}.
    """,
    config=types.GenerateContentConfig(temperature=0)
    ).text

    # `.text` can be None when the response carries no text parts.
    if not response:
        raise ValueError("Model returned no text; cannot extract a score")

    # 1. Cut out the span from the first '{' to the last '}' so surrounding
    #    prose or code fences do not break JSON parsing.
    start_index = response.find('{')
    end_index = response.rfind('}')
    if start_index == -1 or end_index < start_index:
        raise ValueError(f"No JSON object found in model response: {response!r}")
    json_content = response[start_index:end_index + 1]

    # 2. Load as JSON (json.JSONDecodeError is a ValueError subclass).
    data = json.loads(json_content)

    # 3. Coerce to float (the model may quote the number) and keep it in range.
    raw_score = data["financial_mismanagement_score"]
    try:
        score = float(raw_score)
    except (TypeError, ValueError) as exc:
        raise ValueError(f"Score is not numeric: {raw_score!r}") from exc
    return min(1.0, max(0.0, score))

# didn't use it in final model
def summary_features_to_factsheet(summary, features, excel_features, client):
    """
    Ask the LLM to fill a fixed JSON fact sheet from a summary and features.

    Args:
        summary: Text inserted into the prompt as the document.
        features: Inserted into the prompt's feature section as {features_2}.
        excel_features: Inserted into the prompt's feature section as {features_1}.
        client: LLM client exposing `models.generate_content`.

    Returns:
        The model's raw response text; it is not parsed or validated here.
    """
    # Prompt template. Doubled braces are literal braces under str.format;
    # {features_1}, {features_2} and {document} are the placeholders.
    EXTRACT_PROMPT = """
You are a forensic accountant.

Task 1 (critical – do not skip):
Read the document and the features and fill the 26 JSON fields below.
If a fact is missing, use null.

Return **only** valid JSON – no commentary.

Fields:
{{
    "trust_name": "",                    # e.g. "Acme Trust"
    "fy_start": "",                       # "2019-04-01"
    "fy_end": "",                         # "2020-03-31"
    "audit_opinion": "",                  # qualified | unqualified | adverse
    "section_24_report": null,            # true | false
    "statutory_breakeven_breach": null,   # true | false
    "unlawful_expenditure": null,         # true | false
    "use_of_resources_rating": "",        # good | requires improvement | inadequate
    "annual_deficit_m£": null,            # float, millions GBP
    "cumulative_loans_m£": null,          # float, millions GBP
    "OpinionOnFinancialStatements": "",   # standard | non-standard
    "ValueformoneyArrangementsConclusion": "",  # qualified | unqualified
    "AdditionalReportingPowers": "",      # exercised | not exercised
    "total_amount_mentioned": null,       # GBP total figures
    "num_amount_mentions": null,          # integer count
    "deficit_amount": null,               # GBP deficit figure
    "savings_amount": null,               # GBP savings figure
    "overspend_amount": null,             # GBP overspend figure
    "risk_count": null,                   # number of risks called out
    "governance_count": null,             # governance issues count
    "kpi_mentions": null,                 # count of KPI mentions
    "negative_count": null,               # count of negative words
    "deficit_sentences": null,            # number of sentences about deficit
    "money_entities": null,               # named monetary entities
    "risk_density": null,                 # float per 1k words
    "negative_density": null              # float per 1k words
}}

<BEGIN_FEATURES>
{features_1}
{features_2}
<END_FEATURES>

<BEGIN_DOCUMENT>
{document}
<END_DOCUMENT>
"""
    # Note the mapping: excel_features -> features_1, features -> features_2.
    prompt = EXTRACT_PROMPT.format(
        document=summary,
        features_1=excel_features,
        features_2=features
    )
    # temperature=0 is requested for every model call in this file.
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=prompt,
        config=types.GenerateContentConfig(temperature=0)         
    ).text
    return response


def summarize_full_text(text, client, chunk_size=2000):
    """
    Splits `text` into chunks of up to `chunk_size` characters (splitting on word boundaries),
    summarises each chunk, and then synthesises an ~700-word executive summary.

    A single word longer than `chunk_size` is kept whole as its own chunk.

    Args:
        text (str): The full document to summarise.
        client: An LLM client
        chunk_size (int): Maximum characters per chunk (default: 2000).

    Returns:
        str: A cohesive ~700-word executive summary, or an empty string when
            `text` contains no words (no LLM call is made in that case).
    """
    # 1. Split text into word-safe chunks
    chunks, current = [], ""
    for w in text.split():
        if not current:
            # Always start a chunk with the word, even an over-long one, so
            # that an empty chunk is never emitted.
            current = w
        elif len(current) + 1 + len(w) <= chunk_size:
            # +1 for the joining space
            current += " " + w
        else:
            chunks.append(current)
            current = w
    if current:
        chunks.append(current)

    if not chunks:
        return ""

    # 2. Summarise each chunk
    summaries = []
    for idx, chunk in enumerate(chunks, start=1):
        prompt = (
            f"Chunk {idx} of {len(chunks)}:\n\n"
            f"{chunk}\n\n"
            "Please provide a concise summary (around 150 words) of the above."
        )
        resp = client.models.generate_content(
            model="gemini-2.0-flash-lite",
            contents=prompt,
            config=types.GenerateContentConfig(temperature=0)
        ).text
        # `.text` can be None when the response carries no text parts.
        summaries.append((resp or "").strip())

    # 3. Combine chunk summaries and request the final executive summary
    combined = "\n\n".join(f"- {s}" for s in summaries)
    final_prompt = (
        "Here are summaries of each section:\n\n"
        f"{combined}\n\n"
        "Please produce a single, cohesive executive summary of approximately 700 words, "
        "highlighting the key points across all sections."
    )
    final_resp = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=final_prompt,
        config=types.GenerateContentConfig(temperature=0)
    ).text

    return (final_resp or "").strip()

def extract_excel_features(code, year, df):
    """
    Look up the single spreadsheet row for (`code`, `year`) and return three
    of its fields.

    Args:
        code: Value to match against the 'Row Labels' column.
        year: Value to match against the 'Year' column.
        df (pandas.DataFrame): Table containing those columns plus the
            three fields read below.

    Returns:
        dict: Keys 'OpinionOnFinancialStatements',
            'ValueformoneyArrangementsConclusion' (read from the
            'VfmArrangementsConclusion' column) and 'AdditionalReportingPowers'.

    Raises:
        ValueError: If zero rows or more than one row match.
    """
    is_match = (df['Row Labels'] == code) & (df['Year'] == year)
    hits = df.loc[is_match]

    # Exactly one row must match.
    n = len(hits)
    if n != 1:
        if n == 0:
            raise ValueError(f"No rows found for code={code!r}, year={year}")
        raise ValueError(f"Expected 1 row but found {n} for code={code!r}, year={year}")

    record = hits.iloc[0]

    # Output key -> source column (the VfM column is renamed on the way out).
    column_for_key = {
        'OpinionOnFinancialStatements': 'OpinionOnFinancialStatements',
        'ValueformoneyArrangementsConclusion': 'VfmArrangementsConclusion',
        'AdditionalReportingPowers': 'AdditionalReportingPowers',
    }
    return {key: record[column] for key, column in column_for_key.items()}


# Not used in the final model: this earlier version ranked chunks with TF-IDF; MemWalker is used instead.
# def text_to_score(text):
#     client = genai.Client(api_key=KEY)  # hard-coded API key redacted; revoke the key that was previously written here
#     cleaned_text, features = preprocess_and_extract_features(text)
#     chunks = split_overlapping_chunks(cleaned_text)
#     top_indices = rank_chunks_tfidf(chunks, top_k=10)
#     selected = [chunks[i-1] for i in top_indices]
#     summaries = [summarize_chunk(c, client=client) for c in selected]
#     final_summary = synthesize_summaries(summaries, client=client)
#     score = summary_to_score(final_summary, features=features, client=client)
#     return score

def text_to_score(text, code, year, df):
    """
    Run the full pipeline for one report and return its score.

    Steps: create a Gemini client, clean the text and extract features,
    retrieve passages with MemWalker, join them, summarise the result when
    it exceeds 800*6 characters, look up the spreadsheet features for
    (`code`, `year`) in `df`, and ask the LLM for the score.

    Returns:
        The value returned by `summary_to_score`.
    """
    client = genai.Client(api_key=KEY)
    cleaned_text, features = preprocess_and_extract_features(text)

    passages = memwalker_retrieve(cleaned_text, client=client)
    model_input = "\n\n".join(passages)

    # Length threshold in characters above which the input is summarised.
    max_chars = 800 * 6
    if len(model_input) > max_chars:
        model_input = summarize_full_text(model_input, client)

    excel_features = extract_excel_features(code, year, df)
    return summary_to_score(
        model_input,
        features_1=features,
        features_2=excel_features,
        client=client,
    )

In [None]:
df = pd.read_excel("/Volumes/Z Slim/finaldataset?.xlsx")

folder_path = "/Volumes/Z Slim/processed_reports/scanned_to_text"
# os.listdir returns names in arbitrary order; sort for a reproducible run and log.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    # Check if it's a file (and not a subfolder)
    if os.path.isfile(file_path):
        print(f"Processing file: {filename}")
        # File names are expected to be "<3-char code><9-char year>.txt",
        # e.g. "RL12017-2018.txt".
        code = filename[0:3]
        year = filename[3:12]
        assert filename[12:] == '.txt', f"Error on file {filename}"
        # Locate the target row BEFORE scoring so a missing or duplicated row
        # fails fast instead of after the LLM calls have been spent.
        mask = (df['Row Labels'] == code) & (df['Year'] == year)
        row_count = mask.sum()  # Number of True values in mask
        assert row_count == 1, f"Expected exactly 1 matching row for (code={code}, year={year}), found {row_count}"
        with open(file_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
        financial_mismanagement_score = text_to_score(raw_text, code, year, df)
        print(financial_mismanagement_score)
        # Save score
        df.loc[mask, 'financial mismangement score (0-1)'] = financial_mismanagement_score

Processing file: RL12017-2018.txt
0.2
Processing file: RQY2017-2018.txt
0.2


In [None]:
# Write the scored DataFrame to a new workbook (row index omitted), then confirm.
df.to_excel("/Volumes/Z Slim/finaldataset_with_iForest.xlsx", index=False)
print("\nDataFrame saved")


DataFrame saved
