In [5]:
import docx
import openpyxl
import pandas as pd
from transformers import pipeline
from typing import List, Tuple, Dict, Any
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Initialize the classifier model (French-specific model)
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli", device=0)

def load_docx_paragraphs(file_path: str) -> List[Dict[str, Any]]:
    """
    Load paragraphs and table text from a .docx file along with page numbers.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        List[Dict[str, Any]]: List of paragraphs with text, page number, and type.
    """
    doc = docx.Document(file_path)
    paragraphs_data = []
    page_num = 1  # Assuming page numbers if metadata is not directly available

    for para in doc.paragraphs:
        if para.text.strip():
            paragraphs_data.append({
                "text": para.text.strip(),
                "page": page_num
            })

    # Extract text from tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                if cell.text.strip():
                    paragraphs_data.append({
                        "text": cell.text.strip(),
                        "page": page_num
                    })
    return paragraphs_data

def classify_paragraphs(paragraphs: List[Dict[str, Any]], labels: List[str]) -> List[Dict[str, Any]]:
    """
    Classify paragraphs using zero-shot classification model.

    Args:
        paragraphs (List[Dict[str, Any]]): List of paragraphs with text and page.
        labels (List[str]): List of topics to classify into.

    Returns:
        List[Dict[str, Any]]: Paragraphs with predicted topics included.
    """
    for paragraph in tqdm(paragraphs, desc="Classifying paragraphs"):
        result = classifier(paragraph["text"], labels)
        paragraph["topic"] = result["labels"][0]  # Highest confidence label
    return paragraphs

def save_to_excel(paragraphs: List[Dict[str, Any]], output_path: str) -> None:
    """
    Save classified paragraphs into an Excel file.

    Args:
        paragraphs (List[Dict[str, Any]]): Classified paragraph data with topics.
        output_path (str): Path to save the .xlsx file.
    """
    df = pd.DataFrame(paragraphs)
    df.to_excel(output_path, index=False)

def evaluate_classification(predicted: List[str], actual: List[str]) -> float:
    """
    Evaluate the classification accuracy.

    Args:
        predicted (List[str]): List of predicted labels.
        actual (List[str]): List of actual labels.

    Returns:
        float: Accuracy of the classification.
    """
    return accuracy_score(actual, predicted)

def main(input_docx: str, output_xlsx: str, topics: List[str], labeled_data: List[Tuple[str, str]] = None) -> None:
    """
    Main function to classify document paragraphs and save to Excel.

    Args:
        input_docx (str): Path to the input .docx file.
        output_xlsx (str): Path to the output .xlsx file.
        topics (List[str]): List of topics for classification.
        labeled_data (List[Tuple[str, str]], optional): If provided, actual labeled data for evaluation.
    """
    # Load paragraphs from the DOCX
    paragraphs = load_docx_paragraphs(input_docx)

    # Classify paragraphs
    classified_paragraphs = classify_paragraphs(paragraphs, topics)

    # Save results to Excel
    save_to_excel(classified_paragraphs, output_xlsx)

    # Accuracy test if labeled data is provided
    if labeled_data:
        actual_labels = [label for _, label in labeled_data]
        predicted_labels = [classified_paragraphs[i]["topic"] for i, _ in enumerate(labeled_data)]
        #predicted_labels = [para["topic"] for para in classified_paragraphs]
        accuracy = evaluate_classification(predicted_labels, actual_labels)
        print(f"Classification Accuracy: {accuracy:.2%}")

# Sample usage:
if __name__ == "__main__":
    # Define input and output paths
    input_docx_path = "240705_R_EHPAD ARS_CD.docx"
    output_xlsx_path = "240705_R_EHPAD ARS_CD_classified_output.xlsx"

    # Define topics for classification
    topics = ["Soins", "Usagers", "Formation"]

    # Optionally, provide labeled data for accuracy evaluation
    labeled_paragraphs = [
        ("Example paragraph about Soins", "Soins"),
        ("Example paragraph about Usagers", "Usagers"),
        ("Example paragraph about Formation", "Formation"),
    ]

    main(input_docx_path, output_xlsx_path, topics, labeled_paragraphs)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFXLMRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.
Classifying paragraphs: 100%|██████████| 698/698 [24:50<00:00,  2.

Classification Accuracy: 0.00%



