In [25]:
import csv
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [26]:
def load_model(model_name):
    """Load the fine-tuned NER model and wrap it in a token-classification pipeline.

    Args:
        model_name: Identifier or path handed unchanged to ``from_pretrained``
            for both the tokenizer and the model.

    Returns:
        tuple: ``(tokenizer, nlp_pipeline)``. ``nlp_pipeline`` is a ``"ner"``
        pipeline built with ``aggregation_strategy="simple"``.

    Raises:
        SystemExit: With code 1 when any step of loading fails. The original
            error message is printed first and the original exception is
            chained as ``__cause__``.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        nlp_pipeline = pipeline(
            "ner",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        # The bare ``exit`` name is an interactive helper installed by ``site``
        # (and replaced by front-ends such as IPython), so what it does depends
        # on the host shell. Raising SystemExit directly gives the same exit
        # code everywhere and guarantees this function never falls through
        # and returns None to a caller that unpacks two values.
        raise SystemExit(1) from e
    return tokenizer, nlp_pipeline


In [27]:
import re
import csv
import os

def clean_text(file_path, cleaned_file_path):
    """Clean the text stored at ``file_path`` and write it to ``cleaned_file_path``.

    Processing order:
      1. Drop URLs (``http://``, ``https://`` and ``www.`` forms).
      2. Record numbered clauses such as ``(1) ...`` or ``(2a and 2b) ...``.
      3. Delete every character that is not a word character, whitespace or
         one of ``. , ; ( ) / ' " -``.
      4. Re-append any recorded clause that no longer occurs verbatim.
      5. Collapse whitespace runs to single spaces and remove the space in
         front of periods and commas.

    Nothing is returned; a confirmation line is printed once the file is written.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw = source.read()

    # Step 1: strip URLs before anything else is inspected.
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', raw)

    # Step 2: remember the numbered clauses as they appear prior to filtering.
    numbered_clauses = re.findall(
        r"\(\d+[a-z]?(?: and \d+[a-z]?)?\)[^()\n]+", without_urls
    )

    # Step 3: keep only the whitelisted character classes.
    filtered = re.sub(r"[^\w\s.,;()/'\"-]", '', without_urls)

    # Step 4: a clause altered by the filter is tacked onto the end. The check
    # runs against the growing text, so a repeated clause is added only once.
    for clause in numbered_clauses:
        if clause in filtered:
            continue
        filtered = filtered + "\n" + clause

    # Step 5: whitespace and punctuation spacing.
    single_spaced = re.sub(r'\s+', ' ', filtered)
    tidy = single_spaced.replace(' .', '.').replace(' ,', ',')
    cleaned = tidy.strip()

    with open(cleaned_file_path, "w", encoding="utf-8") as sink:
        sink.write(cleaned)

    print(f"✅ Text cleaned and saved to: {cleaned_file_path}")

In [28]:
def extract_entities(text, nlp_pipeline):
    """Run the NER pipeline on ``text`` and group the detected words by label.

    Args:
        text: The string passed to ``nlp_pipeline``.
        nlp_pipeline: Callable returning an iterable of mappings that each
            carry an ``'entity_group'`` key and, for tracked labels, a
            ``'word'`` key.

    Returns:
        dict: One list per tracked label (``LABEL_1`` .. ``LABEL_6``) holding
        the matching words in detection order. Results with any other label
        are ignored.
    """
    tracked_labels = ("LABEL_3", "LABEL_4", "LABEL_1", "LABEL_2", "LABEL_5", "LABEL_6")
    grouped = {name: [] for name in tracked_labels}

    for hit in nlp_pipeline(text):
        bucket = grouped.get(hit['entity_group'])
        if bucket is None:
            # Label is not one of the tracked ones; skip it.
            continue
        bucket.append(hit['word'])

    return grouped

In [29]:
def _fatality_sort_key(value):
    """Sort key that orders digit strings by numeric value, everything else after."""
    digits = str(value).strip().replace(",", "")
    if digits.isdecimal():
        return (0, int(digits), value)
    # Non-numeric mentions keep plain lexicographic order among themselves.
    return (1, 0, value)


def format_output(entities):
    """Format extracted entities into the three output fields.

    Args:
        entities: Mapping with list values under the keys ``LABEL_1`` ..
            ``LABEL_6``, as produced by ``extract_entities``.

    Returns:
        tuple: ``(dates, fatality_range, countries)``.
            * ``dates``: ``LABEL_3`` then ``LABEL_4`` words joined with
              ``", "``, or ``"N/A"`` when there are none.
            * ``fatality_range``: ``"[low to high]"`` over the distinct
              ``LABEL_5``/``LABEL_6`` words, or ``"N/A"`` when there are none.
            * ``countries``: ``LABEL_1`` then ``LABEL_2`` words joined with
              ``", "``, or ``"N/A"`` when there are none.
    """
    dates = ", ".join(entities["LABEL_3"] + entities["LABEL_4"]) or "N/A"

    # The words are strings, so a plain sort would place "10" before "9" and
    # yield an inverted range. Digit strings (thousands separators allowed)
    # are compared as integers; other strings sort after them.
    fatalities = sorted(
        set(entities["LABEL_5"] + entities["LABEL_6"]), key=_fatality_sort_key
    )
    fatality_range = f"[{fatalities[0]} to {fatalities[-1]}]" if fatalities else "N/A"

    countries = ", ".join(entities["LABEL_1"] + entities["LABEL_2"]) or "N/A"

    return dates, fatality_range, countries



In [30]:
def process_articles(file_path, model_name, output_csv):
    """Extract entities from every article in a cleaned file and write them to CSV.

    The file at ``file_path`` is split on a long run of underscores. Each
    resulting piece is stripped, passed through ``extract_entities`` and
    ``format_output``, and written as one CSV row numbered from 1.

    Args:
        file_path: Path of the cleaned text file to read.
        model_name: Model identifier forwarded to ``load_model``.
        output_csv: Path of the CSV file to create or overwrite.
    """
    # Only the pipeline is needed here; the tokenizer is returned alongside it.
    _tokenizer, ner = load_model(model_name)

    with open(file_path, "r", encoding="utf-8") as source:
        corpus = source.read()
    articles = corpus.split("____________________________________________________________")

    header = ["Article Number", "Dates", "Fatality Range", "Countries"]
    with open(output_csv, "w", newline="", encoding="utf-8") as sink:
        writer = csv.writer(sink)
        writer.writerow(header)
        # Rows are generated lazily, so each article is processed and written in turn.
        writer.writerows(
            [number, *format_output(extract_entities(body.strip(), ner))]
            for number, body in enumerate(articles, start=1)
        )

    print(f"✅ Processed {len(articles)} articles. Results saved to {output_csv}")

In [31]:
# Driver: ask for an input file, clean it, then run entity extraction on the cleaned copy.
if __name__ == "__main__":
    # The input path is read interactively; surrounding whitespace is stripped.
    file_path = input("Enter the path to the input text file: ").strip()
    # Fixed relative output paths: the intermediate cleaned text and the final CSV.
    cleaned_file_path = "cleaned_text.txt"
    output_csv = "output_.csv"
    # Passed to load_model -> from_pretrained; presumably a local model directory — TODO confirm.
    model_name = "roberta_finetuned_MIC_"

    # Step 1: Clean the raw text and save the result to cleaned_file_path
    clean_text(file_path, cleaned_file_path)

    # Step 2: Split the cleaned file into articles, extract entities, write output_csv
    process_articles(cleaned_file_path, model_name, output_csv)


Enter the path to the input text file:  articles/merge/ProQuestDocuments-2025-01-02 (5)_3.txt


Device set to use cuda:0


✅ Text cleaned and saved to: cleaned_text.txt
✅ Processed 503 articles. Results saved to output_.csv
