In [None]:
import re

def clean_text(file_path, output_path):
    """
    Cleans a text file by removing links and stray symbols while keeping
    MIC-related details intact and in their original positions.

    Steps, in order:
      1. Read ``file_path`` as UTF-8.
      2. Delete URLs (``http://``, ``https://`` and ``www.`` forms).
      3. Split the text around MIC detail segments: a parenthesised number
         such as ``(1)``, ``(2a)`` or ``(2a and 2b)`` followed by text up to
         the next parenthesis or line break.
      4. Strip every character that is not a word character, whitespace or
         one of ``. , ; ( ) / ' " -`` from the text *between* those segments.
         The MIC segments themselves are left untouched.
      5. Collapse whitespace runs to one space, drop the space before ``.``
         and ``,``, and trim the ends.
      6. Write the result to ``output_path`` as UTF-8 and print a
         confirmation line.

    Args:
        file_path: Path of the text file to clean.
        output_path: Path the cleaned text is written to (overwritten if it
            already exists).

    Returns:
        None.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Remove URLs (http, https, www)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # MIC-related details "(1)...(2a and 2b)...(3)..."
    pattern_mic_details = r"\(\d+[a-z]?(?: and \d+[a-z]?)?\)[^()\n]+"

    # Splitting on a single capturing group yields alternating pieces:
    # even indexes are ordinary text, odd indexes are the MIC matches.
    segments = re.split(f"({pattern_mic_details})", text)

    # Strip unwanted special characters from the ordinary text only, so each
    # MIC segment stays exactly where it was instead of being stripped in
    # place and re-appended (duplicated) at the end of the file.
    for index in range(0, len(segments), 2):
        segments[index] = re.sub(r"[^\w\s.,;()/'\"-]", '', segments[index])
    text = ''.join(segments)

    # Normalize spaces and punctuation
    text = re.sub(r'\s+', ' ', text).replace(' .', '.').replace(' ,', ',').strip()

    # Save cleaned text
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(text)

    print(f"✅ Cleaning complete! Saved to: {output_path}")

# Files to clean: each entry of input_files is written to the entry of
# output_files at the same position (currently a single file).
input_files = ["articles/merge/ProQuestDocuments-2025-01-02 (5)_3.txt"]
output_files = ["cleaned_text_5.txt"]

# Clean every (input, output) pair in order.
for inp, out in zip(input_files, output_files):
    clean_text(file_path=inp, output_path=out)

In [2]:
import spacy
import json

# spaCy pipeline shared by the tagging functions in this cell
nlp = spacy.load("en_core_web_trf")
# Allow documents of up to five million characters
nlp.max_length = 5_000_000

# spaCy entity label -> label used in the BIO output
ENTITY_MAP = dict(
    DATE="DATE",
    CARDINAL="FATALITY",  # numbers are treated as potential fatality counts
    GPE="COUNTRY",
)

def bio_tagging(text):
    """
    Assigns BIO (Beginning, Inside, Outside) tags to entities in the text.

    The text is run through the module-level ``nlp`` pipeline and tagged one
    entity span at a time:

    * Entities whose spaCy label is in the module-level ``ENTITY_MAP`` are
      tagged with the mapped label.
    * ``CARDINAL`` entities are only tagged (as the label ``ENTITY_MAP``
      gives for CARDINAL, falling back to ``FATALITY``) when one of the two
      tokens right after the number is a fatality cue word such as "killed"
      or "dead". Other numbers stay ``O``.
    * The first token of a tagged span gets ``B-<label>`` and every further
      token of the same span gets ``I-<label>``.

    Args:
        text: The string to tag.

    Returns:
        A list with one ``{"token": str, "label": str}`` dict per token, in
        document order.
    """
    fatality_cues = {"killed", "deaths", "fatalities", "dead", "casualties"}

    doc = nlp(text)
    tags = ["O"] * len(doc)  # Default to 'O' (Outside)

    for ent in doc.ents:
        if ent.label_ == "CARDINAL":
            # Look at up to two tokens after the whole number span, so that
            # multi-token numbers ("two hundred killed") are judged as a unit
            # and a number near the end of the text is still checked.
            following = {
                doc[position].text.lower()
                for position in range(ent.end, min(ent.end + 2, len(doc)))
            }
            if not (following & fatality_cues):
                continue
            label = ENTITY_MAP.get("CARDINAL", "FATALITY")
        else:
            label = ENTITY_MAP.get(ent.label_)
            if label is None:
                continue

        # Span boundaries come from spaCy, so a long entity stays B, I, I, ...
        # and two adjacent entities of the same type each start with B.
        tags[ent.start] = f"B-{label}"
        for position in range(ent.start + 1, ent.end):
            tags[position] = f"I-{label}"

    return [{"token": token.text, "label": tag} for token, tag in zip(doc, tags)]

def process_large_text(input_file, output_file, chunk_size=500000):
    """
    Processes a large text file in chunks, applies BIO tagging, and saves the output to a JSON file.

    The file is read fully as UTF-8 and cut into chunks of at most
    ``chunk_size`` characters. Each chunk ends after a sentence end (". ")
    when one is found inside the window, otherwise after whitespace, so a
    word is never split across two chunks; only text with no whitespace at
    all in the window is cut at the hard limit. Each chunk is split into
    sentences with the module-level ``nlp`` pipeline and every sentence is
    tagged with ``bio_tagging``.

    Args:
        input_file: Path of the text file to tag.
        output_file: Path of the JSON file to write (overwritten).
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        None. The JSON file holds a list of
        ``{"sentence": str, "tokens": [...]}`` objects and a confirmation
        line is printed.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")

    with open(input_file, "r", encoding="utf-8") as file:
        text = file.read()

    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        if end < len(text):
            # Prefer a sentence boundary, then any whitespace, so the cut
            # never lands in the middle of a word.
            for separator in (". ", " ", "\n"):
                cut = text.rfind(separator, start, end)
                if cut != -1:
                    end = cut + len(separator)
                    break
        chunks.append(text[start:end])
        start = end

    labeled_data = []

    for chunk in chunks:
        doc = nlp(chunk)
        for sent in doc.sents:
            labeled_data.append({"sentence": sent.text, "tokens": bio_tagging(sent.text)})

    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(labeled_data, json_file, indent=4, ensure_ascii=False)

    print(f"✅ BIO-tagged data saved to: {output_file}")

# Tag the cleaned text and store the BIO-labelled sentences as JSON
process_large_text(input_file="cleaned_text_5.txt", output_file="train_more_7.json")

✅ BIO-tagged data saved to: train_more_1.json


In [1]:
import spacy
import json

# spaCy pipeline shared by the tagging functions in this cell
nlp = spacy.load("en_core_web_sm")
# Allow documents of up to five million characters
nlp.max_length = 5_000_000

# spaCy entity label -> label used in the tagged output
ENTITY_MAP = dict(
    DATE="DATE",
    CARDINAL="FATALITY",  # numbers are treated as potential fatality counts
    GPE="COUNTRY",
)

def tag_entities(text):
    """
    Tags DATE, FATALITY, and COUNTRY entities in the text.

    The text is run through the module-level ``nlp`` pipeline and tagged one
    entity span at a time:

    * Entities whose spaCy label is in the module-level ``ENTITY_MAP`` get
      the mapped label on every token of the span.
    * ``CARDINAL`` entities are only tagged (as the label ``ENTITY_MAP``
      gives for CARDINAL, falling back to ``FATALITY``) when one of the two
      tokens right after the number is a fatality cue word such as "killed"
      or "dead". Other numbers stay ``O``.

    Args:
        text: The string to tag.

    Returns:
        A list with one ``{"token": str, "label": str}`` dict per token, in
        document order.
    """
    fatality_cues = {"killed", "deaths", "fatalities", "dead", "casualties"}

    doc = nlp(text)
    tags = ["O"] * len(doc)  # Default to 'O' (Outside)

    for ent in doc.ents:
        if ent.label_ == "CARDINAL":
            # Look at up to two tokens after the whole number span, so that
            # multi-token numbers are judged as a unit and a number near the
            # end of the text is still checked.
            following = {
                doc[position].text.lower()
                for position in range(ent.end, min(ent.end + 2, len(doc)))
            }
            if not (following & fatality_cues):
                continue
            label = ENTITY_MAP.get("CARDINAL", "FATALITY")
        else:
            label = ENTITY_MAP.get(ent.label_)
            if label is None:
                continue

        for position in range(ent.start, ent.end):
            tags[position] = label

    return [{"token": token.text, "label": tag} for token, tag in zip(doc, tags)]

def process_large_text(input_file, output_file, chunk_size=500000):
    """
    Processes a large text file in chunks, applies entity tagging, and saves the output to a JSON file.

    The file is read fully as UTF-8 and cut into chunks of at most
    ``chunk_size`` characters. Each chunk ends after a sentence end (". ")
    when one is found inside the window, otherwise after whitespace, so a
    word is never split across two chunks; only text with no whitespace at
    all in the window is cut at the hard limit. Each chunk is split into
    sentences with the module-level ``nlp`` pipeline and every sentence is
    tagged with ``tag_entities``.

    Args:
        input_file: Path of the text file to tag.
        output_file: Path of the JSON file to write (overwritten).
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        None. The JSON file holds a list of
        ``{"sentence": str, "tokens": [...]}`` objects and a confirmation
        line is printed.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")

    with open(input_file, "r", encoding="utf-8") as file:
        text = file.read()

    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        if end < len(text):
            # Prefer a sentence boundary, then any whitespace, so the cut
            # never lands in the middle of a word.
            for separator in (". ", " ", "\n"):
                cut = text.rfind(separator, start, end)
                if cut != -1:
                    end = cut + len(separator)
                    break
        chunks.append(text[start:end])
        start = end

    labeled_data = []

    for chunk in chunks:
        doc = nlp(chunk)
        for sent in doc.sents:
            labeled_data.append({"sentence": sent.text, "tokens": tag_entities(sent.text)})

    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(labeled_data, json_file, indent=4, ensure_ascii=False)

    print(f"✅ Tagged data saved to: {output_file}")

# Tag the cleaned text and store the labelled sentences as JSON
process_large_text(input_file="cleaned_text_5.txt", output_file="train_more_2.json")

✅ Tagged data saved to: train_more_2.json
