In [None]:
# Notebook setup: upgrade spaCy and download the hr_core_news_sm Croatian model.
# Standard output of both shell commands is redirected to /dev/null.
!pip install -U spacy > /dev/null
!python -m spacy download hr_core_news_sm > /dev/null
# NOTE: this message is printed unconditionally; it does not verify that the
# two commands above actually succeeded.
print("spaCy and Croatian model (hr_core_news_sm) installed/updated.")

import spacy
import re
# Load the Croatian pipeline. When the model package cannot be found, `nlp`
# is set to None so that later code can detect the missing model.
try:
    nlp = spacy.load("hr_core_news_sm")
except OSError:
    print("Model 'hr_core_news_sm' not found. Please ensure it was downloaded correctly.")
    nlp = None
else:
    print("Croatian model loaded successfully.")

# Line prefixes that identify metadata fields of an article instance. Lines
# starting with any of these are treated as metadata rather than article text.
METADATA_PREFIXES = [
    "Novina:", "Datum:", "Rubrika:", "Nadnaslov:",
    "Naslov:", "Podnaslov:", "Strana:", "Autori:",
]

def count_article_tokens(instance_text, nlp_model):
    """Count the tokens in the body text of a single article instance.

    Blank lines and lines that begin with any entry of ``METADATA_PREFIXES``
    are discarded. The remaining lines are stripped, joined with single
    spaces, and passed to ``nlp_model``.

    Args:
        instance_text: Raw text of one instance (metadata lines plus body).
        nlp_model: Callable that turns a string into a sized document.
            A falsy value means no model is available.

    Returns:
        ``len()`` of the document produced by ``nlp_model``, or 0 when no
        model is given or no body text remains after filtering.
    """
    if not nlp_model:
        return 0

    stripped_lines = (raw.strip() for raw in instance_text.strip().split('\n'))
    body_lines = [
        text
        for text in stripped_lines
        if text and not any(text.startswith(prefix) for prefix in METADATA_PREFIXES)
    ]

    # Every kept line is non-empty, so an empty list means there is no body.
    if not body_lines:
        return 0

    return len(nlp_model(" ".join(body_lines)))

from google.colab import files

# Ask the user for the corpus file through the Colab upload widget.
print("Please upload your .txt file:")
uploaded = files.upload()

# Fail with a clear message instead of a bare StopIteration from next() when
# nothing was uploaded (presumably what a cancelled dialog yields; verify
# against the Colab documentation).
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select a .txt file.")

# Only the first uploaded file is processed.
file_name = next(iter(uploaded))
# 'utf-8-sig' decodes ordinary UTF-8 and additionally drops a leading BOM.
# A retained BOM would keep the first metadata line from matching its prefix
# and would make an otherwise blank first segment count as an instance.
file_content = uploaded[file_name].decode('utf-8-sig')
print(f"File '{file_name}' uploaded successfully.")

# Split the uploaded text into article instances, count the tokens of each
# one, and print per-instance counts followed by a summary.
if nlp:
    # Instances are separated by the literal "<***>" delimiter.
    instances = file_content.split("<***>")

    # Drop blank segments (e.g. text before a leading delimiter) up front so
    # that the printed instance numbers are consecutive and agree with the
    # "instances processed" figure in the summary.
    non_empty_instances = [
        segment.strip() for segment in instances if segment.strip()
    ]

    instance_token_counts = []

    print("\n--- Token Counts per Article Instance ---")
    for number, instance_str_cleaned in enumerate(non_empty_instances, start=1):
        token_count = count_article_tokens(instance_str_cleaned, nlp)
        instance_token_counts.append(token_count)
        # Printed inside the loop so progress is visible while counting runs.
        print(f"Instance {number}: {token_count} tokens")

    total_tokens_all_articles = sum(instance_token_counts)

    print("\n--- Summary ---")
    print(f"Total number of instances processed: {len(instance_token_counts)}")
    print(f"Total tokens in all articles: {total_tokens_all_articles}")
    # Skip the average when nothing was processed to avoid dividing by zero.
    if instance_token_counts:
        avg_tokens = total_tokens_all_articles / len(instance_token_counts)
        print(f"Average tokens per article: {avg_tokens:.2f}")

else:
    print("Cannot proceed without a loaded spaCy model.")

spaCy and Croatian model (hr_core_news_sm) installed/updated.
Croatian model loaded successfully.
Please upload your .txt file:


Saving Infobiro - Dnevni Avaz - 2005 (1-2).txt to Infobiro - Dnevni Avaz - 2005 (1-2) (1).txt
File 'Infobiro - Dnevni Avaz - 2005 (1-2) (1).txt' uploaded successfully.

--- Token Counts per Article Instance ---
Instance 2: 207 tokens
Instance 3: 160 tokens
Instance 4: 672 tokens
Instance 5: 870 tokens
Instance 6: 305 tokens
Instance 7: 199 tokens
Instance 8: 248 tokens
Instance 9: 95 tokens
Instance 10: 129 tokens
Instance 11: 365 tokens
Instance 12: 206 tokens
Instance 13: 62 tokens
Instance 14: 646 tokens
Instance 15: 313 tokens
Instance 16: 233 tokens
Instance 17: 398 tokens
Instance 18: 162 tokens
Instance 19: 186 tokens
Instance 20: 148 tokens
Instance 21: 374 tokens
Instance 22: 504 tokens
Instance 23: 114 tokens
Instance 24: 558 tokens
Instance 25: 515 tokens
Instance 26: 99 tokens
Instance 27: 239 tokens
Instance 28: 873 tokens
Instance 29: 186 tokens
Instance 30: 77 tokens
Instance 31: 401 tokens
Instance 32: 374 tokens
Instance 33: 333 tokens
Instance 34: 86 tokens
Instance 3