# Dataset File - Split
# Train/ Validate/ Test ===> 70/20/10

# Dataset Statistics

In [22]:
def analyze_ner_tags(text):
    """
    Count NER tags, tokens and sentences in CoNLL-style text.

    Every non-empty line is treated as one token, and the last
    whitespace-separated field of that line is taken as its tag
    (e.g. ``word B-LOC`` or ``. O``).  Only that tag field is inspected, so a
    word that happens to be ``O`` or to look like ``B-PER`` is never counted
    as a tag.

    Args:
        text: The whole dataset as a single string.

    Returns:
        A dict with four keys:
            'TOTAL_TAGS':   counts per category (PER, LOC, ORG, DATE, MISC, O);
                            any hyphenated tag is folded into the category
                            after its last hyphen.
            'BIO_TAGS':     counts for the B-/I- tag of each entity category,
                            plus 'O' (always present, 0 when no 'O' was seen).
            'SENTENCES':    number of sentences (see the note in the loop).
            'TOTAL_TOKENS': number of non-empty lines.
    """
    # Split the text into lines
    lines = text.strip().split('\n')

    # Entity categories; 'O' (outside) has no B-/I- variants and is added separately
    entity_categories = ['PER', 'LOC', 'ORG', 'DATE', 'MISC']

    total_tags = {cat: 0 for cat in entity_categories}
    total_tags['O'] = 0

    bio_tags = {}
    for cat in entity_categories:
        bio_tags[f'B-{cat}'] = 0
        bio_tags[f'I-{cat}'] = 0
    bio_tags['O'] = 0

    tag_counts = {
        'TOTAL_TAGS': total_tags,
        'BIO_TAGS': bio_tags,
        'SENTENCES': 0,
        'TOTAL_TOKENS': 0  # Count total number of tokens (lines with tags)
    }

    # Set once a '. O' line has been seen and no sentence has been counted since
    is_sentence_end = False

    # Analyze lines
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:  # Skip empty lines
            continue

        # Count total tokens (non-empty lines)
        tag_counts['TOTAL_TOKENS'] += 1

        # The tag is the last column; earlier columns belong to the word
        tag = line.split()[-1]

        if tag == 'O':
            total_tags['O'] += 1
            bio_tags['O'] += 1
        elif '-' in tag:
            # 'B-PER' -> 'PER'
            category = tag.split('-')[-1]
            if category in total_tags:
                total_tags[category] += 1
            if tag in bio_tags:
                bio_tags[tag] += 1

        # Check if the current line ends a sentence ('. O')
        if line == '. O':
            is_sentence_end = True

        # Sentence counting: a sentence is counted on a line that is followed
        # by a blank line (or is the last line) once a '. O' line has been
        # seen.  The flag stays set until that happens, so a '. O' that is not
        # directly followed by a blank line is still credited at the next
        # blank-line boundary.
        if is_sentence_end and (i + 1 >= len(lines) or not lines[i + 1].strip()):
            tag_counts['SENTENCES'] += 1
            is_sentence_end = False  # Reset for the next sentence

    return tag_counts

# Read text from a file and analyze the tags
def analyze_from_file(file_path):
    """
    Read a UTF-8 text file, run analyze_ner_tags on its contents and print
    the resulting tag, sentence and token counts to standard output.

    Args:
        file_path: Path of the dataset file to analyze.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()

    stats = analyze_ner_tags(contents)

    # Each section is a header line followed by one "name: count" line per entry
    sections = (
        ("Total Tags:", 'TOTAL_TAGS'),
        ("\nBIO Tags:", 'BIO_TAGS'),
    )
    for header, key in sections:
        print(header)
        for tag in stats[key]:
            count = stats[key][tag]
            print(f"{tag}: {count}")

    print(f"\nTotal Sentences: {stats['SENTENCES']}")
    print(f"Total Tokens: {stats['TOTAL_TOKENS']}")

# Dataset path; run the function (analyze_from_file) on it to print the statistics
file_path = 'AgaCKNER_Dataset.txt'
analyze_from_file(file_path)

Total Tags:
PER: 2814
LOC: 3576
ORG: 4207
DATE: 1532
MISC: 2775
O: 49659

BIO Tags:
B-PER: 1439
I-PER: 1375
B-LOC: 2732
I-LOC: 844
B-ORG: 1956
I-ORG: 2251
B-DATE: 718
I-DATE: 814
B-MISC: 2000
I-MISC: 775
O: 49659

Total Sentences: 2534
Total Tokens: 64563


# -------------------------------------------------------------------

# Splitting - CODE

In [24]:
import random
import os

def analyze_ner_tags(text):
    """
    Count NER tags, tokens and sentences in CoNLL-style text.

    Every non-empty line is treated as one token, and the last
    whitespace-separated field of that line is taken as its tag
    (e.g. ``word B-LOC`` or ``. O``).  Only that tag field is inspected, so a
    word that happens to be ``O`` or to look like ``B-PER`` is never counted
    as a tag.

    Args:
        text: The whole dataset as a single string.

    Returns:
        A dict with four keys:
            'TOTAL_TAGS':   counts per category (PER, LOC, ORG, DATE, MISC, O);
                            any hyphenated tag is folded into the category
                            after its last hyphen.
            'BIO_TAGS':     counts for the B-/I- tag of each entity category,
                            plus 'O' (always present, 0 when no 'O' was seen).
            'SENTENCES':    number of sentences (see the note in the loop).
            'TOTAL_TOKENS': number of non-empty lines.
    """
    # Split the text into lines
    lines = text.strip().split('\n')

    # Entity categories; 'O' (outside) has no B-/I- variants and is added separately
    entity_categories = ['PER', 'LOC', 'ORG', 'DATE', 'MISC']

    total_tags = {cat: 0 for cat in entity_categories}
    total_tags['O'] = 0

    bio_tags = {}
    for cat in entity_categories:
        bio_tags[f'B-{cat}'] = 0
        bio_tags[f'I-{cat}'] = 0
    bio_tags['O'] = 0

    tag_counts = {
        'TOTAL_TAGS': total_tags,
        'BIO_TAGS': bio_tags,
        'SENTENCES': 0,
        'TOTAL_TOKENS': 0  # Count total number of tokens (lines with tags)
    }

    # Set once a '. O' line has been seen and no sentence has been counted since
    is_sentence_end = False

    # Analyze lines
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue

        # Count total tokens (non-empty lines)
        tag_counts['TOTAL_TOKENS'] += 1

        # The tag is the last column; earlier columns belong to the word
        tag = line.split()[-1]

        if tag == 'O':
            total_tags['O'] += 1
            bio_tags['O'] += 1
        elif '-' in tag:
            # 'B-PER' -> 'PER'
            category = tag.split('-')[-1]
            if category in total_tags:
                total_tags[category] += 1
            if tag in bio_tags:
                bio_tags[tag] += 1

        # Check if the current line ends a sentence ('. O')
        if line == '. O':
            is_sentence_end = True

        # Sentence counting: a sentence is counted on a line that is followed
        # by a blank line (or is the last line) once a '. O' line has been
        # seen.  The flag stays set until that happens, so a '. O' that is not
        # directly followed by a blank line is still credited at the next
        # blank-line boundary.
        if is_sentence_end and (i + 1 >= len(lines) or not lines[i + 1].strip()):
            tag_counts['SENTENCES'] += 1
            is_sentence_end = False  # Reset for the next sentence

    return tag_counts

# Read text from a file
def analyze_from_file(file_path):
    """
    Load the UTF-8 file at file_path, analyze it with analyze_ner_tags and
    print the per-category counts, the BIO-tag counts, the sentence count and
    the token count.

    Args:
        file_path: Path of the dataset file to analyze.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    summary = analyze_ner_tags(raw_text)
    category_counts = summary['TOTAL_TAGS']
    bio_counts = summary['BIO_TAGS']

    # Per-category totals
    print("Total Tags:")
    for name in category_counts:
        print(f"{name}: {category_counts[name]}")

    # Individual B-/I-/O tag totals
    print("\nBIO Tags:")
    for name in bio_counts:
        print(f"{name}: {bio_counts[name]}")

    print(f"\nTotal Sentences: {summary['SENTENCES']}")
    print(f"Total Tokens: {summary['TOTAL_TOKENS']}")

# Splitting
def split_ner_dataset(input_file, train_file, val_file, test_file, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1):
    """
    Split a CoNLL format NER dataset into train, validation, and test sets.

    The input is read line by line; a sentence is every run of lines up to
    and including a line equal to '. O'.  Blank lines are kept as part of the
    sentence they precede, so the separators found in the input are carried
    over to the output files.  Sentences are shuffled with a fixed seed (42)
    and then sliced: the first int(total * train_ratio) go to the train file,
    the next int(total * val_ratio) to the validation file and everything
    left over to the test file.

    Args:
        input_file: Path of the dataset to split (read as UTF-8).
        train_file: Output path for the training set.
        val_file: Output path for the validation set.
        test_file: Output path for the test set.
        train_ratio: Fraction of sentences for the training set.
        val_ratio: Fraction of sentences for the validation set.
        test_ratio: Only used in the printed report; the test set always
            receives the remainder.

    Side effects:
        Overwrites the three output files, reseeds the global ``random``
        generator and prints the split statistics.
    """
    # Read the dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Extract sentences
    sentences = []
    current_sentence = []

    for line in lines:
        # Strip trailing whitespace
        line = line.rstrip()

        # Add line to current sentence
        current_sentence.append(line)

        # Check if sentence ends with ". O"
        if line == '. O':
            # Add the complete sentence
            sentences.append(current_sentence)
            current_sentence = []

    # Keep a trailing sentence that lacks the final '. O', but ignore a
    # remainder made only of blank lines so it is not counted as a sentence
    if any(current_sentence):
        sentences.append(current_sentence)

    # Shuffle the sentences
    random.seed(42)
    random.shuffle(sentences)

    # Calculate split indices
    total_samples = len(sentences)
    train_end = int(total_samples * train_ratio)
    val_end = train_end + int(total_samples * val_ratio)

    # Split the dataset
    train_set = sentences[:train_end]
    val_set = sentences[train_end:val_end]
    test_set = sentences[val_end:]

    # Write to files
    def write_set(file_path, data_set):
        with open(file_path, 'w', encoding='utf-8') as f:
            for sentence in data_set:
                # One line per token; no extra blank line is appended here
                f.write('\n'.join(sentence) + '\n')

    write_set(train_file, train_set)
    write_set(val_file, val_set)
    write_set(test_file, test_set)

    # Print split statistics (percentages are the requested ratios)
    print("\nSplit Dataset Statistics:")
    print(f"Total sentences: {total_samples}")
    print(f"Train set: {len(train_set)} sentences ({train_ratio*100}%)")
    print(f"Validation set: {len(val_set)} sentences ({val_ratio*100}%)")
    print(f"Test set: {len(test_set)} sentences ({test_ratio*100}%)")

def main():
    """
    Print the statistics of 'AgaCKNER_Dataset.txt', then split it into
    train/validation/test files inside the 'splits' directory.
    """
    dataset_path = 'AgaCKNER_Dataset.txt'

    # Report tag, sentence and token counts for the full dataset first
    analyze_from_file(dataset_path)

    # Make sure the output directory exists before anything is written
    os.makedirs('splits', exist_ok=True)

    # Split with the default ratios of split_ner_dataset
    split_ner_dataset(
        dataset_path,
        'splits/train.txt',
        'splits/val.txt',
        'splits/test.txt',
    )

# Run main() only when this code is executed directly, not when it is imported
if __name__ == '__main__':
    main()

Total Tags:
PER: 2814
LOC: 3576
ORG: 4207
DATE: 1532
MISC: 2775
O: 49659

BIO Tags:
B-PER: 1439
I-PER: 1375
B-LOC: 2732
I-LOC: 844
B-ORG: 1956
I-ORG: 2251
B-DATE: 718
I-DATE: 814
B-MISC: 2000
I-MISC: 775
O: 49659

Total Sentences: 2534
Total Tokens: 64563

Split Dataset Statistics:
Total sentences: 2534
Train set: 1773 sentences (70.0%)
Validation set: 506 sentences (20.0%)
Test set: 255 sentences (10.0%)


In [1]:
import random
import os

# Step 1: Read the Input Dataset
def read_input_file(input_file):
    """
    Return every line of the UTF-8 file at input_file as a list of strings,
    with the line endings left in place.

    The callers in this file expect one 'word TAG' pair per line
    (e.g. word B-LOC); this function itself does not check the format.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        return list(handle)


# Step 2: Extract Sentences
def extract_sentences(lines):
    """
    Groups lines into sentences. A sentence ends when a line equals '. O'.

    Trailing whitespace is removed from every line and blank lines are
    dropped.  Lines that follow the last '. O' are returned as a final
    sentence instead of being discarded, so no token is lost when the input
    does not end with a sentence terminator.

    Args:
        lines: Iterable of raw text lines (with or without line endings).

    Returns:
        A list of sentences, each a list of the stripped, non-empty lines
        that make it up.
    """
    sentences = []
    current_sentence = []

    for raw_line in lines:
        line = raw_line.rstrip()  # Remove trailing whitespace
        if not line:  # Skip empty lines
            continue

        current_sentence.append(line)
        if line == '. O':  # End of a sentence
            sentences.append(current_sentence)
            current_sentence = []  # Reset for next sentence

    # Keep a trailing sentence that has no closing '. O'
    if current_sentence:
        sentences.append(current_sentence)

    return sentences


# Step 3: Shuffle the Sentences
def shuffle_sentences(sentences, seed=42):
    """
    Randomizes the order of sentences in place.

    A dedicated random.Random(seed) generator is used, so the result is
    reproducible and the process-wide random state is left untouched.  With
    the default seed the order is the same as seeding the global generator
    with 42 and calling random.shuffle.

    Args:
        sentences: Mutable sequence to shuffle in place.
        seed: Seed for the private generator (default 42).

    Returns:
        None; the list is modified in place.
    """
    random.Random(seed).shuffle(sentences)


# Step 4: Split the Dataset
def split_dataset(sentences, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1):
    """
    Cut sentences into consecutive train, validation and test slices.

    The train slice holds the first int(len * train_ratio) items, the
    validation slice the next int(len * val_ratio) items, and the test slice
    whatever remains.  test_ratio is accepted for symmetry but is not used
    in the computation.

    Returns:
        A (train_set, val_set, test_set) tuple.
    """
    sentence_count = len(sentences)
    train_size = int(sentence_count * train_ratio)
    val_size = int(sentence_count * val_ratio)
    val_stop = train_size + val_size

    return (
        sentences[:train_size],
        sentences[train_size:val_stop],
        sentences[val_stop:],
    )


# Step 5: Write the Subsets to Output Files
def write_subsets_to_files(train_set, val_set, test_set, train_file, val_file, test_file):
    """
    Write the train, validation and test subsets to their output files.

    Each file is opened for writing as UTF-8 (replacing any existing
    content).  Every sentence is written as its lines joined by newlines and
    followed by one newline, so no blank line separates the sentences.
    """
    targets = (
        (train_file, train_set),
        (val_file, val_set),
        (test_file, test_set),
    )
    for path, subset in targets:
        with open(path, 'w', encoding='utf-8') as out:
            for sentence in subset:
                out.write('\n'.join(sentence) + '\n')


# Step 6: Print Split Statistics
def print_split_statistics(sentences, train_set, val_set, test_set):
    """
    Prints the total number of sentences and the size of the train,
    validation and test sets, each with its share of the total as a
    percentage (two decimals).

    When sentences is empty every share is reported as 0.00% instead of
    raising ZeroDivisionError.
    """
    total_samples = len(sentences)

    def share(count):
        # Percentage of the whole dataset; guarded against an empty dataset
        if total_samples == 0:
            return 0.0
        return (count / total_samples) * 100

    train_samples = len(train_set)
    val_samples = len(val_set)
    test_samples = len(test_set)

    print(f"\nTotal sentences: {total_samples}")
    print(f"Train set: {train_samples} sentences ({share(train_samples):.2f}%)")
    print(f"Validation set: {val_samples} sentences ({share(val_samples):.2f}%)")
    print(f"Test set: {test_samples} sentences ({share(test_samples):.2f}%)")


# Main Function: Putting it All Together
def main():
    """
    Run the full pipeline on 'AgaCKNER_Dataset.txt': read, group into
    sentences, shuffle, split, write the subsets into 'splits' and print the
    split statistics.
    """
    # Steps 1-2: load the raw lines and group them into sentences
    sentences = extract_sentences(read_input_file('AgaCKNER_Dataset.txt'))

    # Step 3: shuffle in place
    shuffle_sentences(sentences)

    # Step 4: split with the default ratios
    train_set, val_set, test_set = split_dataset(sentences)

    # Step 5: write each subset to its file inside the output directory
    output_dir = 'splits'
    os.makedirs(output_dir, exist_ok=True)
    train_path = os.path.join(output_dir, 'train.txt')
    val_path = os.path.join(output_dir, 'val.txt')
    test_path = os.path.join(output_dir, 'test.txt')
    write_subsets_to_files(train_set, val_set, test_set, train_path, val_path, test_path)

    # Step 6: report the sizes of the subsets
    print_split_statistics(sentences, train_set, val_set, test_set)


# Run the main function only when this code is executed directly, not when it is imported
if __name__ == '__main__':
    main()


Total sentences: 2534
Train set: 1773 sentences (69.97%)
Validation set: 506 sentences (19.97%)
Test set: 255 sentences (10.06%)
