# Before Balancing

In [1]:
from collections import Counter

def count_ner_stats(lines):
    """Summarise a token-per-line NER dataset.

    Each non-blank element of *lines* must hold exactly two
    whitespace-separated fields, ``word label``.  A row whose word is ``.``
    and whose label is ``O`` terminates the current sentence.

    Args:
        lines: Iterable of raw text lines (trailing newlines are allowed).

    Returns:
        A tuple ``(total_rows, sentence_count, ner_counts, tag_counts)``:
        ``total_rows`` is the number of non-blank rows (sentence terminators
        included), ``sentence_count`` the number of sentences, ``ner_counts``
        a Counter keyed by entity type (``PER``, ``LOC``, ...) and
        ``tag_counts`` a Counter keyed by the full ``B-``/``I-`` tag.

    Raises:
        ValueError: If a non-blank row does not have exactly two fields.
    """
    total_rows = 0
    sentence_count = 0
    ner_counts = Counter()
    tag_counts = Counter()
    in_sentence = False

    for line_no, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            continue  # Blank lines carry no token

        fields = line.split()
        if len(fields) != 2:
            # Fail with the offending row instead of a bare unpacking error.
            raise ValueError(
                f"line {line_no}: expected 'word label', got {line!r}"
            )
        word, label = fields
        total_rows += 1

        # A '. O' row closes the current sentence
        if word == '.' and label == 'O':
            sentence_count += 1
            in_sentence = False
            continue

        in_sentence = True

        # Only entity tags are tallied; 'O' rows are counted as rows only.
        if label.startswith(('B-', 'I-')):
            # maxsplit=1 keeps entity types that themselves contain a hyphen.
            ner_counts[label.split('-', 1)[1]] += 1
            tag_counts[label] += 1

    # Tokens after the last '. O' row still form a sentence; count it.
    if in_sentence:
        sentence_count += 1

    return total_rows, sentence_count, ner_counts, tag_counts


# Notebook cells and scripts execute with __name__ == "__main__", so the
# report below runs there exactly as before; the guard only keeps the file
# read from firing when this code is imported.
if __name__ == "__main__":
    # Define the file paths
    input_file_path = 'aga.txt'

    # Read the file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    total_rows, sentence_count, ner_counts, tag_counts = count_ner_stats(lines)

    # Print results
    print(f"Total number of rows: {total_rows}")
    print(f"Total number of sentences: {sentence_count}")

    # Print NER counts
    print("\nTotal NER counts:")
    for ner_type, count in ner_counts.items():
        print(f"{ner_type}: {count}")

    # Print tag counts
    print("\nTotal tag counts:")
    for tag, count in tag_counts.items():
        print(f"{tag}: {count}")

Total number of rows: 69182
Total number of sentences: 2765

Total NER counts:
LOC: 4252
MISC: 2087
ORG: 3323
DATE: 1182
PER: 2839

Total tag counts:
B-LOC: 3462
B-MISC: 1479
I-MISC: 608
B-ORG: 1548
I-ORG: 1775
B-DATE: 562
I-DATE: 620
I-LOC: 790
B-PER: 1436
I-PER: 1403


# After Balancing

In [5]:
import random
from collections import Counter, defaultdict
from itertools import groupby

# Define the file paths
input_file_path = 'aga.txt'

# Initialize counters
total_rows = 0
sentence_count = 0
ner_counts = Counter()
tag_counts = Counter()

# Read the file and store sentences
with open(input_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

sentences = []
sentence = []
for line in lines:
    line = line.strip()
    if line == '':
        continue
    word, label = line.split()

    sentence.append((word, label))
    
    # End of a sentence
    if label == 'O' and word == '.':
        sentences.append(sentence)
        sentence = []

# Separate sentences by tag type
tag_sentences = defaultdict(list)
for sentence in sentences:
    for word, label in sentence:
        if label.startswith('B-') or label.startswith('I-'):
            ner_type = label.split('-')[1]
            tag_sentences[ner_type].append(sentence)
            break

# Calculate target size for oversampling
all_b_tags = ['PER', 'LOC', 'ORG', 'DATE', 'MISC']
b_tag_sizes = [len(tag_sentences[tag]) for tag in all_b_tags]
target_size = max(b_tag_sizes)  # Oversample everything to match the largest class

# Oversample underrepresented classes
balanced_sentences = []
for tag in all_b_tags:
    sentences_for_tag = tag_sentences[tag]
    if tag in ['DATE', 'MISC']:
        # Aggressively oversample DATE and MISC
        oversampled_sentences = random.choices(sentences_for_tag, k=target_size*3)  # Increase oversampling for DATE and MISC
    elif len(sentences_for_tag) < target_size:
        oversampled_sentences = random.choices(sentences_for_tag, k=target_size*2)  # Original sampling strategy for other tags
    else:
        oversampled_sentences = sentences_for_tag
    balanced_sentences.extend(oversampled_sentences)

# Flatten the list of sentences back into lines
balanced_lines = []
for sentence in balanced_sentences:
    for word, label in sentence:
        balanced_lines.append(f"{word} {label}")
    balanced_lines.append("")  # Add a blank line between sentences

# Save the balanced data to a new file
output_file_path = 'wlina_bd.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    out_file.write("\n".join(balanced_lines))

# Recount statistics for the balanced dataset
total_rows = len(balanced_lines) - len(balanced_sentences)  # Adjust for blank lines
sentence_count = len(balanced_sentences)

ner_counts = Counter()
tag_counts = Counter()

for sentence in balanced_sentences:
    in_sentence = False
    for word, label in sentence:
        # Count NER tags
        if label.startswith('B-') or label.startswith('I-'):
            ner_type = label.split('-')[1]
            ner_counts[ner_type] += 1
            tag_counts[label] += 1

# Print results
print(f"Total number of rows: {total_rows}")
print(f"Total number of sentences: {sentence_count}")

# Print NER counts
print("\nTotal NER counts:")
for tag in all_b_tags:
    print(f"{tag}: {ner_counts[tag]}")

# Print tag counts
print("\nTotal tag counts:")
for tag in all_b_tags:
    print(f"B-{tag}: {tag_counts[f'B-{tag}']}")
    print(f"I-{tag}: {tag_counts[f'I-{tag}']}")

Total number of rows: 210600
Total number of sentences: 7601

Total NER counts:
PER: 7256
LOC: 13669
ORG: 11242
DATE: 7878
MISC: 9714

Total tag counts:
B-PER: 3387
I-PER: 3869
B-LOC: 10926
I-LOC: 2743
B-ORG: 5144
I-ORG: 6098
B-DATE: 3454
I-DATE: 4424
B-MISC: 6607
I-MISC: 3107
