In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf


## Load Sample Sentences

In [None]:
# Raw split files.
# NOTE(review): `raw_data_dir` and `clean_data_dir` are not defined in the
# visible cells — they must already exist in the kernel; confirm where they
# are set.
train_raw = os.path.join(raw_data_dir, "train.txt")
valid_raw = os.path.join(raw_data_dir, "valid.txt")
test_raw = os.path.join(raw_data_dir, "test.txt")

# Cleaned split files.
train_clean = os.path.join(clean_data_dir, "train_clean.txt")
valid_clean = os.path.join(clean_data_dir, "valid_clean.txt")
test_clean = os.path.join(clean_data_dir, "test_clean.txt")

# NOTE(review): `train_path` is not defined in the visible cells, while
# `train_raw` / `train_clean` above are never read — presumably one of them is
# the intended file; confirm before relying on a fresh-kernel run.
with open(train_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

print("Total lines in train:", len(lines))
print("Sample lines:\n")
# Slice rather than index 0..4 so a file with fewer than five lines
# prints what it has instead of raising IndexError.
for i, line in enumerate(lines[:5], start=1):
    print(f"{i}: {line.strip()}")


## Basic Sentence Statistics

In [None]:
from collections import Counter

# Flatten the corpus into a single list of whitespace-separated words.
all_words = []
for sentence in lines:
    all_words.extend(sentence.split())

num_sentences = len(lines)
num_words = len(all_words)
unique_words = set(all_words)

# Occurrence count for every distinct word.
word_counts = Counter(all_words)

print(f"Total sentences: {num_sentences}")
print(f"Total words: {num_words}")
print(f"Unique words: {len(unique_words)}")
print("Top 10 words:", word_counts.most_common(10))


## Sentence Length Distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of whitespace-separated words on each line.
sentence_lengths = [len(line.split()) for line in lines]

# Histogram with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(sentence_lengths, bins=30, kde=True, ax=ax)
ax.set_title("Sentence Length Distribution (Train Set)")
ax.set_xlabel("Words per Sentence")
ax.set_ylabel("Frequency")
plt.show()


## Vocabulary Inspection

In [None]:
# Read the vocabulary file: one entry per line, surrounding whitespace removed.
with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
    vocab = [entry.strip() for entry in vocab_file]

print("Total vocab words:", len(vocab))
print("First 20 vocab entries:", vocab[:20])


## Token Frequency Distribution

In [None]:
# Load tokenizer from your pipeline.
# Pass the `vocab_path` variable (the path opened in the vocabulary-inspection
# cell), not the literal string "vocab_path".
tokenizer, _ = build_tokenizer(vocab_path)

# Tokenize a sample of lines (first 10k, for performance) and count token IDs.
# NOTE(review): `raw_lines` is not defined in the visible cells — confirm where
# it is loaded (the earlier load cell stores its result in `lines`).
token_counts = Counter()
for line in raw_lines[:10000]:
    # Distinct name so the per-line IDs are not confused with the
    # top-token tuple unpacked into `tokens` below.
    token_ids = tokenizer.tokenize(line).numpy().tolist()
    token_counts.update(token_ids)

# Top 20 most frequent tokens, split into parallel (ids, counts) tuples.
top_tokens = token_counts.most_common(20)
tokens, freqs = zip(*top_tokens)

plt.figure(figsize=(12, 6))
sns.barplot(x=list(tokens), y=list(freqs))
plt.xticks(rotation=45)
plt.title("Top 20 Most Frequent Tokens")
plt.xlabel("Token ID")
plt.ylabel("Frequency")
plt.show()



## Sequence Lengths After Tokenization

In [None]:
# Token count for each of the first 10,000 entries of `raw_lines`.
seq_lens = [len(tokenizer.tokenize(line)) for line in raw_lines[:10000]]

fig, ax = plt.subplots()
ax.hist(seq_lens, bins=40, color='skyblue')
ax.set_title("Tokenized Sequence Lengths")
ax.set_xlabel("Token count")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()


## Inspect a Batch From train_ds

In [None]:
# Take one batch and show its shapes, then the first example both as raw IDs
# and as tokens looked up through the tokenizer.
for inputs, labels in train_ds.take(1):
    print("Inputs shape:", inputs.shape)
    print("Labels shape:", labels.shape)

    first_example = inputs[0].numpy()
    print("First input example:\n", first_example)

    decoded_tokens = list(map(tokenizer.id_to_token, first_example))
    print("Decoded tokens:\n", decoded_tokens)


## Analyze Padding Ratio

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# ID the tokenizer maps "[PAD]" to; compute_padding_ratio compares input
# elements against this value to count padding positions.
PAD_TOKEN_ID = tokenizer.token_to_id("[PAD]")

def compute_padding_ratio(dataset, sample_batches=100, pad_token_id=None):
    """Return the fraction of input elements equal to the padding token ID.

    Args:
        dataset: Object exposing ``take(n)`` that yields ``(inputs, labels)``
            pairs, where ``inputs`` provides ``.numpy()``.
        sample_batches: Maximum number of batches to examine.
        pad_token_id: ID counted as padding. When omitted, the notebook-level
            ``PAD_TOKEN_ID`` is used.

    Returns:
        ``pad_tokens / total_tokens`` over the sampled batches, or ``0.0``
        when no elements were seen (empty dataset or ``sample_batches=0``).
    """
    if pad_token_id is None:
        pad_token_id = PAD_TOKEN_ID

    total_tokens = 0
    pad_tokens = 0

    for inputs, _ in dataset.take(sample_batches):
        inputs_np = inputs.numpy()
        total_tokens += inputs_np.size
        pad_tokens += np.sum(inputs_np == pad_token_id)

    # Nothing sampled: report zero padding instead of dividing by zero.
    if total_tokens == 0:
        return 0.0
    return pad_tokens / total_tokens

# Estimate the padding share of `train_ds` using the function's default
# sample size (up to 100 batches) and print it as a percentage.
padding_ratio = compute_padding_ratio(train_ds)
print(f"🔢 Padding Ratio in `train_ds`: {padding_ratio:.2%}")
