# Grammar Correction Project - Data Exploration & Testing

This notebook is designed to:

1. **Explore the dataset**: Load and inspect the dataset to understand its structure.
2. **Test preprocessing**: Tokenize sentences and verify input-output pairs.
3. **Test the model**: Load the fine-tuned model and test its grammar correction capabilities.

## Step 1: Import Required Libraries

In [None]:
import tensorflow as tf
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import matplotlib.pyplot as plt

## Step 2: Load the Dataset

In [None]:
# TFRecord schema: each serialized example carries two scalar string
# features, the source sentence and its corrected counterpart.
feature_description = {
    feature_name: tf.io.FixedLenFeature([], tf.string)
    for feature_name in ('input_text', 'target_text')
}

def parse_tfrecord(example_proto):
    """Decode one serialized example into a dict of tensors.

    Args:
        example_proto: A single serialized example, as passed in by
            ``Dataset.map`` when iterating a ``TFRecordDataset``.

    Returns:
        The output of ``tf.io.parse_single_example`` for the module-level
        ``feature_description`` schema (keys ``'input_text'`` and
        ``'target_text'``).
    """
    parsed_example = tf.io.parse_single_example(example_proto, feature_description)
    return parsed_example

# Open the TFRecord file and decode each record with parse_tfrecord
dataset_path = '../data/grammar_dataset.tfrecord'  # Update this path as necessary
raw_dataset = tf.data.TFRecordDataset(dataset_path)
parsed_dataset = raw_dataset.map(parse_tfrecord)

# Collect the first five (input, target) pairs as Python strings for a preview
preview_pairs = [
    (
        row['input_text'].numpy().decode('utf-8'),
        row['target_text'].numpy().decode('utf-8'),
    )
    for row in parsed_dataset.take(5)
]
inputs = [source for source, _ in preview_pairs]
targets = [target for _, target in preview_pairs]

# Last expression of the cell, so the notebook renders it as a table
pd.DataFrame({'Input Text': inputs, 'Target Text': targets})

## Step 3: Tokenization Example

In [None]:
# Load the tokenizer of the base T5 checkpoint from the Hugging Face Hub.
# The base model is published under the repo id 'google-t5/t5-base'
# (legacy alias: 't5-base').
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-base')

# Tokenize one example, using the same task prefix that is used at inference
# time in Step 4.
example_input = f"grammar correction: {inputs[0]}"
# Pad/truncate to a fixed length of 128 token IDs.
tokenized = tokenizer(example_input, max_length=128, padding='max_length', truncation=True)

# Display tokenized input. Special tokens are not skipped when decoding, so
# the padding added above is visible in the decoded string.
print("Tokenized Input IDs:", tokenized['input_ids'])
print("Decoded Tokens:", tokenizer.decode(tokenized['input_ids']))

## Step 4: Load and Test the Fine-Tuned Model

In [None]:
# Load the fine-tuned model and the tokenizer saved alongside it
model_path = '../models/fine_tuned_t5_base'  # Update this path as necessary
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Test the model on a sample sentence
test_sentence = "She go to market every day."
input_text = f"grammar correction: {test_sentence}"
# Deliberately named `input_ids` rather than `inputs`: `inputs` is the list of
# dataset sentences built in Step 2, and rebinding it to a tensor here would
# break any later cell that iterates over those sentences.
input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=128, truncation=True)

# Generate corrected text with beam search (5 beams, at most 128 tokens)
outputs = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True)
corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Original Sentence:", test_sentence)
print("Corrected Sentence:", corrected_sentence)

## Step 5: Visualize Training Data Distribution (Optional)

In [None]:
# Analyze length of input sentences.
# Lengths are computed directly from `parsed_dataset` instead of `inputs`:
# `inputs` only ever held the five preview sentences from Step 2 (and may have
# been rebound by a later cell), which is not enough for a distribution.
max_records = 10000  # cap for quick exploration; set to -1 to scan every record
input_lengths = [
    len(record['input_text'].numpy().decode('utf-8').split())
    for record in parsed_dataset.take(max_records)
]

# Plot histogram of whitespace-separated word counts
plt.hist(input_lengths, bins=20, color='blue', edgecolor='black')
plt.title('Distribution of Sentence Lengths')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.show()