Text Summarization Model Training

This notebook demonstrates the process of training a text summarization model using the provided dataset.

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import json
import sys
from pathlib import Path
import os
import ijson
import json


# Add the project root directory to Python path.
# The notebook may be launched from the project root or from a `notebooks`
# sub-directory. Compare the directory *name* instead of searching the whole
# path string, which would also match unrelated ancestors (for example
# `.../notebooks-archive/project`) and wrongly select the parent directory.
project_root = str(Path.cwd().parent) if Path.cwd().name == 'notebooks' else str(Path.cwd())
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom modules
from src.preprocessing import load_data, preprocess_data, split_and_save_data
from src.model import initialize_model, train_model, save_model, summarize
from src.evaluation import calculate_rouge, calculate_bleu
from src.visualizations import plot_loss

In [None]:
# Define paths
# Both paths are relative to the notebook's working directory so the notebook
# is not tied to a single user's machine.
# NOTE(review): assumes the kernel runs from a folder directly below the
# project root (as the other "../" paths in this notebook do) - confirm.
raw_data_path = "../data/raw/sample.jsonl"
processed_data_dir = "../data/processed"

# Load and preprocess data
print("Loading raw data...")
raw_data = load_data(raw_data_path)

print("Preprocessing data...")
articles, summaries = preprocess_data(raw_data)

# Fail early with a clear message rather than an IndexError on the sample
# statistics printed at the end of this cell.
if len(articles) == 0 or len(summaries) == 0:
    raise ValueError(f"No article/summary pairs were produced from {raw_data_path!r}")

# Split and save processed data
print("Splitting and saving processed data...")
split_and_save_data(articles, summaries, output_dir=processed_data_dir)

print(f"Number of articles: {len(articles)}")
print(f"Sample article length: {len(articles[0])} characters")
print(f"Sample summary length: {len(summaries[0])} characters")

In [None]:
# Load training data
print("Loading training data...")
MAX_TRAIN_EXAMPLES = 100000  # upper bound on the number of pairs kept in memory

train_data = []
# ijson parses bytes, so the file is opened in binary mode.
# NOTE(review): this presumes train.json is UTF-8 (or plain ASCII) - confirm
# against how split_and_save_data writes it.
with open(os.path.join(processed_data_dir, "train.json"), "rb") as f:
    # Stream one record at a time so each article is paired with the highlights
    # of the *same* record, whatever the key order, and stop reading as soon as
    # the cap is reached instead of loading the whole file and truncating.
    # NOTE(review): assumes train.json is a top-level JSON array of objects,
    # the same shape test.json is treated as later in this notebook - confirm.
    for record in ijson.items(f, "item"):
        if not isinstance(record, dict):
            continue
        article = record.get('article')
        highlights = record.get('highlights')
        # Skip incomplete records rather than borrowing a field from a neighbour.
        if article is None or highlights is None:
            continue
        train_data.append({'article': article, 'highlights': highlights})
        if len(train_data) >= MAX_TRAIN_EXAMPLES:
            break

if not train_data:
    raise ValueError("No usable article/highlights pairs were found in train.json")
print(f"Loaded {len(train_data)} training examples")
    
# Initialize model
# `model` and `tokenizer` are reused by the evaluation and saving cells below.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print("Initializing model...")
model, tokenizer = initialize_model(model_name)

# Train model
# Runs the project's training helper on the loaded examples for 5 epochs.
print("Training model...")
training_stats = train_model(
    model,
    tokenizer,
    train_data,
    epochs=5,
)

# Plot training loss
# The stats returned by train_model are indexed by 'losses' for the plot.
losses = training_stats['losses']
plot_loss(losses)

In [None]:
# Load test data
print("Loading test data...")
with open(os.path.join(processed_data_dir, "test.json"), "r") as f:
    test_data = json.load(f)

# Evaluate on sample from test set
print("\nEvaluating model on sample test data:")
sample_items = test_data[:5]
if not sample_items:
    raise ValueError("test.json contains no examples to evaluate")

rouge_scores = []
bleu_scores = []
generated_summaries = []  # kept so the reported examples match the scored ones

for item in sample_items:
    generated_summary = summarize(model, tokenizer, item['article'])
    rouge = calculate_rouge(item['highlights'], generated_summary)
    bleu = calculate_bleu(item['highlights'], generated_summary)

    generated_summaries.append(generated_summary)
    rouge_scores.append(rouge)
    bleu_scores.append(bleu)


def _mean_score(values):
    """Average metric values as a plain float so the result is JSON-serializable.

    Values exposing an `fmeasure` attribute contribute that attribute;
    anything else is converted with float().
    """
    numbers = [float(getattr(value, 'fmeasure', value)) for value in values]
    return sum(numbers) / len(numbers)


# Average scores over the evaluated sample.
# NOTE(review): assumes each calculate_rouge result is a mapping keyed by
# 'rouge1', 'rouge2' and 'rougeL' (the keys reported below) and that
# calculate_bleu returns a number - confirm against src.evaluation.
avg_rouge1 = _mean_score([score['rouge1'] for score in rouge_scores])
avg_rouge2 = _mean_score([score['rouge2'] for score in rouge_scores])
avg_rougeL = _mean_score([score['rougeL'] for score in rouge_scores])
avg_bleu = _mean_score(bleu_scores)

print(f"ROUGE-1: {avg_rouge1:.4f}  ROUGE-2: {avg_rouge2:.4f}  "
      f"ROUGE-L: {avg_rougeL:.4f}  BLEU: {avg_bleu:.4f}")

# Results dictionary
results = {
    'average_scores': {
        'rouge1': avg_rouge1,
        'rouge2': avg_rouge2,
        'rougeL': avg_rougeL,
        'bleu': avg_bleu
    },
    'example_predictions': [
        {
            'article': item['article'],
            'reference_summary': item['highlights'],
            # Reuse the summary that was scored instead of generating a new one.
            'generated_summary': generated,
        }
        for item, generated in zip(sample_items, generated_summaries)
    ]
}

In [None]:
# Create results dictionary
# The averages are derived here from the per-example scores collected by the
# evaluation loop, so this cell does not rely on pre-computed averages.
if not rouge_scores or not bleu_scores:
    raise ValueError("No evaluation scores available; run the evaluation cell first")

# NOTE(review): assumes each calculate_rouge result is a mapping keyed by
# 'rouge1', 'rouge2' and 'rougeL', whose values are either plain numbers or
# objects exposing `fmeasure` - confirm against src.evaluation.
average_scores = {
    key: sum(float(getattr(score[key], 'fmeasure', score[key])) for score in rouge_scores)
    / len(rouge_scores)
    for key in ('rouge1', 'rouge2', 'rougeL')
}
average_scores['bleu'] = sum(float(score) for score in bleu_scores) / len(bleu_scores)

results = {
    'average_scores': average_scores,
    'example_predictions': [
        {
            'article': item['article'],
            # The test records store the reference summary under 'highlights',
            # the same key the evaluation loop reads.
            'reference_summary': item['highlights'],
            # Use the imported summarize() helper with the trained model.
            'generated_summary': summarize(model, tokenizer, item['article']),
        }
        for item in test_data[:5]
    ]
}

# Save results
os.makedirs('../reports', exist_ok=True)
with open('../reports/evaluation_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Evaluation results saved to reports/evaluation_results.json")

In [None]:

# Save model
# Hand the trained model and its tokenizer to the project's save helper.
model_output_dir = "../models/tiny-llama-model"
print("Saving model...")
save_model(model, tokenizer, model_output_dir)