Text Summarization Model Training

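This notebook fine-tunes the TinyLlama-1.1B-Chat model for text summarization: it samples and preprocesses the raw train/test/validation CSV files, trains the model, evaluates it on a few test examples with ROUGE and BLEU, and saves the evaluation results and the trained model.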

In [1]:
!pip install torch transformers scikit-learn rouge-score nltk fastapi uvicorn matplotlib pandas numpy seaborn emoji



In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import json
import sys
from pathlib import Path
import os
import nltk
# Fetch the NLTK resources used by this notebook (each call is a no-op when
# the package is already up to date).
for package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(package)

# Make the project's `src` package importable.
# When the working directory path contains "notebooks", the project root is
# taken to be its parent; otherwise the working directory itself is used.
if 'notebooks' in str(Path.cwd()):
    project_root = str(Path.cwd().parent)
else:
    project_root = str(Path.cwd())

# Only register the root once so re-running the cell does not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom modules
from src.preprocessing import load_data, preprocess_data, save_data, preprocess_batch
from src.model import initialize_model, train_model, save_model, summarize
from src.evaluation import calculate_rouge, calculate_bleu
from src.visualizations import plot_loss

[nltk_data] Downloading package punkt to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package 

In [3]:
# Sample and preprocess every raw dataset split, writing the results to
# data/processed as processed_<split>.csv.
#
# `project_root` is reused from the imports cell, which already resolves the
# root whether the notebook is run from the project root or from notebooks/.
# Recomputing it here as "cwd/.." pointed outside the project when the kernel
# was started in the project root.
raw_data_dir = os.path.join(project_root, "data", "raw")
processed_data_dir = os.path.join(project_root, "data", "processed")

print("Raw Data Path:", raw_data_dir)
print("Processed Data Path:", processed_data_dir)

# Fraction of each split to keep (passed to DataFrame.sample as `frac`)
SAMPLE_SIZE = 0.10

files = ["train.csv", "test.csv", "validation.csv"]
for file in files:
    file_path = os.path.join(raw_data_dir, file)

    # Show the path being read so a missing file is easy to diagnose
    print(f"Looking for file: {file_path}")

    # Fail early with the full path instead of an error from inside load_data
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    data = load_data(file_path)
    # Fixed random_state keeps the sample reproducible across runs
    sampled_data = data.sample(frac=SAMPLE_SIZE, random_state=42)
    articles, highlights = preprocess_data(sampled_data)
    save_data(articles, highlights, processed_data_dir, f"processed_{file}")
    print(f"Processed {len(articles)} samples from {file}")


Raw Data Path: /home/sagemaker-user/TinyLLaMa-Summarization/data/raw
Processed Data Path: /home/sagemaker-user/TinyLLaMa-Summarization/data/processed
Looking for file: /home/sagemaker-user/TinyLLaMa-Summarization/data/raw/train.csv
Saving processed data to: /home/sagemaker-user/TinyLLaMa-Summarization/data/processed/processed_train.csv
Processed 28711 samples from train.csv
Looking for file: /home/sagemaker-user/TinyLLaMa-Summarization/data/raw/test.csv
Saving processed data to: /home/sagemaker-user/TinyLLaMa-Summarization/data/processed/processed_test.csv
Processed 1149 samples from test.csv
Looking for file: /home/sagemaker-user/TinyLLaMa-Summarization/data/raw/validation.csv
Saving processed data to: /home/sagemaker-user/TinyLLaMa-Summarization/data/processed/processed_validation.csv
Processed 1337 samples from validation.csv


In [4]:
# Load the processed training split, initialize TinyLlama and train it.
#
# `project_root` is reused from the imports cell instead of being recomputed
# as "cwd/..", so the paths stay correct when run from the project root.
processed_data_dir = os.path.join(project_root, "data", "processed")

print("Processed Data Path:", processed_data_dir)

# Load processed training data as a list of {'article', 'highlights'} dicts.
# to_dict("records") builds the same records as a row-by-row iterrows() loop
# without constructing a Series per row.
train_df = pd.read_csv(os.path.join(processed_data_dir, "processed_train.csv"))
train_data = train_df[['article', 'highlights']].to_dict('records')

# Initialize model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model, tokenizer = initialize_model(model_name)

# Train model
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'preprocess_batch' is not defined". The name is imported in
# this notebook, so the failure is presumably raised inside src.model, which
# would need to import it from src.preprocessing - confirm in src/model.py.
training_stats = train_model(train_data, epochs=5)

# Plot training loss
plot_loss(training_stats['losses'])


Processed Data Path: /home/sagemaker-user/TinyLLaMa-Summarization/data/processed




NameError: name 'preprocess_batch' is not defined

In [None]:
# Evaluate the trained model on a few examples from the processed test split
# and collect average ROUGE/BLEU scores plus the example predictions.
#
# `project_root` is reused from the imports cell instead of being recomputed
# as "cwd/..", so the paths stay correct when run from the project root.
processed_data_dir = os.path.join(project_root, "data", "processed")

print("Processed Data Path:", processed_data_dir)

# Load test data
print("Loading test data...")
test_data_path = os.path.join(processed_data_dir, "processed_test.csv")

if not os.path.exists(test_data_path):
    raise FileNotFoundError(f"File not found: {test_data_path}")

# The processed splits are CSV files (the training cell reads its split with
# pd.read_csv), so json.load() cannot parse this file. Read it as CSV and
# convert to the list of {'article', 'highlights'} dicts used below.
test_df = pd.read_csv(test_data_path)
test_data = test_df[['article', 'highlights']].to_dict('records')

# Number of test examples to generate summaries for and score
EVAL_SAMPLE_COUNT = 5
eval_items = test_data[:EVAL_SAMPLE_COUNT]
if not eval_items:
    # Without this guard the averages below would fail with ZeroDivisionError
    raise ValueError(f"No test samples found in {test_data_path}")

# Evaluate on sample from test set
print("\nEvaluating model on sample test data:")
rouge_scores = []
bleu_scores = []
generated_summaries = []

for item in eval_items:
    # Generate once per example and keep the text, so the predictions stored
    # in `results` are exactly the ones that were scored (and generation is
    # not paid for twice).
    generated_summary = summarize(model, tokenizer, item['article'])
    generated_summaries.append(generated_summary)

    rouge_scores.append(calculate_rouge(item['highlights'], generated_summary))
    bleu_scores.append(calculate_bleu(item['highlights'], generated_summary))

# Compute average scores
avg_rouge1 = sum(score['rouge1'] for score in rouge_scores) / len(rouge_scores)
avg_rouge2 = sum(score['rouge2'] for score in rouge_scores) / len(rouge_scores)
avg_rougeL = sum(score['rougeL'] for score in rouge_scores) / len(rouge_scores)
avg_bleu = sum(bleu_scores) / len(bleu_scores)

# Results dictionary
results = {
    'average_scores': {
        'rouge1': avg_rouge1,
        'rouge2': avg_rouge2,
        'rougeL': avg_rougeL,
        'bleu': avg_bleu
    },
    'example_predictions': [
        {
            'article': item['article'],
            'reference_summary': item['highlights'],
            'generated_summary': generated_summary,
        }
        for item, generated_summary in zip(eval_items, generated_summaries)
    ]
}

print("Evaluation completed.")


In [None]:
# Write the evaluation results to reports/evaluation_results.json.
#
# `project_root` is reused from the imports cell instead of being recomputed
# as "cwd/..", so the paths stay correct when run from the project root.
reports_dir = os.path.join(project_root, "reports")
results_path = os.path.join(reports_dir, "evaluation_results.json")

# Create directory if it doesn't exist
os.makedirs(reports_dir, exist_ok=True)

# Save the `results` dictionary built by the evaluation cell. Rebuilding it
# here re-ran generation for every example and looked up a 'summary' key that
# the test records do not have (the reference text is under 'highlights'),
# which raised KeyError.
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"Evaluation results saved to {results_path}")


In [None]:
# Save the trained model and tokenizer under models/tiny-llama-model.
#
# `project_root` is reused from the imports cell instead of being recomputed
# as "cwd/..", so the path stays correct when run from the project root.
models_dir = os.path.join(project_root, "models", "tiny-llama-model")

# Create directory if it doesn't exist
os.makedirs(models_dir, exist_ok=True)

# Save model
print(f"Saving model to {models_dir}...")
save_model(model, tokenizer, models_dir)

print(f"Model saved successfully at {models_dir}")