In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoConfig,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from datasets import load_dataset, Dataset
import numpy as np
from tqdm import tqdm
import json
import os
from typing import Dict, List
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class FinancialPretrainingPipeline:
    def __init__(self, model_name="meta-llama/Llama-3.2-1B", max_length=512):
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = None
        self.model = None
        self.original_model = None

    def load_model_and_tokenizer(self):
        print("Loading Meta LLaMA 3.2-1B tokenizer and model...")

        # Load tokenizer - LLaMA 3.2 uses a different tokenizer
        from transformers import AutoTokenizer, AutoModelForCausalLM

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Set padding token if not present
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model with appropriate settings for LLaMA 3.2-1B
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True,  # May be needed for newer models
        )

        # Keep a copy of original model for comparison
        self.original_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True,
        )

        print(f"Model loaded: {self.model_name}")
        print(f"Model parameters: {self.model.num_parameters():,}")
        print(f"Model size: ~{self.model.num_parameters() / 1e9:.1f}B parameters")

    def load_financial_dataset(self):
        print("Loading financial documents dataset...")

        texts = []

        # Try multiple financial datasets with error handling
        datasets_to_try = [
            {
                'name': 'zeroshot/twitter-financial-news-sentiment',
                'text_field': 'text',
                'description': 'Financial Twitter sentiment data'
            },
            {
                'name': 'financial_phrasebank',
                'text_field': 'sentence',
                'description': 'Financial PhraseBank'
            },
            {
                'name': 'gbharti/finance-alpaca',
                'text_field': 'output',
                'description': 'Finance Alpaca dataset'
            }
        ]

        for dataset_info in datasets_to_try:
            try:
                print(f"Trying to load {dataset_info['description']}...")
                dataset = load_dataset(dataset_info['name'], split="train")

                # Extract text from dataset
                for item in dataset:
                    text_content = None

                    # Try different possible text fields
                    for field in [dataset_info['text_field'], 'text', 'content', 'sentence', 'output', 'input']:
                        if field in item and item[field]:
                            text_content = str(item[field])
                            break

                    if text_content and len(text_content.strip()) > 50:  # Only keep substantial text
                        texts.append(text_content.strip())

                print(f"✅ Loaded {len([t for t in texts])} texts from {dataset_info['description']}")

                if len(texts) > 10000:  # If we have enough data, break
                    break

            except Exception as e:
                print(f"❌ Failed to load {dataset_info['name']}: {str(e)}")
                continue

        # If no datasets worked, create synthetic financial text
        if len(texts) < 100:
            print("⚠️  No datasets loaded successfully. Creating synthetic financial text...")
            texts = self.create_synthetic_financial_text()

        # Filter and clean texts
        cleaned_texts = []
        for text in texts:
            if len(text.strip()) > 50 and len(text.strip()) < 2048:  # Reasonable length
                cleaned_texts.append(text.strip())

        print(f"Total documents after cleaning: {len(cleaned_texts)}")
        return cleaned_texts

    def create_synthetic_financial_text(self):
        print("Creating synthetic financial documents...")

        templates = [
            "The quarterly earnings report shows revenue of ${} million, representing a {}% increase from the previous quarter.",
            "The Federal Reserve announced a {}% interest rate adjustment, impacting bond yields and equity markets.",
            "Market analysis indicates that {} sector is experiencing significant volatility due to regulatory changes.",
            "The P/E ratio of {} has reached {}, suggesting the stock may be {} relative to industry averages.",
            "Economic indicators point to {} inflation trends, with consumer spending {} by {}% this quarter.",
            "The merger between {} and {} is expected to create synergies worth ${} million annually.",
            "Dividend yield for {} has increased to {}%, making it attractive for income investors.",
            "The cryptocurrency market shows {} correlation with traditional assets during recent market turbulence.",
            "Supply chain disruptions have affected {} industry margins, with costs rising {}% year-over-year.",
            "Central bank policy decisions regarding {} are expected to influence currency exchange rates significantly."
        ]

        import random

        texts = []
        for _ in range(5000):  # Generate 5000 synthetic financial texts
            template = random.choice(templates)

            # Fill in template with realistic financial terms
            companies = ["Apple", "Microsoft", "Amazon", "Tesla", "Google", "JPMorgan", "Goldman Sachs"]
            sectors = ["technology", "healthcare", "financial services", "energy", "consumer goods"]
            percentages = [f"{random.uniform(-10, 20):.1f}" for _ in range(3)]
            amounts = [f"{random.randint(100, 5000)}" for _ in range(2)]

            try:
                if "{}" in template:
                    # Fill template with appropriate values
                    filled_template = template.format(
                        random.choice(amounts),
                        random.choice(percentages),
                        random.choice(sectors),
                        random.choice(companies),
                        random.choice(percentages),
                        random.choice(["undervalued", "overvalued"]),
                        random.choice(["rising", "declining", "stable"]),
                        random.choice(["increasing", "decreasing"]),
                        random.choice(percentages),
                        random.choice(companies),
                        random.choice(companies),
                        random.choice(amounts),
                        random.choice(companies),
                        random.choice(percentages),
                        random.choice(["positive", "negative", "mixed"]),
                        random.choice(sectors),
                        random.choice(percentages),
                        random.choice(["monetary policy", "fiscal policy", "trade policy"])
                    )
                    texts.append(filled_template)
                else:
                    texts.append(template)
            except:
                texts.append("Financial markets showed mixed performance with varied sector rotation patterns.")

        return texts

    def tokenize_dataset(self, texts):
        print("Tokenizing dataset...")

        def tokenize_function(examples):
            # Tokenize and truncate to max_length
            tokenized = self.tokenizer(
                examples,
                truncation=True,
                padding=False,
                max_length=self.max_length,
                return_tensors=None
            )
            return tokenized

        # Process in batches to avoid memory issues
        tokenized_texts = []
        batch_size = 1000

        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i + batch_size]
            tokenized_batch = tokenize_function(batch)
            tokenized_texts.extend(tokenized_batch['input_ids'])

        # Create dataset
        tokenized_dataset = Dataset.from_dict({
            'input_ids': tokenized_texts
        })

        print(f"Tokenized {len(tokenized_texts)} documents")
        return tokenized_dataset

    def calculate_tokens_processed(self, dataset):
        total_tokens = sum(len(item['input_ids']) for item in dataset)
        print(f"Total tokens in dataset: {total_tokens:,}")
        return total_tokens

    def setup_training_args(self, output_dir="./financial_llama_3_2_1b", num_epochs=1):
        return TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=4,  # Can use larger batch size with 1B model
            gradient_accumulation_steps=4,   # Effective batch size = 4 * 4 = 16
            warmup_steps=500,
            learning_rate=5e-5,              # Lower learning rate for continued pre-training
            weight_decay=0.01,
            logging_steps=50,
            save_steps=1000,
            save_total_limit=3,
            prediction_loss_only=True,
            fp16=True,  # Use mixed precision
            dataloader_drop_last=True,
            remove_unused_columns=False,
            report_to=None,  # Disable wandb/tensorboard
            max_grad_norm=1.0,               # Gradient clipping
        )

    def continue_pretraining(self, dataset, target_tokens=1_000_000_000):
        print("Starting continued pre-training...")

        # Calculate how many epochs needed for target tokens
        total_tokens = self.calculate_tokens_processed(dataset)
        epochs_needed = max(1, target_tokens // total_tokens)

        print(f"Target tokens: {target_tokens:,}")
        print(f"Epochs needed: {epochs_needed}")

        # Setup training
        training_args = self.setup_training_args(num_epochs=epochs_needed)

        # Data collator for language modeling
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,  # Causal LM, not masked LM
        )

        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator,
        )

        # Start training
        print("Training started...")
        trainer.train()

        # Save the fine-tuned model
        trainer.save_model("./financial_llama_3_2_1b_final")
        self.tokenizer.save_pretrained("./financial_llama_3_2_1b_final")
        print("Training completed and model saved!")

    def run_benchmark_tests(self):
        print("Running benchmark tests...")

        # Common sense reasoning prompts
        benchmark_prompts = [
            "The capital of France is",
            "2 + 2 equals",
            "The largest planet in our solar system is",
            "Water boils at",
            "The first president of the United States was",
            # Financial domain prompts
            "A stock's P/E ratio represents",
            "When interest rates rise, bond prices typically",
            "EBITDA stands for",
            "A bull market is characterized by",
            "The Federal Reserve's primary tool for monetary policy is"
        ]

        results = {
            'original': [],
            'fine_tuned': []
        }

        print("Testing original model...")
        for prompt in tqdm(benchmark_prompts):
            response = self.generate_response(self.original_model, prompt)
            results['original'].append({
                'prompt': prompt,
                'response': response
            })

        print("Testing fine-tuned model...")
        for prompt in tqdm(benchmark_prompts):
            response = self.generate_response(self.model, prompt)
            results['fine_tuned'].append({
                'prompt': prompt,
                'response': response
            })

        return results

    def generate_response(self, model, prompt, max_new_tokens=50):
        inputs = self.tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id
            )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response[len(prompt):].strip()

    def evaluate_perplexity(self, test_texts):
        print("Calculating perplexity...")

        def calculate_perplexity(model, texts):
            model.eval()
            total_loss = 0
            total_tokens = 0

            with torch.no_grad():
                for text in tqdm(texts[:100], desc="Calculating perplexity"):  # Sample for speed
                    inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
                    inputs = {k: v.to(model.device) for k, v in inputs.items()}

                    outputs = model(**inputs, labels=inputs["input_ids"])
                    loss = outputs.loss

                    total_loss += loss.item() * inputs["input_ids"].size(1)
                    total_tokens += inputs["input_ids"].size(1)

            avg_loss = total_loss / total_tokens
            perplexity = torch.exp(torch.tensor(avg_loss)).item()
            return perplexity

        original_ppl = calculate_perplexity(self.original_model, test_texts)
        finetuned_ppl = calculate_perplexity(self.model, test_texts)

        print(f"Original model perplexity: {original_ppl:.2f}")
        print(f"Fine-tuned model perplexity: {finetuned_ppl:.2f}")

        return original_ppl, finetuned_ppl

    def analyze_results(self, benchmark_results, perplexity_results):
        print("\n" + "="*50)
        print("BENCHMARK RESULTS ANALYSIS")
        print("="*50)

        # Compare responses
        for i, prompt in enumerate([r['prompt'] for r in benchmark_results['original']]):
            print(f"\nPrompt: {prompt}")
            print(f"Original: {benchmark_results['original'][i]['response']}")
            print(f"Fine-tuned: {benchmark_results['fine_tuned'][i]['response']}")
            print("-" * 30)

        # Perplexity analysis
        original_ppl, finetuned_ppl = perplexity_results
        print(f"\nPERPLEXITY ANALYSIS:")
        print(f"Original model: {original_ppl:.2f}")
        print(f"Fine-tuned model: {finetuned_ppl:.2f}")

        if finetuned_ppl > original_ppl:
            print("⚠️  POTENTIAL DEGRADATION DETECTED")
            print(f"Perplexity increased by {((finetuned_ppl - original_ppl) / original_ppl * 100):.1f}%")
        else:
            print("✅ No significant degradation detected")

        # Save results
        results_summary = {
            'benchmark_responses': benchmark_results,
            'perplexity': {
                'original': original_ppl,
                'fine_tuned': finetuned_ppl,
                'degradation_pct': ((finetuned_ppl - original_ppl) / original_ppl * 100)
            }
        }

        with open('pretraining_results.json', 'w') as f:
            json.dump(results_summary, f, indent=2)

        print("\nResults saved to 'pretraining_results.json'")


In [None]:
# Additional utility functions for monitoring and analysis

def plot_training_metrics(log_file="./financial_llama_3_2_1b/trainer_state.json"):
    try:
        with open(log_file, 'r') as f:
            training_state = json.load(f)

        # Extract loss values
        log_history = training_state.get('log_history', [])
        train_losses = [entry['train_loss'] for entry in log_history if 'train_loss' in entry]
        steps = [entry['step'] for entry in log_history if 'train_loss' in entry]

        # Plot training loss
        plt.figure(figsize=(10, 6))
        plt.plot(steps, train_losses)
        plt.title('Training Loss Over Time')
        plt.xlabel('Steps')
        plt.ylabel('Loss')
        plt.grid(True)
        plt.savefig('training_loss.png')
        plt.show()

        print("Training metrics plotted and saved as 'training_loss.png'")
    except FileNotFoundError:
        print("Training log file not found. Run training first.")

def compare_model_outputs(original_model, finetuned_model, tokenizer, prompts):
    comparisons = []

    for prompt in prompts:
        # Generate from both models
        inputs = tokenizer(prompt, return_tensors="pt")

        with torch.no_grad():
            orig_outputs = original_model.generate(**inputs, max_new_tokens=100, do_sample=False)
            ft_outputs = finetuned_model.generate(**inputs, max_new_tokens=100, do_sample=False)

        orig_text = tokenizer.decode(orig_outputs[0], skip_special_tokens=True)
        ft_text = tokenizer.decode(ft_outputs[0], skip_special_tokens=True)

        comparisons.append({
            'prompt': prompt,
            'original': orig_text[len(prompt):],
            'fine_tuned': ft_text[len(prompt):],
        })

    return comparisons

In [None]:

def main():
    print("Meta LLaMA 3.2-1B Financial Pre-training Pipeline")
    print("="*50)

    # Check for HuggingFace token
    import os
    if not os.getenv('HF_TOKEN'):
        print("⚠️  Warning: HF_TOKEN environment variable not set.")
        print("You may need to set your Hugging Face token to access the model:")
        print("export HF_TOKEN=your_token_here")
        print("Or login using: huggingface-cli login")
        print()

    try:
        # Initialize pipeline
        pipeline = FinancialPretrainingPipeline()

        # Load model and tokenizer
        pipeline.load_model_and_tokenizer()

        # Load financial dataset
        financial_texts = pipeline.load_financial_dataset()

        # Tokenize dataset
        tokenized_dataset = pipeline.tokenize_dataset(financial_texts)

        # Continue pre-training
        pipeline.continue_pretraining(tokenized_dataset, target_tokens=1_000_000_000)

        # Run benchmarks
        benchmark_results = pipeline.run_benchmark_tests()

        # Calculate perplexity (using subset of financial texts as test)
        test_texts = financial_texts[:1000]  # Use first 1000 for testing
        perplexity_results = pipeline.evaluate_perplexity(test_texts)

        # Analyze results
        pipeline.analyze_results(benchmark_results, perplexity_results)

        print("\nPipeline completed successfully! 🎉")

    except Exception as e:
        print(f"❌ Error occurred: {str(e)}")
        if "401" in str(e) or "access" in str(e).lower():
            print("\n🔐 This might be an authentication issue.")
            print("Make sure you have:")
            print("1. Requested access to meta-llama/Llama-3.2-1B on Hugging Face")
            print("2. Set your HF_TOKEN environment variable")
            print("3. Or run: huggingface-cli login")
        raise

if __name__ == "__main__":
    main()



Using device: cuda
Meta LLaMA 3.2-1B Financial Pre-training Pipeline
You may need to set your Hugging Face token to access the model:
export HF_TOKEN=your_token_here
Or login using: huggingface-cli login

Loading Meta LLaMA 3.2-1B tokenizer and model...
Model loaded: meta-llama/Llama-3.2-1B
Model parameters: 1,235,814,400
Model size: ~1.2B parameters
Loading financial documents dataset...
Trying to load Financial Twitter sentiment data...
❌ Failed to load zeroshot/twitter-financial-news-sentiment: Invalid pattern: '**' can only be an entire path component
Trying to load Financial PhraseBank...


Downloading builder script:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

❌ Failed to load financial_phrasebank: Config name is missing.
Please pick one among the available configs: ['sentences_allagree', 'sentences_75agree', 'sentences_66agree', 'sentences_50agree']
Example of usage:
	`load_dataset('financial_phrasebank', 'sentences_allagree')`
Trying to load Finance Alpaca dataset...


Downloading readme:   0%|          | 0.00/831 [00:00<?, ?B/s]

❌ Failed to load gbharti/finance-alpaca: Invalid pattern: '**' can only be an entire path component
⚠️  No datasets loaded successfully. Creating synthetic financial text...
Creating synthetic financial documents...
Total documents after cleaning: 5000
Tokenizing dataset...


100%|██████████| 5/5 [00:00<00:00, 15.90it/s]


Tokenized 5000 documents
Starting continued pre-training...
Total tokens in dataset: 111,528
Target tokens: 1,000,000,000
Epochs needed: 8966




Training started...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


wandb: Paste an API key from your profile and hit enter:

 ··········




❌ Error occurred: API key must be 40 characters long, yours was 4


ValueError: API key must be 40 characters long, yours was 4