In [None]:
import torch
import logging
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from torch.utils.data import DataLoader
from datasets import load_from_disk  # use this if the dataset was saved to disk elsewhere

In [None]:
import torch
import logging
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from torch.utils.data import DataLoader

# Set up logging: configure the root logger to emit INFO-level (and above)
# messages, then create the module-level logger that FinGPTPipeline uses for
# its progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FinGPTPipeline:
    """Fine-tune a causal language model on a text dataset.

    Intended call order is ``load_model()`` -> ``load_dataset()`` ->
    ``train()`` and, optionally, ``evaluate()``. Methods raise
    ``RuntimeError`` when called before the step they depend on.

    Attributes:
        model_name: Hub id or local path handed to ``from_pretrained``.
        dataset_path: CSV file to train on, or ``None`` to use the
            placeholder hub dataset.
        output_dir: Directory for checkpoints, logs and the final model.
        eval_fraction: Share of the tokenized rows held out for evaluation.
        seed: Seed for the train/eval split, so the split is reproducible.
        tokenizer: Tokenizer, set by ``load_model()``.
        model: Model, set by ``load_model()``.
        dataset: Tokenized training split, set by ``load_dataset()``.
        eval_dataset: Tokenized held-out split, or ``None`` if none was made.
        trainer: The ``Trainer`` built by ``train()``; reused by ``evaluate()``.
    """

    def __init__(self, model_name="Your finetuned model", dataset_path=None,
                 output_dir="your output directory", eval_fraction=0.1, seed=42):
        """Store the configuration; nothing is loaded until requested.

        Args:
            model_name: Hub id or local path of the model to fine-tune.
            dataset_path: Path to a CSV file with a ``text`` column. When
                falsy, the placeholder hub dataset name is used instead.
            output_dir: Where checkpoints and the final model are written.
            eval_fraction: Fraction in ``[0, 1)`` of rows to hold out for
                evaluation. ``0`` disables the held-out split.
            seed: Random seed used when splitting off the evaluation rows.

        Raises:
            ValueError: If ``eval_fraction`` is outside ``[0, 1)``.
        """
        if not 0 <= eval_fraction < 1:
            raise ValueError(
                f"eval_fraction must be in [0, 1), got {eval_fraction!r}"
            )
        self.model_name = model_name
        self.dataset_path = dataset_path
        self.output_dir = output_dir
        self.eval_fraction = eval_fraction
        self.seed = seed
        self.tokenizer = None
        self.model = None
        self.dataset = None
        self.eval_dataset = None
        self.trainer = None

    def load_model(self):
        """Load FinGPT model and tokenizer."""
        logger.info("Loading FinGPT model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)

        # Many causal-LM tokenizers ship without a pad token, and padding
        # during tokenization fails without one. Reuse EOS as the pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if getattr(self.model.config, "pad_token_id", None) is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def load_dataset(self, max_length=512):
        """Load the dataset, tokenize it and hold out an evaluation split.

        Args:
            max_length: Every example is truncated/padded to this many tokens.

        Raises:
            RuntimeError: If ``load_model()`` has not been called yet.
            ValueError: If the dataset has no ``text`` column.
        """
        if self.tokenizer is None:
            raise RuntimeError("Call load_model() before load_dataset().")

        logger.info("Loading dataset...")

        if self.dataset_path:
            raw = load_dataset("csv", data_files=self.dataset_path, split="train")
        else:
            raw = load_dataset("your dataset", split="train")  # Public financial dataset

        if "text" not in raw.column_names:
            raise ValueError(
                f"Dataset must contain a 'text' column; found {raw.column_names}"
            )

        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )

        logger.info("Tokenizing dataset...")
        # Drop the raw columns so only tokenizer outputs remain; the language
        # modeling collator cannot batch raw strings in get_data_loader().
        tokenized = raw.map(
            tokenize_function,
            batched=True,
            remove_columns=raw.column_names,
        )

        # Per-epoch evaluation needs held-out rows. A split needs at least
        # two rows, so tiny datasets are used for training only.
        if self.eval_fraction > 0 and len(tokenized) > 1:
            split = tokenized.train_test_split(
                test_size=self.eval_fraction, seed=self.seed
            )
            self.dataset = split["train"]
            self.eval_dataset = split["test"]
        else:
            self.dataset = tokenized
            self.eval_dataset = None

    def get_data_loader(self, batch_size=8):  # change batch size
        """Return a shuffled DataLoader over the tokenized training split.

        Args:
            batch_size: Number of examples per batch.

        Raises:
            RuntimeError: If ``load_dataset()`` has not been called yet.
        """
        if self.dataset is None:
            raise RuntimeError("Call load_dataset() before get_data_loader().")
        data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=False)
        return DataLoader(self.dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

    def train(self, epochs=3, batch_size=8, learning_rate=5e-5):
        """Train the FinGPT model and save it to ``output_dir``.

        Change ``epochs`` (and the other arguments) to your requirements.

        Args:
            epochs: Number of passes over the training split.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Initial learning rate.

        Raises:
            RuntimeError: If the model or dataset has not been loaded yet.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Call load_model() before train().")
        if self.dataset is None:
            raise RuntimeError("Call load_dataset() before train().")

        logger.info("Setting up training arguments...")
        # Only evaluate each epoch when there is something to evaluate on.
        # NOTE(review): recent transformers releases name this argument
        # `eval_strategy` - confirm against the installed version.
        eval_mode = "epoch" if self.eval_dataset is not None else "no"
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            evaluation_strategy=eval_mode,
            save_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_dir=f"{self.output_dir}/logs",
            logging_steps=10,
            push_to_hub=False,
        )

        # Keep the trainer on the instance so evaluate() can reuse it.
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=False),
        )

        logger.info("Starting training...")
        self.trainer.train()

        logger.info("Saving fine-tuned model...")
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

    def evaluate(self):
        """Evaluate the trained model on the held-out split.

        Returns:
            The metrics returned by ``Trainer.evaluate()``.

        Raises:
            RuntimeError: If ``train()`` has not run, or no evaluation split
                exists (``eval_fraction`` was 0 or the dataset was too small).
        """
        if self.trainer is None:
            raise RuntimeError("Call train() before evaluate().")
        if self.eval_dataset is None:
            raise RuntimeError(
                "No evaluation split available; use eval_fraction > 0."
            )

        logger.info("Evaluating the trained model...")
        # Models have no evaluate() method; the Trainer runs the eval loop.
        results = self.trainer.evaluate()
        logger.info("Evaluation Results: %s", results)
        return results




In [None]:
if __name__ == "__main__":
    # Example run: build the pipeline for a local CSV, then load the model,
    # tokenize the data and fine-tune.
    # Order matters: load_dataset() tokenizes with the tokenizer that
    # load_model() creates, and train() needs both the model and the dataset.
    pipeline = FinGPTPipeline(dataset_path="financial_data.csv")  # Replace with your dataset path
    pipeline.load_model()
    pipeline.load_dataset()
    pipeline.train(epochs=3, batch_size=8)