-------------------

In [1]:
!pip install transformers
!pip install datasets












In [2]:
from datasets import load_dataset, DatasetDict
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments





In [3]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
# Load your dataset from the CSV file
data_files = {
    "train": "mahabharat_1-2.csv",  # Ensure correct path to your dataset
}


In [5]:
# Load dataset, specifying the columns
dataset = load_dataset('csv', data_files=data_files, column_names=['Section', 'Parv', 'Key Event', 'Summary'])

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# Load pre-trained DistilBART model and tokenizer
model_name = 'sshleifer/distilbart-cnn-12-6'
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = BartTokenizer.from_pretrained(model_name)



In [7]:
# Preprocess function to tokenize input (parv) and target (summary)
def preprocess_function(examples):
    inputs = tokenizer(examples['Key Event'], max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    # Process the 'summary' column as the target
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['Summary'], max_length=150, truncation=True, padding="max_length", return_tensors="pt")

    inputs['labels'] = labels['input_ids']
    return inputs

In [8]:
# Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/31 [00:00<?, ? examples/s]



In [9]:
# Create a DatasetDict with train and validation splits
train_testvalid = tokenized_datasets['train'].train_test_split(test_size=0.1)
tokenized_datasets = DatasetDict({
    'train': train_testvalid['train'],
    'validation': train_testvalid['test']  # 'test' is the validation split in train_test_split
})

In [10]:
# Now you can access and map the validation split
tokenized_datasets['validation'] = tokenized_datasets['validation'].map(preprocess_function, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [11]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",                    # Output directory
    evaluation_strategy="epoch",               # Evaluate every few steps
    eval_steps=500,                            # Evaluate every 500 steps
    save_steps=500,                            # Save model every 500 steps
    logging_steps=100,                         # Log every 100 steps
    learning_rate=2e-5,                        # Learning rate
    per_device_train_batch_size=16,            # Batch size for training
    per_device_eval_batch_size=16,             # Batch size for evaluation
    weight_decay=0.01,                         # Weight decay to avoid overfitting
    save_total_limit=3,                        # Limit on number of checkpoints to keep
    num_train_epochs=18,                       # Number of epochs
    predict_with_generate=True,                # Generate summaries during evaluation
    logging_dir="./logs",                      # Directory for logs
    fp16=True,                                 # Enable mixed precision training for speed and memory efficiency
    dataloader_pin_memory=True,                # Pin memory for faster data transfer to GPU
)



In [12]:
# Trainer setup for summarization task
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],        # Training dataset
    eval_dataset=tokenized_datasets["validation"],    # Evaluation dataset
    tokenizer=tokenizer,
)

In [13]:
# Train the model 
trainer.train()

  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.544978618621826, 'eval_runtime': 18.9411, 'eval_samples_per_second': 0.211, 'eval_steps_per_second': 0.053, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.358855247497559, 'eval_runtime': 22.0291, 'eval_samples_per_second': 0.182, 'eval_steps_per_second': 0.045, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.264285087585449, 'eval_runtime': 25.5415, 'eval_samples_per_second': 0.157, 'eval_steps_per_second': 0.039, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.212522029876709, 'eval_runtime': 19.5546, 'eval_samples_per_second': 0.205, 'eval_steps_per_second': 0.051, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.173277854919434, 'eval_runtime': 20.2729, 'eval_samples_per_second': 0.197, 'eval_steps_per_second': 0.049, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.1394476890563965, 'eval_runtime': 20.9439, 'eval_samples_per_second': 0.191, 'eval_steps_per_second': 0.048, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.122896671295166, 'eval_runtime': 17.1082, 'eval_samples_per_second': 0.234, 'eval_steps_per_second': 0.058, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.108011245727539, 'eval_runtime': 17.5838, 'eval_samples_per_second': 0.227, 'eval_steps_per_second': 0.057, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.09643030166626, 'eval_runtime': 19.0889, 'eval_samples_per_second': 0.21, 'eval_steps_per_second': 0.052, 'epoch': 9.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.091120719909668, 'eval_runtime': 19.1321, 'eval_samples_per_second': 0.209, 'eval_steps_per_second': 0.052, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.089816093444824, 'eval_runtime': 20.9538, 'eval_samples_per_second': 0.191, 'eval_steps_per_second': 0.048, 'epoch': 11.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.090449810028076, 'eval_runtime': 29.6264, 'eval_samples_per_second': 0.135, 'eval_steps_per_second': 0.034, 'epoch': 12.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.090882778167725, 'eval_runtime': 17.6249, 'eval_samples_per_second': 0.227, 'eval_steps_per_second': 0.057, 'epoch': 13.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.0917134284973145, 'eval_runtime': 19.3688, 'eval_samples_per_second': 0.207, 'eval_steps_per_second': 0.052, 'epoch': 14.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.093687534332275, 'eval_runtime': 18.5634, 'eval_samples_per_second': 0.215, 'eval_steps_per_second': 0.054, 'epoch': 15.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.096490859985352, 'eval_runtime': 20.7143, 'eval_samples_per_second': 0.193, 'eval_steps_per_second': 0.048, 'epoch': 16.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.099451541900635, 'eval_runtime': 25.6614, 'eval_samples_per_second': 0.156, 'eval_steps_per_second': 0.039, 'epoch': 17.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.101038932800293, 'eval_runtime': 17.2153, 'eval_samples_per_second': 0.232, 'eval_steps_per_second': 0.058, 'epoch': 18.0}
{'train_runtime': 7967.117, 'train_samples_per_second': 0.061, 'train_steps_per_second': 0.005, 'train_loss': 3.6924925910101996, 'epoch': 18.0}


TrainOutput(global_step=36, training_loss=3.6924925910101996, metrics={'train_runtime': 7967.117, 'train_samples_per_second': 0.061, 'train_steps_per_second': 0.005, 'total_flos': 376142636777472.0, 'train_loss': 3.6924925910101996, 'epoch': 18.0})

In [14]:
# Save the fine-tuned model and tokenizer
trainer.save_model("./fine_tuned_distilbart")
tokenizer.save_pretrained("./fine_tuned_distilbart")

('./fine_tuned_distilbart\\tokenizer_config.json',
 './fine_tuned_distilbart\\special_tokens_map.json',
 './fine_tuned_distilbart\\vocab.json',
 './fine_tuned_distilbart\\merges.txt',
 './fine_tuned_distilbart\\added_tokens.json')

In [15]:
# Assuming the model and tokenizer are already loaded and available for inference

def generate_summary(parva_name):
    # Tokenize the input (parva name or related text)
    inputs = tokenizer(parva_name, max_length=512, truncation=True, padding="max_length", return_tensors="pt").to(device)

    # Generate the summary using the fine-tuned DistilBART model
    summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=4, early_stopping=True)

    # Decode the output summary
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary_text

# Example usage:
parva = "Following the Rajasuya sacrifice"
summary = generate_summary(parva)

# Print or return the generated summary
print(f"Summary for {parva}: \n{summary}")


Summary for Following the Rajasuya sacrifice: 
After the Rajasuya sacrifice, the Pandavas returned to the city of Yudhisthira, where they were living, they decided to perform the sacrifice of their beloved son, Rishya. Following the sacrifice, they approached the city's capital, the city was located in the midst of a forest of forested areas, and the city had become very hot and dangerous. The citizens of the city were informed by the presence of Lord Krishna and Lord Vishnu, who had performed a great sacrifice, that was performed by Lord Krishna. The people of the cities were informed of the sacrifice by Krishna, the king, and that he had performed the sacrifice. They were also informed by Krishna that they had received the
