# Mounting Google Drive

 Mounting Google Drive for accessing files

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Setting Environment Variables
Setting environment variables to control PyTorch's CUDA memory allocation.

In [None]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'

# Preprocessing the Data

Preprocessing the text data by splitting it into sections based on 'CHAPTER' and removing any whitespace.


In [None]:
def preprocess_data(text):
    sections = text.split('CHAPTER')
    processed_sections = [section.strip() for section in sections if section.strip() != '']
    return processed_sections

# Read the text file and preprocess
with open('/content/drive/MyDrive/LegalDocs/usc45.xml', 'r', encoding='utf-8') as file:
    text_data = file.read()

processed_data = preprocess_data(text_data)

# Loading the BART Model for Summary Generation

Loading the pre-trained BART model and tokenizer from Hugging Face for generating summaries.


In [3]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load BART model and tokenizer for summary generation
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


# Generating and Saving Summaries

Generating summaries for the processed data using the BART model and save them to your Google Drive.


In [4]:
def generate_summary(text):
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate summaries
output_texts = [generate_summary(text) for text in processed_data]

# Save summaries
output_file_path = '/content/drive/MyDrive/LegalDocs/generatedSummaries.txt'
with open(output_file_path, 'w', encoding='utf-8') as file:
    for summary in output_texts:
        file.write(summary + '\n')


# Reading Saved Summaries
Reading the previously saved summaries from a file in Google Drive.

In [15]:
# Define the file path for the existing summaries
output_file_path = '/content/drive/MyDrive/LegalDocs/generated_summaries.txt'

# Read the summaries from the file
output_texts = []
with open(output_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        output_texts.append(line.strip())  # Append each summary to the list

# Splitting Data for Training and Validation

Splitting the dataset into training and validation sets and tokenize the data.

In [5]:
from sklearn.model_selection import train_test_split
from transformers import BartTokenizer
import torch

# Split the data into training and validation sets
train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
    processed_data, output_texts, test_size=0.1
)

# Tokenize the data
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
train_input_encodings = tokenizer(train_inputs, truncation=True, padding='max_length', max_length=256)
train_output_encodings = tokenizer(train_outputs, truncation=True, padding='max_length', max_length=512)
val_input_encodings = tokenizer(val_inputs, truncation=True, padding='max_length', max_length=512)
val_output_encodings = tokenizer(val_outputs, truncation=True, padding='max_length', max_length=512)

# Define custom dataset class
class RailroadsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

# Create datasets
train_dataset = RailroadsDataset(train_input_encodings, train_output_encodings)
val_dataset = RailroadsDataset(val_input_encodings, val_output_encodings)


# Defining Custom Dataset Class

Defining a custom PyTorch dataset class for handling the tokenized data.

In [6]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer

# Load the BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Define training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LegalDocs/results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/LegalDocs/logs'
)


# Creating Training and Validation Datasets

Creating instances of the custom dataset for both training and validation.

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()
model.save_pretrained('/content/drive/MyDrive/LegalDocs/fine_tuned_BART_railroads')


Step,Training Loss


# Load Fine-Tuned BART Model
This cell loads the fine-tuned BART model and tokenizer. The model is used for generating summaries from text.


In [12]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the fine-tuned model
model = BartForConditionalGeneration.from_pretrained('/content/drive/MyDrive/LegalDocs/fine_tuned_BART_railroads')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')


# Generate Summary Function
This function uses the loaded BART model to generate a summary for a given piece of text. The function takes the text, model, tokenizer, and maximum lengths for input and output as parameters.


In [13]:
def generate_summary(text, model, tokenizer, max_input_length=1024, max_output_length=150):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input_length, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_output_length, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Testing the Summary Generation
Here we test the summary generation function by providing a new text snippet. The generated summary is then printed.


In [14]:
new_text = "Vacancies in membership or office shall be filled, members shall be appointed in case of failure of the carriers or of labor organizations of the employees to select and designate representatives, members of the National Air Transport Adjustment Board shall be compensated, hearings shall be held, findings and awards made, stated, served, and enforced, and the number and compensation of any necessary assistants shall be determined and the compensation of such employees shall be paid, all in the same manner and to the same extent as provided with reference to the National Railroad Adjustment Board by section 153 of this title. The powers and duties prescribed and established by the provisions of section 153 of this title with reference to the National Railroad Adjustment Board and the several divisions thereof are conferred upon and shall be exercised and performed in like manner and to the same extent by the said National Air Transport Adjustment Board, not exceeding, however, the jurisdiction conferred upon said National Air Transport Adjustment Board by the provisions of this subchapter. "
summary = generate_summary(new_text, model, tokenizer)
print(summary)


summarize:  Vacancies in membership or office shall be filled, members shall be appointed in case of failure of the carriers or of labor organizations of the employees to select and designate representatives, members of the National Air Transport Adjustment Board shall be compensated, hearings shall be held, findings and awards made, stated, served, and enforced, and the number and compensation of any necessary assistants shall be determined and the compensation of such employees shall be paid, all in the same manner and to the same extent as provided with reference to the National Railroad Adjustment board by section 153 of this title. The powers and duties prescribed and established by the provisions of section 153 thereof are conferred upon and shall be exercised and performed by the said National Air
