In [None]:
!pip  install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from torch.utils.data import DataLoader, Dataset

# dataset_path = '/content/Datasetnew from hugging face.xlsx'
df = pd.read_excel("/content/drive/MyDrive/Datasetnew from hugging face.xlsx")

train_text, test_text, train_summary, test_summary = train_test_split(
    df['judgement'], df['summary'], test_size=0.2, random_state=42
)
test_text, val_text, test_summary, val_summary = train_test_split(
    test_text, test_summary, test_size=0.5, random_state=42
)

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

train_encodings = tokenizer(list(train_text), truncation=True, padding=True)
train_labels = tokenizer(list(train_summary.astype(str)), truncation=True, padding=True)

val_encodings = tokenizer(list(val_text), truncation=True, padding=True)
val_labels = tokenizer(list(val_summary.astype(str)), truncation=True, padding=True)

test_encodings = tokenizer(list(test_text), truncation=True, padding=True)
test_labels = tokenizer(list(test_summary.astype(str)), truncation=True, padding=True)


class SummaryDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

train_dataset = SummaryDataset(train_encodings, train_labels)
val_dataset = SummaryDataset(val_encodings, val_labels)
test_dataset = SummaryDataset(test_encodings, test_labels)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 1

for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training Loss: {avg_loss:.4f}")

    model.eval()

    val_loader = DataLoader(val_dataset, batch_size=3, shuffle=False)

    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}")




Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]



Epoch 1 - Training Loss: 2.5761
Epoch 1 - Validation Loss: 1.9513


In [None]:
model_save_path = "/content/drive/MyDrive/bartmodel_hfd.h5"
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to: {model_save_path}")

Model saved to: /content/drive/MyDrive/bartmodel_hfd.h5


In [None]:
test_loader = DataLoader(test_dataset, batch_size=3, shuffle=False)

predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=200)
        batch_preds = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
        predictions.extend(batch_preds)


test_summary_list = test_summary.tolist()

for i in range(len(predictions)):
    print(f"Target: {test_summary_list[i]}")
    print(f"Prediction: {predictions[i]}")
    print("=" * 5)

Target: In the year 1978, a proviso was added to Rule 3(j) of the Bar Council of Delhi Election Rules, 1968 with the approval of the Bar Council of India in accordance with the requirement of Sub section (3) of section 15 of the . In accordance with that proviso a copy of the declaration form was sent on 14th June, 1978 to the Advocates whose names found place in the State roll of Advocates asking them to return the declaration form duly filled up and signed within the specified period. A publication to this effect was also made in some newspapers viz. Hindustan Times, Indian Express, Statesman etc. The last extended date for the submission of the declaration forms was 14th September, 1978 and the electoral roll was finally published on the 16th September, 1978 excluding the names of about 2,000 Advocates who had failed to submit such declaration forms. On the basis of the electoral roll so prepared, elections to the Bar Council of Delhi was held on the 17th November, 1978. The total n

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
from google.colab import drive

drive.mount('/content/drive')

test_summary_list = test_summary.tolist()

predictions_df = pd.DataFrame({'Target': test_summary_list, 'Prediction': predictions})

folder_path = '/content/drive/MyDrive'

file_path = folder_path + '/predicted_summaries.xlsx'
predictions_df.to_excel(file_path, index=False)

print(f"Predicted summaries saved to: {file_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Predicted summaries saved to: /content/drive/MyDrive/predicted_summaries.xlsx


In [None]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import sys
sys.setrecursionlimit(10**6)

from rouge import Rouge


references = []
hypotheses = []

for i in range(len(predictions)):
    reference = test_summary_list[i]
    hypothesis = predictions[i]
    references.append(reference)
    hypotheses.append(hypothesis)

rouge = Rouge()

scores = rouge.get_scores(hypotheses, references, avg=True)

rouge_1_score = scores['rouge-1']
rouge_2_score = scores['rouge-2']
rouge_l_score = scores['rouge-l']

print(f"ROUGE-1 Score: {rouge_1_score}")
print(f"ROUGE-2 Score: {rouge_2_score}")
print(f"ROUGE-L Score: {rouge_l_score}")


ROUGE-1 Score: {'r': 0.21385350386697494, 'p': 0.5669849748263986, 'f': 0.2961942581094953}
ROUGE-2 Score: {'r': 0.09516850947743592, 'p': 0.3162377162114016, 'f': 0.1376674842451366}
ROUGE-L Score: {'r': 0.19432272541151702, 'p': 0.5188686054050003, 'f': 0.26967799074273985}


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

references = test_summary_list
hypotheses = predictions

tfidf_vectorizer = TfidfVectorizer()
vectorized_references = tfidf_vectorizer.fit_transform(references)
vectorized_hypotheses = tfidf_vectorizer.transform(hypotheses)


cosine_similarities = cosine_similarity(vectorized_references, vectorized_hypotheses)

average_cosine_similarity = np.mean(cosine_similarities)

print(f"Average Cosine Similarity: {average_cosine_similarity:.4f}")


Average Cosine Similarity: 0.2757
