In [1]:
# Step 0.0: Mount Google Drive so files stored there are reachable from Colab.
# (`google.colab` is only importable inside a Colab runtime.)

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Step 0.1: Install Dependencies

!pip install nltk transformers torch
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#Step 0.2: Import Libraries

import os
import json
import random
import time
import torch
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import GradScaler, autocast # for mixed precision
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from nltk.tokenize import sent_tokenize


In [4]:
# Step 0.3: Select the compute device - the GPU when CUDA is usable, else the CPU.

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Bare last expression so the notebook displays the selected device.
device

device(type='cuda')

In [5]:
# Count the regular files (not sub-directories) directly inside the data folder.
folder_path = '/content/drive/My Drive/20per'
file_count = sum(
    os.path.isfile(os.path.join(folder_path, entry))  # True counts as 1
    for entry in os.listdir(folder_path)
)
print(f"Number of files in the folder: {file_count}")

Number of files in the folder: 7365


# Data Preparation

In [15]:
# Function to extract data from a JSON file and derive labels
def extract_data_from_json(file_path):
    """Read one annotated JSON document and label each passage sentence.

    The file must contain ``data["Meta(Refine)"]["passage"]`` (the source
    text) and ``data["Annotation"]["summary3"]`` (the reference summary).
    Both are split into sentences with ``sent_tokenize``; a passage sentence
    is labelled 1 when the identical string also appears among the summary
    sentences, otherwise 0.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``(passage_sentences, labels)`` tuple of two equal-length lists.

    Raises:
        KeyError: If one of the expected keys is missing.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, and release the file handle before the tokenization work.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    passage = data["Meta(Refine)"]["passage"]
    summary = data["Annotation"]["summary3"]
    passage_sentences = sent_tokenize(passage)
    # A set gives O(1) membership checks for the exact-match labelling below.
    summary_sentences = set(sent_tokenize(summary))
    labels = [1 if sentence in summary_sentences else 0 for sentence in passage_sentences]
    return passage_sentences, labels

# Functions to prepare data
def _load_labeled_sentences(file_names, folder_path):
    """Concatenate the sentences and labels of every listed file, in order."""
    sentences, labels = [], []
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        file_sentences, file_labels = extract_data_from_json(file_path)
        sentences.extend(file_sentences)
        labels.extend(file_labels)
    return sentences, labels


def prepare_data(file_names, folder_path, num_files, split_ratio=0.8, seed=None):
    """Sample files, split them into train/test, and load their labelled sentences.

    The split is made at file level, so all sentences of one document end up
    on the same side of the split.

    Args:
        file_names: Sequence of file names located in ``folder_path``.
        folder_path: Directory that contains the files.
        num_files: Number of files to sample without replacement.
        split_ratio: Fraction of the sampled files used for training.
        seed: Optional seed for a reproducible sample. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call still controls the outcome.

    Returns:
        ``(train_passages, train_labels, test_passages, test_labels)`` where
        passages are flat lists of sentences and labels the matching ints.

    Raises:
        ValueError: If ``num_files`` is negative or larger than
            ``len(file_names)`` (raised by ``sample``).
    """
    sampler = random if seed is None else random.Random(seed)
    sampled_files = sampler.sample(file_names, num_files)

    split_index = int(len(sampled_files) * split_ratio)
    train_files = sampled_files[:split_index]
    test_files = sampled_files[split_index:]

    train_passages, train_labels = _load_labeled_sentences(train_files, folder_path)
    test_passages, test_labels = _load_labeled_sentences(test_files, folder_path)

    return train_passages, train_labels, test_passages, test_labels

# Dataset class for text summarization
class TextSummarizationDataset(Dataset):
    """Sentence-level classification dataset.

    Each item is one text with its integer label. The text is tokenized on
    access and padded or truncated to exactly ``max_len`` tokens.

    Args:
        texts: Sequence of input texts (each is converted with ``str``).
        labels: Sequence of integer class labels, one per text.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns ``input_ids`` / ``attention_mask``
            tensors (e.g. a Hugging Face tokenizer).
        max_len: Fixed token length every item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Fail early: a mismatch would otherwise only surface as an IndexError
        # for some item in the middle of iterating a DataLoader.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized text and label at index ``item`` as a dict."""
        text = str(self.texts[item])
        label = self.labels[item]
        # Call the tokenizer directly; `encode_plus` is the deprecated spelling
        # of the same operation and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten it away so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Fix the RNG seeds so the sampled files, the train/test split and the
# training shuffle order are the same on every run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Data preparation
folder_path = '/content/drive/My Drive/20per'  # Data folder
# os.listdir returns entries in arbitrary order; sort so the seeded sample
# drawn inside prepare_data is reproducible.
file_names = sorted(os.listdir(folder_path))
num_files = int(len(file_names) * 0.02)  # Use 2% of the files (about 147 of the 7365 files)

train_passages, train_labels, test_passages, test_labels = prepare_data(file_names, folder_path, num_files)
train_dataset = TextSummarizationDataset(train_passages, train_labels, tokenizer)
test_dataset = TextSummarizationDataset(test_passages, test_labels, tokenizer)

# Shuffle the training sentences: they are stored document by document, so
# unshuffled batches would contain neighbouring sentences of one document.
train_data_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
# Evaluation order does not affect the metric, so the test loader stays ordered.
test_data_loader = DataLoader(test_dataset, batch_size=2)

# Train

In [16]:
# Load the pretrained multilingual BERT with a 2-class classification head and
# place it on the selected device
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2).to(device)

# Training setup
num_epochs = 1
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = num_epochs * len(train_data_loader)

start_time = time.time()

print("Starting training...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for i, batch in enumerate(train_data_loader):
        # Only the tensor fields are moved to the device; batch['text'] is not used here.
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
        )

        # Forward pass; the loss is read from the output because labels are passed in.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass and parameter update, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Running statistics for the end-of-epoch summary.
        preds = logits.argmax(dim=1)
        total_loss += loss.item()
        correct_predictions += preds.eq(labels).sum().item()
        total_predictions += labels.shape[0]

    # Mean loss per batch and fraction of correctly classified training sentences.
    epoch_loss = total_loss / len(train_data_loader)
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch [{epoch+1}/{num_epochs}]: Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch [1/1]: Loss: 0.4716, Accuracy: 0.8330
Training completed in 283.02 seconds


# Test

In [17]:
# Function to evaluate the model on test data
def evaluate_model(model, data_loader, device=None):
    """Return the classification accuracy of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if it has none), so the function
            does not depend on a notebook-global ``device`` variable.

    Returns:
        Fraction of examples whose argmax prediction equals the label, or
        ``0.0`` when ``data_loader`` yields no examples.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct_predictions = 0
    total_predictions = 0

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

    # Guard the empty case instead of raising ZeroDivisionError.
    if total_predictions == 0:
        return 0.0
    return correct_predictions / total_predictions

# Score the model currently bound to `model` on the held-out test loader and report it
test_accuracy = evaluate_model(model, data_loader=test_data_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.8610


# Train (Mixed Precision)

In [18]:
# Load a fresh copy of the same pretrained checkpoint for the mixed-precision run
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model = model.to(device)

# Training loop
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
num_epochs = 1
total_steps = len(train_data_loader) * num_epochs

# `device` falls back to CPU when no GPU is present, while the CUDA autocast /
# GradScaler pair only applies on a GPU. Gate both explicitly so the cell runs
# as plain FP32 on CPU instead of relying on them disabling themselves.
use_amp = device.type == "cuda"

# Initialize the gradient scaler for mixed precision
scaler = GradScaler(enabled=use_amp)

start_time = time.time()

print("Starting training...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Run the forward pass under autocast so eligible ops use reduced precision
        with autocast(enabled=use_amp):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scales loss and calls backward() to create scaled gradients
        # (with the scaler disabled this is a plain loss.backward())
        scaler.scale(loss).backward()

        # Unscales the gradients and calls or skips optimizer.step()
        scaler.step(optimizer)

        # Updates the scale for next iteration
        scaler.update()

        # Running statistics for the end-of-epoch summary
        total_loss += loss.item()
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

    # Mean loss per batch and fraction of correctly classified training sentences
    epoch_loss = total_loss / len(train_data_loader)
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch [{epoch+1}/{num_epochs}]: Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch [1/1]: Loss: 0.4755, Accuracy: 0.8330
Training completed in 167.17 seconds


# Test (Mixed Precision)

In [19]:
# Function to evaluate the model on test data
# NOTE: `evaluate_model` is also defined in the earlier "Test" cell; this
# later definition shadows it once this cell has run.
def evaluate_model(model, data_loader, device=None):
    """Return the classification accuracy of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if it has none), so the function
            does not depend on a notebook-global ``device`` variable.

    Returns:
        Fraction of examples whose argmax prediction equals the label, or
        ``0.0`` when ``data_loader`` yields no examples.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct_predictions = 0
    total_predictions = 0

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

    # Guard the empty case instead of raising ZeroDivisionError.
    if total_predictions == 0:
        return 0.0
    return correct_predictions / total_predictions

# Score the mixed-precision-trained model (the current `model`) on the held-out test loader
test_accuracy = evaluate_model(model, data_loader=test_data_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.8610
