In [None]:
# NOTE(review): no URL follows the command, so nothing can be downloaded as written.
# TODO: supply the dataset URL (or remove this cell if the data comes from Drive).
!wget

In [None]:
import os
import random
import pandas as pd
import numpy as np
import csv
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive so files stored there are reachable from this Colab runtime.
drive.mount("/content/Drive/")

# Read the multi dataset CSV from the mounted drive into a DataFrame.
ildc_multi_csv = '/content/Drive/My Drive/ILDC_multi.csv'  # path to multi_dataset
df = pd.read_csv(ildc_multi_csv)



In [None]:

# Work on a reproducible 5% random sample of the full dataset.
subset_df = df.sample(frac=0.05, random_state=42)

# Randomly split the sample 60/20/20 into train / test / validation.
# Note: this is a random split; the dataset's own 'split' column is not consulted.
train_set, test_and_val_set = train_test_split(
    subset_df,
    test_size=0.4,
    random_state=42,
)
test_set, validation_set = train_test_split(
    test_and_val_set,
    test_size=0.5,
    random_state=42,
)


In [None]:
# Display the training split for a quick visual check of its columns and rows.
train_set

Unnamed: 0,text,label,split,name
34165,\nshah j. \n\na deed of partnership for carryi...,1,test,1967_187.txt
13950,"B. Pattanaik and B. N. Agrawal, JJ. This appe...",0,train,2001_171.txt
14074,2001 3 SCR 424 The following Orders of the ...,0,train,2001_370.txt
23228,"Varadarajan, J. These Criminal Appeals by spe...",1,train,1983_96.txt
6876,"Dr. ARIJIT PASAYAT, J. Noticing that there we...",1,train,2009_391.txt
...,...,...,...,...
5045,Leave granted. This appeal is directed against...,1,train,2011_770.txt
21402,CIVIL APPELLATE JURISDICTION Civil Appeal No. ...,0,train,1981_240.txt
9419,"CRIMINAL APPEAL NO. 548 OF 2007 P. MATHUR, J....",1,train,2007_968.txt
14275,Heard the learned Counsel for the parties. Le...,1,train,2001_696.txt


In [None]:
# Name of the pretrained checkpoint handed to the tokenizer loader below.
model_name = 'roberta-base'
# Tokenizer loaded from that checkpoint; used by `input_id_maker` to encode the texts.
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def pad_sequences(sequences, maxlen=None, dtype=torch.long, padding='pre', truncating='pre', value=0.):
    """Pad or truncate every sequence to a common length and stack them into a tensor.

    Args:
        sequences: Iterable of sequences (lists, tuples, ...) of numbers.
        maxlen: Target length of every row. When None, the length of the longest
            sequence is used (0 when ``sequences`` is empty).
        dtype: dtype of the returned tensor.
        padding: 'pre' pads at the front; any other value pads at the end.
        truncating: 'pre' drops items from the front (keeps the tail); any other
            value drops items from the end (keeps the head).
        value: Fill value used for the padding positions.

    Returns:
        A tensor of shape ``(len(sequences), maxlen)`` with the requested dtype.
    """
    # Materialise once so generators and tuples are handled like lists.
    sequences = [list(seq) for seq in sequences]

    # If maxlen is not provided, infer it from the longest sequence.
    if maxlen is None:
        maxlen = max((len(seq) for seq in sequences), default=0)

    padded_sequences = []
    for seq in sequences:
        if truncating == 'pre':
            # Use an explicit start index: `seq[-maxlen:]` would return the whole
            # sequence when maxlen == 0.
            truncated_seq = seq[max(len(seq) - maxlen, 0):]
        else:
            truncated_seq = seq[:maxlen]

        fill = [value] * (maxlen - len(truncated_seq))
        if padding == 'pre':
            padded_seq = fill + truncated_seq
        else:
            padded_seq = truncated_seq + fill

        padded_sequences.append(padded_seq)

    # The reshape is a no-op for regular input and keeps the documented 2-D shape
    # when there are no sequences or maxlen is 0.
    return torch.tensor(padded_sequences, dtype=dtype).reshape(len(sequences), max(maxlen, 0))

def input_id_maker(dataf, tokenizer):
    """Encode the ``text`` column of ``dataf`` as fixed-length rows of token ids.

    Every text is tokenized, reduced to its last 510 tokens when longer, wrapped
    in the tokenizer's CLS and SEP tokens and converted to ids. The rows are then
    padded at the end with 0 up to length 512 by ``pad_sequences``.

    Args:
        dataf: Frame-like object exposing a ``'text'`` column with ``.iloc`` access.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``.

    Returns:
        ``(input_ids, lengths)``: the padded id tensor and a list holding each
        row's length before padding.
    """
    max_body_tokens = 510
    encoded_rows = []
    row_lengths = []

    for position in tqdm(range(len(dataf['text']))):
        tokens = tokenizer.tokenize(dataf['text'].iloc[position], add_prefix_space=True)

        # Over-long documents keep only their tail.
        if len(tokens) > max_body_tokens:
            tokens = tokens[-max_body_tokens:]

        wrapped = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        ids = tokenizer.convert_tokens_to_ids(wrapped)

        encoded_rows.append(ids)
        row_lengths.append(len(ids))

    padded_ids = pad_sequences(
        encoded_rows,
        maxlen=512,
        value=0,
        dtype=torch.long,
        truncating="pre",
        padding="post",
    )
    return padded_ids, row_lengths

# Encode the train and validation splits. Each call returns the padded id tensor
# together with the list of per-row lengths measured before padding.
train_input_ids, train_lengths = input_id_maker(train_set, tokenizer)
validation_input_ids, validation_lengths = input_id_maker(validation_set, tokenizer)

def att_masking(input_ids, pad_token_id=0):
    """Build attention masks (1 = real token, 0 = padding) for end-padded id rows.

    Only the trailing run of ``pad_token_id`` in each row is treated as padding.
    A plain ``token_id > 0`` test is wrong for RoBERTa: its ``<s>`` (CLS) token
    has id 0, the same value used for padding here, so the leading ``<s>`` of
    every row would be masked out.

    Args:
        input_ids: Iterable of rows of token ids (a 2-D tensor or nested lists),
            padded at the end with ``pad_token_id``.
        pad_token_id: Id that was used to pad the rows. Defaults to 0.

    Returns:
        A list of lists of ints with the same layout as ``input_ids``.
    """
    attention_masks = []
    for sent in input_ids:
        # Plain Python ints are much cheaper to compare than 0-d tensors.
        ids = sent.tolist() if hasattr(sent, 'tolist') else list(sent)

        # Walk back over the trailing padding to find where the real tokens end.
        real_len = len(ids)
        while real_len > 0 and ids[real_len - 1] == pad_token_id:
            real_len -= 1

        attention_masks.append([1] * real_len + [0] * (len(ids) - real_len))
    return attention_masks

# Build attention masks for the padded id tensors.
train_attention_masks = att_masking(train_input_ids)
validation_attention_masks = att_masking(validation_input_ids)

# Integer class labels as NumPy arrays.
train_labels = train_set['label'].to_numpy().astype('int')
validation_labels = validation_set['label'].to_numpy().astype('int')

# The id arrays are already tensors, so reuse them with `torch.as_tensor`;
# copy-constructing via `torch.tensor(tensor)` triggers a UserWarning.
train_inputs = torch.as_tensor(train_input_ids)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_attention_masks)

validation_inputs = torch.as_tensor(validation_input_ids)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_attention_masks)

# Set batch size
batch_size = 6

# Create DataLoader for training (shuffled every epoch).
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create DataLoader for validation
if len(validation_inputs) > 0:
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    # Evaluation needs no shuffling: a sequential pass keeps the metric
    # deterministic and does not consume random numbers between training epochs.
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
else:
    print("Validation set is empty.")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load Roberta model for sequence classification (same checkpoint as the tokenizer).
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

lr = 2e-6
epochs = 3
num_total_steps = len(train_dataloader) * epochs
# Cap warmup at 10% of the run. A fixed 1000 warmup steps can exceed the whole
# schedule on a small subset, in which case the learning rate never reaches `lr`.
num_warmup_steps = min(1000, int(0.1 * num_total_steps))

# Create optimizer and scheduler
# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
# `torch.optim.AdamW` — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_total_steps)

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D NumPy array of per-class scores, one row per example.
        labels: NumPy array of class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)

# Seed every random number generator in use so shuffling and dropout are repeatable.
seed_val = 2212
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch.
loss_values = []

# Training loop
for epoch_i in range(epochs):
    print(f'======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    total_loss = 0
    model.train()

    for step, batch in enumerate(tqdm(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        # Passing labels makes the model return the loss as its first output.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print(f"\nAverage training loss: {avg_train_loss:.2f}")

    print("\nRunning Validation...")
    model.eval()

    # Count correct predictions per example. Averaging per-batch accuracies would
    # give a short final batch the same weight as a full one.
    eval_correct, nb_eval_examples = 0, 0

    for batch in tqdm(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Without labels the first output is the logits.
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        n_in_batch = len(label_ids)
        eval_correct += flat_accuracy(logits, label_ids) * n_in_batch
        nb_eval_examples += n_in_batch

    if nb_eval_examples:
        eval_accuracy = eval_correct / nb_eval_examples
        print(f"\nAccuracy: {eval_accuracy:.2f}")
    else:
        # Avoid a division by zero when the validation loader yields nothing.
        print("\nAccuracy: n/a (no validation examples)")

print("\nTraining complete!")

# Save the trained model
output_dir = './RoBERTa_final/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(f"Saving model to {output_dir}")
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Copy the model files to a directory in your Google Drive.
!cp -r ./RoBERTa_final2/ "/content/Drive/My Drive/RoBERTa_right_model2/"

# Testing the model
labels = test_set.label.to_numpy().astype(int)

# Encode the held-out texts the same way as the training data.
input_ids, input_lengths = input_id_maker(test_set, tokenizer)
attention_masks = att_masking(input_ids)

# Convert to tensors. `input_ids` is already a tensor, so reuse it with
# `torch.as_tensor`; `torch.tensor(tensor)` triggers a UserWarning.
prediction_inputs = torch.as_tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

# Set the batch size.
batch_size = 6

# Create the DataLoader (sequential, so predictions stay aligned with `test_set`).
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

print(f'Predicting labels for {len(prediction_inputs):,} test sentences...')
model.eval()

# Per-batch logits and labels, concatenated after the loop.
predictions, true_labels = [], []

for batch in tqdm(prediction_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Without labels the first output is the logits.
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

# Predicted class ids and flattened labels, kept for further inspection.
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

# Accuracy over the whole test split.
accuracy = flat_accuracy(predictions, true_labels)
print(f'Accuracy: {accuracy:.2f}')

100%|██████████| 1044/1044 [00:23<00:00, 43.76it/s]
100%|██████████| 349/349 [00:07<00:00, 44.44it/s]
  train_inputs = torch.tensor(train_input_ids)
  validation_inputs = torch.tensor(validation_input_ids)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training...


100%|██████████| 174/174 [01:53<00:00,  1.54it/s]



Average training loss: 0.70

Running Validation...


100%|██████████| 59/59 [00:13<00:00,  4.41it/s]



Accuracy: 0.38
Training...


100%|██████████| 174/174 [01:52<00:00,  1.54it/s]



Average training loss: 0.69

Running Validation...


100%|██████████| 59/59 [00:13<00:00,  4.47it/s]



Accuracy: 0.64
Training...


100%|██████████| 174/174 [01:52<00:00,  1.55it/s]



Average training loss: 0.69

Running Validation...


100%|██████████| 59/59 [00:13<00:00,  4.48it/s]



Accuracy: 0.61

Training complete!
Saving model to ./RoBERTa_final/
cp: cannot stat './RoBERTa_final2/': No such file or directory


100%|██████████| 348/348 [00:05<00:00, 60.94it/s]
  prediction_inputs = torch.tensor(input_ids)


Predicting labels for 348 test sentences...


100%|██████████| 58/58 [00:12<00:00,  4.49it/s]

    DONE.
Accuracy: 0.59





In [None]:
import shutil
import os
from google.colab import drive

# Pack the saved model directory into a zip archive next to it.
shutil.make_archive(
    base_name="/content/RoBERTa_final",
    format='zip',
    root_dir="/content/RoBERTa_final",
)

# Relocate the archive into Google Drive; the call's return value is the cell output.
shutil.move(
    src="/content/RoBERTa_final.zip",
    dst="/content/Drive/My Drive/RoBERTa_final.zip",
)


'/content/Drive/My Drive/RoBERTa_final.zip'