# In [None]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# One checkpoint name shared by the tokenizer and the model, so the two
# cannot drift apart.
CHECKPOINT = "distilbert-base-uncased"

# Pretrained tokenizer and sequence-classification model for that checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(CHECKPOINT)

# The "stsb" configuration of the "glue" dataset; the rest of the script
# reads its "train" split.
dataset = load_dataset(path="glue", name="stsb")

class STSDataset(Dataset):
    """Tokenized sentence pairs with a soft two-component similarity target.

    Each item is a dict with:
      * ``input_ids`` / ``attention_mask`` -- 1-D tensors of length
        ``max_length`` produced by ``tokenizer`` for the sentence pair;
      * ``label`` -- float32 tensor ``[p, 1 - p]`` where
        ``p = raw_score / max_score``.

    The raw STS-B score is a similarity rating on a 0..``max_score`` scale
    (0..5 for GLUE STS-B), so it has to be rescaled before ``1 - p`` is
    taken; using the raw score directly yields targets outside [0, 1].

    Args:
        dataset: Mapping from split name to an indexable collection of rows
            with ``sentence1``, ``sentence2`` and ``label`` keys.
        tokenizer: Callable accepting two sentences plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments and returning ``input_ids`` / ``attention_mask``
            tensors with a leading batch dimension of 1.
        split: Name of the split to read from ``dataset``.
        max_length: Length every encoded pair is padded/truncated to.
        max_score: Upper bound of the raw label scale, used for rescaling.

    Raises:
        ValueError: If ``max_score`` is not positive.
    """

    def __init__(self, dataset, tokenizer, split="train", max_length=128, max_score=5.0):
        if max_score <= 0:
            raise ValueError("max_score must be positive")
        self.data = dataset[split]
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_score = float(max_score)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pair = self.data[idx]
        encoding = self.tokenizer(
            pair["sentence1"],
            pair["sentence2"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Rescale the raw score into [0, 1] so both components of the target
        # are valid soft labels.
        similar = float(pair["label"]) / self.max_score
        label = torch.tensor([similar, 1.0 - similar], dtype=torch.float32)
        return {
            # squeeze(0) drops only the batch dimension added by
            # return_tensors="pt"; a bare squeeze() would also collapse a
            # sequence dimension of size 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": label,
        }

# Wrap the training split and batch it: 8 sentence pairs per step, reshuffled
# every epoch.
train_dataset = STSDataset(dataset, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# from_pretrained() hands the model back in evaluation mode (dropout off);
# switch to training mode so fine-tuning actually uses dropout.
model.train()

# NOTE(review): this is the AdamW re-exported by `transformers`, which that
# library has deprecated in favour of torch.optim.AdamW -- consider migrating
# the import.
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Start every step from clean gradients.
        optimizer.zero_grad()

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

# Persist the fine-tuned weights and config to a local directory.
model.save_pretrained("fine_tuned_distilbert_sts")


# In [None]:
# Score a student answer against the ideal answer by cosine similarity of
# their embeddings, then map the similarity onto a 0..max_marks scale.
#
# NOTE(review): `ideal`, `student` and `util` are not defined in the visible
# part of this file, and the classifier built in the training cell has no
# `encode` method. Presumably `model` here is a sentence-embedding model and
# `util` its companion helper module from another cell -- confirm.
source_embedding = model.encode(ideal)
comparison_sentences = [student]
comparison_embeddings = model.encode(comparison_sentences)

# Row 0 holds the similarity of the single source embedding to every
# comparison sentence.
similarity_scores = util.pytorch_cos_sim(source_embedding, comparison_embeddings)[0].tolist()

# Similarities outside [min_similarity, max_similarity] are clamped before
# being converted to marks.
max_similarity = 1.0
min_similarity = 0.0
max_marks = 5

similarity_range = max_similarity - min_similarity

for similarity_score in similarity_scores:
    # Clamp with the declared bounds (not a hard-coded 0) so the result is
    # always a float and changing the bounds above takes effect.
    clamped_score = max(min_similarity, min(max_similarity, similarity_score))
    normalized_score = (clamped_score - min_similarity) / similarity_range
    marks = round(normalized_score * max_marks, 1)
    print(f"Similarity Score: {similarity_score}, Marks: {marks}")