In [None]:
!pip install accelerate -U

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12=

In [None]:
!pip install transformers[torch]
#!pip install transformers[torch] - U



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import random

class TextPairDataset(Dataset):
    """Torch dataset that turns requirement pairs into BERT sentence-pair inputs.

    Each element of ``texts`` is an indexable row laid out as
    ``(desc_1, desc_2, header_1, header_2)``. Element 0 is joined with
    element 2 and element 1 with element 3 (separated by a literal
    ``" [SEP] "``), and the two resulting strings are tokenized as a
    sentence pair.

    Args:
        texts: Sequence of 4-element rows as described above.
        labels: Sequence of class labels, aligned with ``texts``.
        tokenizer: Optional callable tokenizer to reuse. When omitted, the
            ``bert-base-uncased`` tokenizer is loaded, which matches the
            previous behaviour. Passing one in lets several datasets share
            a single tokenizer instance.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of text pairs in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` plus its label under ``'labels'``."""
        row = self.texts[idx]
        # Each description is concatenated with the group header stored two slots after it.
        text_pair_1 = row[0] + " [SEP] " + row[2]
        text_pair_2 = row[1] + " [SEP] " + row[3]
        encoded_pair = self.tokenizer(
            text_pair_1, text_pair_2,
            truncation=True, padding='max_length', max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer returns tensors with a leading batch axis of 1; drop it
        # so the default collator can stack examples into a batch.
        item = {key: val.squeeze(0) for key, val in encoded_pair.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

def compute_metrics(eval_pred):
    """Score an evaluation pass by plain accuracy.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the arg-max over the last logits axis, and the result is returned as a
    one-entry dict keyed ``'accuracy'``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    return {'accuracy': accuracy_score(gold, predicted)}

# --- Load and balance the labelled requirement pairs -------------------------
df = pd.read_csv('/content/drive/MyDrive/SampleDataCSL_TSRCoPandVSA.csv')

# Keep every mapped ('Y') pair and down-sample the unmapped ('N') pairs.
df_y = df[df['Mapped'] == 'Y']
df_n_all = df[df['Mapped'] == 'N']
# Cap at the number of rows available so a small file does not raise;
# random_state is set for reproducibility.
df_n = df_n_all.sample(n=min(400, len(df_n_all)), random_state=42)

# Rows missing any text column cannot be concatenated into model input, so drop them.
df_combined = (
    pd.concat([df_y, df_n])
    .dropna(subset=['RequirementDesc', 'RequirementDesc.1', 'L_ReqGroupHeader', 'R_ReqGroupHeader'])
    .reset_index(drop=True)
)
df_combined['is_match'] = (df_combined['Mapped'] == 'Y').astype(int)

# --- Train/test split ---------------------------------------------------------
# Each row is (desc_1, desc_2, header_1, header_2); TextPairDataset joins
# slot 0 with slot 2 and slot 1 with slot 3.
pair_labels = df_combined['is_match'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    list(zip(df_combined['RequirementDesc'], df_combined['RequirementDesc.1'], df_combined['R_ReqGroupHeader'], df_combined['L_ReqGroupHeader'])),
    pair_labels,
    test_size=0.2,
    random_state=42,
    stratify=pair_labels,              # keep the Y/N ratio identical in both splits
)

train_dataset = TextPairDataset(train_texts, train_labels)
test_dataset = TextPairDataset(test_texts, test_labels)

# --- Initialize model and training ---------------------------------------------
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
training_args = TrainingArguments(
    output_dir='./results',            # Directory where the results and checkpoints will be saved
    num_train_epochs=5,                # Total number of training epochs
    per_device_train_batch_size=32,    # Batch size per device
    # A fixed warmup_steps=250 was longer than the whole run recorded in this
    # notebook (evaluations were logged only up to step 100), so the learning
    # rate never reached its peak. A ratio scales with the dataset size.
    warmup_ratio=0.1,
    weight_decay=0.01,                 # Weight decay
    logging_dir='./logs',              # Directory for storing logs
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",       # Evaluate every `eval_steps`
    eval_steps=50,                     # Explicit; save_steps must be a multiple of this
    save_strategy="steps",             # Save the model every `save_steps`
    save_steps=100,                    # Save the model every 100 steps
    logging_steps=50,                  # Log metrics every 50 steps
    load_best_model_at_end=True        # Load the best model at the end of training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# NOTE(review): the same split is used for checkpoint selection
# (load_best_model_at_end) and for the figure printed below, so this number is
# an optimistic estimate; a separate validation split would remove that bias.
final_results = trainer.evaluate(test_dataset)
print("Test Set Accuracy:", final_results['eval_accuracy'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,0.6831,0.51373,0.757962
100,0.367,0.295651,0.89172


Test Set Accuracy: 0.89171974522293


In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments

# Load new dataset
new_data_path = '/content/drive/MyDrive/DataSet_2024_07_20.csv'  # Update this path to your new data file
new_df = pd.read_csv(new_data_path)

# Filter and sample the data: keep every 'Y' row, down-sample the 'N' rows.
df_y = new_df[new_df['Mapped'] == 'Y']
df_n_all = new_df[new_df['Mapped'] == 'N']
# Cap at the rows available so a file with fewer than 1000 'N' rows does not raise.
df_n = df_n_all.sample(n=min(1000, len(df_n_all)), random_state=42)

# Combine and prepare the data. Rows with a missing text column are dropped
# because TextPairDataset concatenates these fields with `+`, which fails on NaN.
df_combined = (
    pd.concat([df_y, df_n])
    .dropna(subset=['R_RequirementDesc', 'L_RequirementDesc', 'R_ReqGroupHeader', 'L_ReqGroupHeader'])
    .reset_index(drop=True)
)
df_combined['is_match'] = (df_combined['Mapped'] == 'Y').astype(int)

# Splitting the data into training and testing sets.
# TextPairDataset joins slot 0 with slot 2 and slot 1 with slot 3, so the tuple
# must be (R desc, L desc, R header, L header). The previous order put
# L_RequirementDesc in slot 0, pairing each description with the header of the
# opposite side.
# NOTE(review): this assumes the R side comes first, as the comments in the
# dataset class of the first training cell state - confirm.
pair_labels = df_combined['is_match'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    list(zip(df_combined['R_RequirementDesc'], df_combined['L_RequirementDesc'], df_combined['R_ReqGroupHeader'], df_combined['L_ReqGroupHeader'])),
    pair_labels,
    test_size=0.2,  # 20% test data
    random_state=42,
    stratify=pair_labels,  # at most 1000 'N' rows vs. every 'Y' row: keep the class ratio equal in both splits
)

# Load the saved model (checkpoint written by the first training run)
model_path = '/content/results/checkpoint-100'
model = BertForSequenceClassification.from_pretrained(model_path)

# Dataset class
class TextPairDataset(Dataset):
    """Sentence-pair dataset for fine-tuning BERT on requirement matches.

    ``texts`` holds indexable rows of four strings. For each row, element 0 is
    concatenated with element 2 and element 1 with element 3, using a literal
    ``" [SEP] "`` in between, and the two strings are tokenized together as a
    sentence pair.

    Args:
        texts: Sequence of 4-element rows.
        labels: Sequence of class labels, one per row of ``texts``.
        tokenizer: Optional callable tokenizer to reuse across datasets. When
            omitted, ``bert-base-uncased`` is loaded, as before.
        max_length: Fixed length each example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        self.tokenizer = (
            tokenizer if tokenizer is not None
            else BertTokenizer.from_pretrained('bert-base-uncased')
        )

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the pair at ``idx`` and attach its label as ``'labels'``."""
        row = self.texts[idx]
        text_pair_1 = row[0] + " [SEP] " + row[2]
        text_pair_2 = row[1] + " [SEP] " + row[3]
        encoded_pair = self.tokenizer(
            text_pair_1, text_pair_2,
            truncation=True, padding='max_length', max_length=self.max_length,
            return_tensors='pt'
        )
        # Remove the leading batch axis of size 1 added by return_tensors='pt'.
        item = {key: val.squeeze(0) for key, val in encoded_pair.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap both halves of the split in the pair dataset.
train_dataset = TextPairDataset(texts=train_texts, labels=train_labels)
test_dataset = TextPairDataset(texts=test_texts, labels=test_labels)

# Fine-tuning configuration: checkpoints and evaluation happen once per epoch,
# and everything is written to fresh output/log directories.
training_args = TrainingArguments(**{
    'output_dir': './new_results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'warmup_steps': 100,
    'weight_decay': 0.01,
    'logging_dir': './new_logs',
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
    'logging_steps': 10,
})

# Define compute_metrics function to evaluate the model
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for the match class.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over the last logits axis.

    The data for this run keeps every 'Y' row and up to 1000 'N' rows, so
    accuracy alone can look good for a model that rarely predicts a match.
    Precision/recall/F1 for class 1 are reported alongside it. The
    ``'accuracy'`` key is unchanged, so ``eval_accuracy`` is still available
    to callers.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 yields 0.0 instead of a warning when no positive is predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Hand the reloaded checkpoint, both datasets and the metric callback to a Trainer.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Fine-tune, then score the held-out split once more and report its accuracy.
trainer.train()
final_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Set Accuracy:", final_results['eval_accuracy'])




Epoch,Training Loss,Validation Loss,Accuracy
1,0.3878,0.355136,0.875598
2,0.2041,0.320648,0.899522
3,0.1913,0.308463,0.923445


Test Set Accuracy: 0.9234449760765551


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn.functional as F


# --- Load the fine-tuned classifier and its tokenizer ---------------------------
model_path = '/content/results/checkpoint-100'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use the GPU when one is present; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# --- Load and normalise the data -------------------------------------------------
df = pd.read_csv('/content/drive/MyDrive/SampleDataCSL_TSRCoPandVSA.csv')

# Vectorised string ops with fillna: a missing description becomes '' instead
# of raising AttributeError as `.apply(lambda x: x.lower())` does on NaN.
for column in ('RequirementDesc', 'RequirementDesc.1'):
    df[column] = df[column].fillna('').astype(str).str.lower().str.strip()

# Build the inputs the same way TextPairDataset did during training:
# "<description> [SEP] <group header>" for each side. Scoring descriptions
# alone would feed the model inputs it was never trained on.
r_header = df['R_ReqGroupHeader'].fillna('').astype(str)
l_header = df['L_ReqGroupHeader'].fillna('').astype(str)
first_segments = (df['RequirementDesc'] + " [SEP] " + r_header).tolist()
second_segments = (df['RequirementDesc.1'] + " [SEP] " + l_header).tolist()

# Ground-truth labels (anything other than 'N' counts as a match). They are not
# consumed by the inference loop; kept so predictions can be compared afterwards.
labels = torch.tensor((df['Mapped'] != 'N').astype(int).tolist())

# --- Run inference -----------------------------------------------------------------
BATCH_SIZE = 32
all_predictions = []
similarity_scores = []

with torch.no_grad():
    # Tokenize one batch at a time so the full file is never held as a single
    # padded tensor.
    for start in range(0, len(df), BATCH_SIZE):
        stop = start + BATCH_SIZE
        encodings = tokenizer(
            first_segments[start:stop], second_segments[start:stop],
            padding=True, truncation=True, max_length=512, return_tensors="pt",
        ).to(device)
        # Pass the whole encoding: token_type_ids mark the second segment and
        # were supplied during training, so they must be supplied here too.
        logits = model(**encodings).logits

        probs = F.softmax(logits, dim=1)
        match_probabilities = probs[:, 1]  # Assuming class '1' is the match class
        similarity_scores.extend(match_probabilities.cpu().tolist())
        all_predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())

df['IsCorrectMapping'] = ['Yes' if pred == 1 else 'No' for pred in all_predictions]
df['SimilarityScores'] = similarity_scores

# --- Best alternative for rejected pairs --------------------------------------------
# For a rejected row, the alternative is the highest-scoring *other* rejected
# row. That is the same row for everyone except the top scorer itself, so two
# lookups replace a full DataFrame filter per row.
# NOTE(review): this reuses scores of the existing pairs; it does not re-score
# row i against other candidates - confirm this is the intended meaning of
# "best match".
rejected_scores = df.loc[df['IsCorrectMapping'] == 'No', 'SimilarityScores']
top_idx = rejected_scores.idxmax() if len(rejected_scores) > 0 else None
runner_up_idx = (
    rejected_scores.drop(index=top_idx).idxmax() if len(rejected_scores) > 1 else None
)

best_alternative_match = []
for i, (mapping, desc1) in enumerate(zip(df['IsCorrectMapping'], df['RequirementDesc.1'])):
    if mapping == 'No':
        alternative_idx = runner_up_idx if i == top_idx else top_idx
        if alternative_idx is None:
            # Only one rejected row exists, so there is nothing else to suggest.
            best_alternative_match.append(desc1)
        else:
            best_alternative_match.append(df.loc[alternative_idx, 'RequirementDesc.1'])
    else:
        best_alternative_match.append(desc1)

df['BestMatch_RequirementDesc'] = best_alternative_match

df.to_csv('/content/sample_data/Model_SampleDataCSL_TSRCoPandVSA(SimSco).csv', index=False)
print("DataFrame saved")


DataFrame saved
