<a href="https://colab.research.google.com/github/blt-tsp/Fine-tuning-BERT-and-summarization-/blob/main/RoBERTa_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning RoBERTa and preprocessing data


Pipeline for converting binary (`.bin`) files to tensors

In [None]:
!pip install librosa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import numpy as np
import struct
from transformers import RobertaTokenizer

# Define the PCM parameters
bit_depth = 16  # bits per sample; binary_to_pcm scales samples by 2 ** (bit_depth - 1) - 1
sample_rate = 8000  # target sample rate handed to the resampler in binary_to_pcm
frame_size = 0.01  # 10 milliseconds
frame_step = 0.005  # 5 milliseconds
# NOTE(review): frame_size and frame_step are not referenced by any code visible in this notebook.

# Load the RoBERTa tokenizer ('roberta-base' vocabulary); tokenize_text reads this module-level name
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define a function to convert binary data to PCM
def binary_to_pcm(binary_data, source_rate=44100):
    """Interpret raw bytes as 16-bit PCM samples and resample them to ``sample_rate``.

    Args:
        binary_data (bytes): Raw file contents, read as native-endian signed
            16-bit integers. A trailing odd byte is dropped because it
            cannot form a complete sample.
        source_rate (int): Sample rate the input is assumed to have. Defaults
            to 44100, the value that used to be hard-coded in this function.

    Returns:
        numpy.ndarray: int16 samples at the module-level ``sample_rate``,
        scaled to the signed ``bit_depth``-bit range.
    """
    # librosa is pip-installed by the first cell but was never imported,
    # so the original call raised NameError.
    import librosa

    # np.frombuffer raises ValueError when the buffer length is not a
    # multiple of the element size, so keep only whole 2-byte samples.
    usable_length = len(binary_data) - (len(binary_data) % 2)
    int_data = np.frombuffer(binary_data[:usable_length], dtype=np.int16)

    # Normalize the data to the range [-1, 1)
    float_data = int_data / 32768.0

    # orig_sr / target_sr must be passed by keyword in librosa >= 0.10.
    resampled_data = librosa.resample(float_data, orig_sr=source_rate, target_sr=sample_rate)

    # Resampling can overshoot the [-1, 1] range; clip before the integer
    # cast so out-of-range values saturate instead of wrapping around.
    peak = 2 ** (bit_depth - 1) - 1
    pcm_data = (np.clip(resampled_data, -1.0, 1.0) * peak).astype(np.int16)
    return pcm_data

# Define a function to convert PCM data to text
def pcm_to_text(pcm_data):
    """Serialize PCM samples to bytes and keep only the ASCII-decodable part as text.

    Args:
        pcm_data: Iterable of integers that each fit in a signed 16-bit value
            (for example the int16 array returned by ``binary_to_pcm``).

    Returns:
        str: The packed sample bytes decoded as ASCII; bytes that are not
        valid ASCII are silently dropped.

    Raises:
        struct.error: If a sample does not fit in a signed 16-bit integer.
    """
    # Plain Python ints keep struct.pack independent of the input container type.
    samples = [int(sample) for sample in pcm_data]
    # 'h' is a native-endian signed short: two bytes per sample.
    binary_data = struct.pack(f'{len(samples)}h', *samples)
    # bytes objects have no .encode(); turning bytes into text is a decode.
    text_data = binary_data.decode('ascii', 'ignore')
    return text_data

# Define a function to tokenize text data
def tokenize_text(text_data):
    """Encode ``text_data`` with the module-level RoBERTa tokenizer.

    Special tokens are added and the sequence is truncated to at most
    512 token ids. Returns whatever ``tokenizer.encode`` returns.
    """
    return tokenizer.encode(
        text_data,
        max_length=512,
        truncation=True,
        add_special_tokens=True,
    )

# Define a function to prepare the data for a single binary file
def prepare_data_for_file(file_path):
    """Run one binary file through the bytes -> PCM -> text -> token-id pipeline.

    Args:
        file_path: Path of the file to read in binary mode.

    Returns:
        The token ids produced by ``tokenize_text`` for this file.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()

    # Each stage feeds the next: binary_to_pcm -> pcm_to_text -> tokenize_text.
    return tokenize_text(pcm_to_text(binary_to_pcm(raw_bytes)))

# Define a function to prepare the data for all binary files in a directory
def prepare_data_for_directory(directory_path):
    """Tokenize every ``.bin`` file in a directory and stack the results into one tensor.

    Files are processed in sorted name order so row ``i`` of the result always
    refers to the same file. Sequences shorter than the longest one are
    right-padded with the tokenizer's pad token id, because a tensor cannot
    be built from rows of different lengths.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        torch.Tensor: int64 tensor of shape ``(num_files, longest_sequence)``;
        shape ``(0, 0)`` when the directory contains no ``.bin`` file.
    """
    # torch is only imported in a later cell of the notebook.
    import torch

    # os.listdir returns names in arbitrary order; sort for reproducibility.
    file_list = sorted(
        name for name in os.listdir(directory_path)
        if name.endswith('.bin') and os.path.isfile(os.path.join(directory_path, name))
    )

    tokenized_data_list = [
        list(prepare_data_for_file(os.path.join(directory_path, name)))
        for name in file_list
    ]

    if not tokenized_data_list:
        return torch.empty((0, 0), dtype=torch.long)

    # tokenize_text truncates but does not pad, so lengths generally differ.
    longest = max(len(token_ids) for token_ids in tokenized_data_list)
    pad_id = tokenizer.pad_token_id
    padded = [
        token_ids + [pad_id] * (longest - len(token_ids))
        for token_ids in tokenized_data_list
    ]

    tensor_data = torch.tensor(padded, dtype=torch.long)
    return tensor_data


Fine-tuning on some of our XML files


In [None]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import xml.etree.ElementTree as ET

# `os` is only imported in an earlier cell; import it here so this cell
# also runs on its own.
import os

# Load the labeled dataset
# NOTE: torch.load unpickles the file - only load tensors from a trusted source.
input_tensors = torch.load('input_tensors.pt')
# os.listdir returns bare file names in arbitrary order: join them with the
# directory so ET.parse can find them, and sort so the i-th target is stable.
# NOTE(review): presumably target i is meant to pair with row i of
# input_tensors - confirm the file naming guarantees that.
target_outputs = [
    ET.parse(os.path.join('target_outputs', xml_file)).getroot()
    for xml_file in sorted(os.listdir('target_outputs'))
    if xml_file.endswith('.xml')
]

# Load the pre-trained RoBERTa model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Define a function to train the model
def train_model(input_tensors, target_outputs, model, tokenizer, optimizer, loss_fn, num_epochs=10):
    """Train ``model`` on ``input_tensors`` against tokenized XML targets.

    Args:
        input_tensors (torch.Tensor): Model inputs, one row per example.
        target_outputs: Sequence of ``xml.etree.ElementTree.Element`` roots,
            one per example. Each is serialized and tokenized to form the target.
        model: Callable module whose output exposes ``.logits``.
        tokenizer: Object with an ``encode(text, add_special_tokens=...)`` method.
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        num_epochs (int): Number of passes over the data.

    Prints, after every epoch, the mean of the per-batch losses of that epoch
    (the original printed only the loss of whichever batch happened to be last).
    """
    # Set the model to training mode
    model.train()

    # Tokenize the target outputs
    tokenized_target_outputs = [
        torch.tensor(tokenizer.encode(ET.tostring(target_output).decode('utf-8'), add_special_tokens=True))
        for target_output in target_outputs
    ]

    # NOTE(review): torch.stack requires every tokenized target to have the
    # same length, and the targets built here are token-id sequences. The
    # notebook passes a sequence-classification model and CrossEntropyLoss,
    # which presumably expect one class index per example - confirm how the
    # XML trees should be mapped to labels.
    dataset = torch.utils.data.TensorDataset(input_tensors, torch.stack(tokenized_target_outputs))

    # Define the DataLoader
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

    # Train the model for the specified number of epochs
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0

        for input_ids, target_ids in dataloader:
            optimizer.zero_grad()

            outputs = model(input_ids)
            loss = loss_fn(outputs.logits, target_ids)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        # Report the epoch average rather than the last batch's loss.
        print(f"Epoch {epoch + 1}: Loss = {epoch_loss / max(num_batches, 1)}")

# Fine-tune on the labeled dataset; arguments are spelled out by keyword
# so the long call stays readable.
train_model(
    input_tensors=input_tensors,
    target_outputs=target_outputs,
    model=model,
    tokenizer=tokenizer,
    optimizer=optimizer,
    loss_fn=loss_fn,
    num_epochs=10,
)

# Write the fine-tuned model to the 'fine_tuned_model' directory
model.save_pretrained('fine_tuned_model')


FileNotFoundError: ignored

Distance between two trees

In [None]:
def tree_distance(tree1, tree2):
    """
    Calculates the distance between two telecom trees.

    The distance is defined recursively:

    * 0 if both trees serialize to the same XML;
    * the absolute difference in the number of direct children if that
      number differs;
    * otherwise 1 if the two nodes themselves differ (tag, attributes or
      stripped text), plus the summed distances of the children paired by
      position.

    The node-level term is what makes two same-shaped trees with different
    content (e.g. ``<a>1</a>`` vs ``<a>2</a>``) come out at a non-zero
    distance; without it they were reported as distance 0.

    Args:
        tree1 (xml.etree.ElementTree.Element): The root node of the first tree.
        tree2 (xml.etree.ElementTree.Element): The root node of the second tree.

    Returns:
        int: The distance between the two trees (0 for identical trees).
    """
    # If the trees are identical, the distance is 0
    if ET.tostring(tree1) == ET.tostring(tree2):
        return 0

    # Different fan-out: the distance is the difference in child count
    if len(tree1) != len(tree2):
        return abs(len(tree1) - len(tree2))

    def _node_signature(node):
        # Text is stripped so pure indentation differences are not counted.
        return (node.tag, dict(node.attrib), (node.text or '').strip())

    own_cost = 0 if _node_signature(tree1) == _node_signature(tree2) else 1

    # Same fan-out: add up the distances of the positionally paired children
    return own_cost + sum(tree_distance(child1, child2) for child1, child2 in zip(tree1, tree2))



This script loads the fine-tuned RoBERTa model and its tokenizer, then loads the reference telecom trees from a directory of XML files. It prepares an input tensor, tokenizes it, runs the model on it, converts the output to a softmax probability distribution, and takes the most probable class as the prediction. The telecom protocol corresponding to the predicted class is then loaded from an XML file and compared with every reference telecom tree using the tree distance defined above; the reference tree with the smallest distance is selected. Finally, the script prints the XML representation of the matched telecom tree.




In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import os
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import xml.etree.ElementTree as ET

# Load the fine-tuned model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('fine_tuned_model')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load the reference telecom trees from XML files.
# Sorted so that ties between equally distant trees resolve the same way on every run.
ref_trees = []
for file in sorted(os.listdir('reference_trees')):
    if file.endswith('.xml'):
        ref_trees.append(ET.parse(os.path.join('reference_trees', file)).getroot())
if not ref_trees:
    # Without this guard the script would only fail at the very end, on ET.tostring(None).
    raise FileNotFoundError("No .xml reference trees found in 'reference_trees'")

# Prepare the input tensor for prediction
input_data = b'\x00\x01\x02\x03'
# torch.tensor cannot be built from a bytes object; expand it into its integer byte values.
input_tensor = torch.tensor([list(input_data)])

# Tokenize the input tensor
# NOTE(review): a list of ints is presumably treated by tokenizer.encode as
# already-converted token ids - confirm this is the intended encoding of raw
# bytes (the preprocessing cell goes through prepare_data_for_file instead).
input_ids = tokenizer.encode(input_tensor[0].tolist(), add_special_tokens=True, truncation=True, padding=True, max_length=512, return_tensors='pt')

# Make a prediction with the model (no gradients are needed for inference)
with torch.no_grad():
    output = model(input_ids)[0]

# Convert the output tensor to a softmax probability distribution
probs = torch.nn.functional.softmax(output, dim=-1)

# Get the predicted class
predicted_class = torch.argmax(probs, dim=-1).item()

# Load the predicted telecom protocol from an XML file
predicted_protocol = ET.parse(os.path.join('predicted_protocols', f'protocol_{predicted_class}.xml')).getroot()

# Calculate the distance between the predicted telecom protocol and each reference telecom tree
# (tree_distance is defined in an earlier cell of this notebook)
min_distance = float('inf')
matched_tree = None
for ref_tree in ref_trees:
    distance = tree_distance(predicted_protocol, ref_tree)
    if distance < min_distance:
        min_distance = distance
        matched_tree = ref_tree

# Print the matched telecom tree
print(ET.tostring(matched_tree))


OSError: ignored

Implementing some classes to manipulate the data and run inference on GPU


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import xml.etree.ElementTree as ET
import torch.nn as nn


def train_test_split(data, test_size, batch_size, device):
    """Randomly split ``data`` and wrap both parts in DataLoaders.

    Args:
        data: Dataset to split.
        test_size (int): Number of examples placed in the test split; the
            remaining ``len(data) - test_size`` go to the training split.
        batch_size (int): Batch size used by both loaders.
        device: Accepted for interface compatibility; not used by this function.

    Returns:
        tuple: ``(train_loader, test_loader)``. Only the training loader
        shuffles; both are created with ``pin_memory=True``.
    """
    n_train = len(data) - test_size
    train_subset, test_subset = torch.utils.data.random_split(data, [n_train, test_size])

    shared_options = dict(batch_size=batch_size, pin_memory=True)
    return (
        DataLoader(train_subset, shuffle=True, **shared_options),
        DataLoader(test_subset, shuffle=False, **shared_options),
    )


class TelecomDataset(Dataset):        # binary flow dataset
    """Dataset of raw byte strings, one per ``.bin`` file found in a directory.

    All files are read eagerly at construction time. Files are loaded in
    sorted name order so that index ``i`` always refers to the same file;
    ``os.listdir`` alone returns names in arbitrary order.
    """

    def __init__(self, data_dir):
        """Read every ``.bin`` file directly inside ``data_dir`` into memory."""
        self.data_dir = data_dir
        self.data = []
        for file in sorted(os.listdir(data_dir)):
            if file.endswith('.bin'):
                with open(os.path.join(data_dir, file), 'rb') as f:
                    self.data.append(f.read())

    def __len__(self):
        """Number of ``.bin`` files that were loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the raw bytes of the ``idx``-th file (sorted by name)."""
        return self.data[idx]


class TelecomTreeDataset(Dataset):    # telecom tree dataset
    """Dataset of parsed XML root elements, one per ``.xml`` file in a directory.

    All files are parsed eagerly at construction time. Files are loaded in
    sorted name order so that index ``i`` always refers to the same file;
    ``os.listdir`` alone returns names in arbitrary order.
    """

    def __init__(self, data_dir):
        """Parse every ``.xml`` file directly inside ``data_dir``."""
        self.data_dir = data_dir
        self.trees = []
        for file in sorted(os.listdir(data_dir)):
            if file.endswith('.xml'):
                self.trees.append(ET.parse(os.path.join(data_dir, file)).getroot())

    def __len__(self):
        """Number of XML trees that were loaded."""
        return len(self.trees)

    def __getitem__(self, idx):
        """Return the root element of the ``idx``-th tree (sorted by file name)."""
        return self.trees[idx]

class TelecomTreeNet(nn.Module):
    """RoBERTa encoder followed by a small two-layer classification head.

    Args:
        num_classes (int): Size of the output layer (number of logits).
    """

    def __init__(self, num_classes):
        super(TelecomTreeNet, self).__init__()
        # Only specific names are imported from `transformers` in this cell,
        # so the bare module name `transformers` is not in scope here.
        from transformers import RobertaModel

        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.num_classes = num_classes                        # to be discussed with Aurélien
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, self.num_classes)
        )

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits for a batch of token ids.

        Args:
            input_ids: Token ids passed to the RoBERTa encoder.
            attention_mask: Optional mask forwarded to the encoder. It defaults
                to ``None`` so the model can also be called as ``model(inputs)``,
                as the training loop in this notebook does.

        Returns:
            The output of the classifier head applied to the encoder's ``pooler_output``.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)
        return logits


class TelecomTreeMatcher:
    """Predict a protocol class for binary inputs and pick the closest reference tree.

    Args:
        model_dir: Directory of a saved ``RobertaForSequenceClassification`` model.
        tree_dir: Directory containing the reference trees as ``.xml`` files.
        train_data: Optional dataset used by ``train``; may also be assigned
            later through the ``train_data`` attribute.
        val_data: Optional validation dataset used by ``train``.
    """

    def __init__(self, model_dir, tree_dir, train_data=None, val_data=None):
        # predict() and train() read self.device, which was never defined before.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = RobertaForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        # Remembered so train() can size the TelecomTreeNet head.
        self.num_classes = self.model.config.num_labels
        # train() reads these two attributes; they were never initialised before.
        self.train_data = train_data
        self.val_data = val_data
        self.ref_trees = []
        # Sorted so that ties between equally distant trees resolve deterministically.
        for file in sorted(os.listdir(tree_dir)):
            if file.endswith('.xml'):
                self.ref_trees.append(ET.parse(os.path.join(tree_dir, file)).getroot())

    def preprocess(self, data):
        """Encode each item of ``data`` and return a list of ``(1, seq_len)`` id tensors.

        Items may be ``bytes`` objects, sequences of ints or tensors.
        NOTE(review): the integer values are handed to ``tokenizer.encode`` as a
        list of ints, which is presumably interpreted as token ids - confirm
        this is the intended encoding for raw bytes.
        """
        input_ids = []
        for d in data:
            # torch.tensor([d]) fails for bytes objects; expand into ints instead.
            values = d.tolist() if torch.is_tensor(d) else list(d)
            input_id = self.tokenizer.encode(values, add_special_tokens=True, truncation=True,
                                             padding=True, max_length=512, return_tensors='pt')
            input_ids.append(input_id)
        return input_ids

    def match(self, data):
        """Return the serialized reference tree closest to the predicted protocol(s).

        Every predicted protocol is compared with every reference tree and the
        reference tree with the smallest distance overall is returned (the
        previous code compared the i-th prediction with the i-th reference only).

        Returns:
            bytes: ``ET.tostring`` of the best-matching reference tree.

        Raises:
            ValueError: If there is no input item or no reference tree.
        """
        if not self.ref_trees:
            raise ValueError('No reference trees were loaded')
        input_ids = self.preprocess(data)
        predicted_classes = self.predict(input_ids)
        predicted_protocols = [ET.parse(os.path.join('predicted_protocols', f'protocol_{c}.xml')).getroot() for c in
                               predicted_classes]
        if not predicted_protocols:
            raise ValueError('No input data to match')

        matched_tree = None
        min_distance = float('inf')
        for protocol in predicted_protocols:
            for ref_tree in self.ref_trees:
                distance = self.tree_distance(protocol, ref_tree)
                if distance < min_distance:
                    min_distance = distance
                    matched_tree = ref_tree
        return ET.tostring(matched_tree)

    @staticmethod
    def tree_distance(t1, t2):
        """Recursive distance between two XML trees.

        0 for identical serializations; the difference in child count when the
        fan-out differs; otherwise 1 if the nodes themselves differ (tag,
        attributes or stripped text) plus the distances of the positionally
        paired children.
        """
        if ET.tostring(t1) == ET.tostring(t2):
            return 0
        if len(t1) != len(t2):
            return abs(len(t1) - len(t2))
        same_node = (
            t1.tag == t2.tag
            and dict(t1.attrib) == dict(t2.attrib)
            and (t1.text or '').strip() == (t2.text or '').strip()
        )
        children_cost = sum(TelecomTreeMatcher.tree_distance(c1, c2) for c1, c2 in zip(t1, t2))
        return (0 if same_node else 1) + children_cost

    def train(self, epochs, batch_size, patience=3):
        """Train a fresh ``TelecomTreeNet`` with early stopping and install it as ``self.model``.

        Batches are expected to unpack as ``(inputs, labels)``.

        Raises:
            ValueError: If ``train_data`` or ``val_data`` has not been provided.
        """
        if self.train_data is None or self.val_data is None:
            raise ValueError('train_data and val_data must be set before calling train()')

        train_loader = DataLoader(self.train_data, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(self.val_data, batch_size=batch_size, shuffle=False)

        # TelecomTreeNet requires the number of output classes.
        model = TelecomTreeNet(self.num_classes)
        model = nn.DataParallel(model)  # Distribute model across multiple GPUs
        model.to(self.device)

        optimizer = torch.optim.Adam(model.parameters())
        # NOTE(review): MSELoss is kept from the original code. The rest of the
        # notebook trains classifiers with CrossEntropyLoss - confirm what format
        # the labels have before changing it.
        criterion = nn.MSELoss()

        best_loss = float('inf')
        early_stop_count = 0

        for epoch in range(epochs):
            train_loss = 0
            val_loss = 0

            # Training loop
            model.train()
            for batch in train_loader:
                optimizer.zero_grad()
                inputs, labels = batch
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                # NOTE(review): an all-ones mask attends to every position;
                # padded batches would need a real attention mask.
                outputs = model(inputs, torch.ones_like(inputs))
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            # Validation loop
            model.eval()
            with torch.no_grad():
                for batch in val_loader:
                    inputs, labels = batch
                    inputs = inputs.to(self.device)
                    labels = labels.to(self.device)
                    outputs = model(inputs, torch.ones_like(inputs))
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()

            # Early stopping
            if val_loss < best_loss:
                best_loss = val_loss
                early_stop_count = 0
            else:
                early_stop_count += 1
                if early_stop_count >= patience:
                    print(f'Validation loss did not improve for {patience} epochs. Training stopped.')
                    break

            print(f'Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}')

        self.model = model.module  # Get the underlying model after training

    def predict(self, tensor_data):
        """Return the predicted class index for every input sequence.

        Args:
            tensor_data: Either one ``(batch, seq_len)`` id tensor or a list of
                such tensors (as returned by ``preprocess``).

        Returns:
            list[int]: One arg-max class index per input sequence. The original
            body called ``self._tensor_to_tree``, which is not defined anywhere
            in this class; ``match`` consumes the result as class indices.
        """
        batches = [tensor_data] if torch.is_tensor(tensor_data) else list(tensor_data)
        predicted_classes = []
        self.model.eval()
        with torch.no_grad():
            for batch in batches:
                inputs = batch.to(self.device)
                outputs = self.model(inputs, attention_mask=torch.ones_like(inputs))
                # RobertaForSequenceClassification returns an object with .logits;
                # TelecomTreeNet (installed by train()) returns the logits tensor itself.
                logits = outputs.logits if hasattr(outputs, 'logits') else outputs
                predicted_classes.extend(torch.argmax(logits, dim=-1).tolist())
        return predicted_classes

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import transformers
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

class SDHProtocolTrainer:
    """Training loop with linear LR decay, gradient clipping and early stopping.

    Batches from both datasets are expected to unpack as
    ``(input_ids, attention_mask, labels)`` and the model is called as
    ``model(input_ids, attention_mask)`` and must return class logits.

    Args:
        model: The ``nn.Module`` to train; it is moved to the selected device.
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for validation / early stopping.
        batch_size (int): Batch size for both loaders.
        learning_rate (float): Initial learning rate for AdamW.
        num_epochs (int): Maximum number of epochs.
        patience (int): Number of epochs without validation-loss improvement
            after which training stops.
    """

    def __init__(self, model, train_dataset, val_dataset, batch_size, learning_rate, num_epochs, patience=3):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Batches are moved to self.device, so the model has to live there too.
        self.model = model.to(self.device)
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.patience = patience
        # transformers.AdamW is deprecated in favour of torch.optim.AdamW.
        # eps / weight_decay are set to the defaults of the transformers version.
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
        )
        self.loss_fn = nn.CrossEntropyLoss()
        # The schedule is advanced once per optimizer step (see train_epoch), so
        # it must be sized in batches; ceil-division counts the last partial batch.
        steps_per_epoch = max(1, -(-len(self.train_dataset) // self.batch_size))
        self.scheduler = transformers.get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=steps_per_epoch * self.num_epochs
        )

    def train(self):
        """Run the full training loop, saving the best weights to 'best_model.pth'."""
        best_val_loss = np.inf
        epochs_since_improvement = 0
        train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        val_loader = DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)
        for epoch in range(1, self.num_epochs + 1):
            print(f'Epoch {epoch}/{self.num_epochs}:')
            train_loss = self.train_epoch(train_loader)
            val_loss, val_acc = self.evaluate(val_loader)
            print(f'Training loss: {train_loss:.4f}, Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}')
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_since_improvement = 0
                torch.save(self.model.state_dict(), 'best_model.pth')
                print('Saved new best model')
            else:
                epochs_since_improvement += 1
                if epochs_since_improvement >= self.patience:
                    print(f'Validation loss did not improve for {self.patience} epochs. Training stopped.')
                    break

    def train_epoch(self, train_loader):
        """Train for one pass over ``train_loader`` and return the mean batch loss."""
        self.model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            labels = labels.to(self.device)
            self.optimizer.zero_grad()
            logits = self.model(input_ids, attention_mask)
            loss = self.loss_fn(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            # Advance the LR schedule once per optimizer step.
            self.scheduler.step()
            train_loss += loss.item()
        return train_loss / len(train_loader)


    def evaluate(self, dataloader):
        """Evaluate on ``dataloader``, print metrics and return ``(mean_loss, accuracy)``.

        Uses the same ``(input_ids, attention_mask, labels)`` batch layout as
        ``train_epoch``. ``train`` unpacks the two returned values; the previous
        implementation returned nothing.
        """
        self.model.eval()
        y_true = []
        y_pred = []
        total_loss = 0.0
        with torch.no_grad():
            for batch in dataloader:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)
                labels = labels.to(self.device)
                logits = self.model(input_ids, attention_mask)
                total_loss += self.loss_fn(logits, labels).item()
                preds = torch.argmax(logits, dim=1)
                y_true.extend(labels.cpu().numpy().tolist())
                y_pred.extend(preds.cpu().numpy().tolist())

        accuracy = accuracy_score(y_true, y_pred)
        # zero_division=0 avoids warnings for classes that are never predicted.
        precision, recall, f1_score, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted', zero_division=0
        )
        print('Accuracy: {:.4f}'.format(accuracy))
        print('Precision: {:.4f}'.format(precision))
        print('Recall: {:.4f}'.format(recall))
        print('F1 Score: {:.4f}'.format(f1_score))

        val_loss = total_loss / max(len(dataloader), 1)
        return val_loss, accuracy


## Some visualisations


Bar chart of encapsulated protocols

In [None]:
import matplotlib.pyplot as plt

from collections import Counter

# Tally every protocol in a single pass instead of calling .count() once per
# distinct value, and keep the labels in a sorted list: a list (unlike a set)
# is a valid, ordered x argument for plt.bar.
protocol_counts = Counter(encapsulated_protocols)
unique_protocols = sorted(protocol_counts)
counts = [protocol_counts[proto] for proto in unique_protocols]

plt.bar(unique_protocols, counts)
plt.title('Distribution of Encapsulated Protocols')
plt.xlabel('Encapsulated Protocols')
plt.ylabel('Count')
plt.show()


Distributions of frame lengths

In [None]:
# Histogram of the SDH frame lengths (50 bins), drawn through the
# explicit figure/axes interface.
fig, ax = plt.subplots()
ax.hist(sdh_frame_lengths, bins=50)
ax.set_title('Distribution of SDH Frame Lengths')
ax.set_xlabel('SDH Frame Lengths')
ax.set_ylabel('Count')
plt.show()

Correlation Matrix

In [None]:
import numpy as np
import seaborn as sns

# np.corrcoef treats rows as variables, hence the transpose of `features`.
corr_matrix = np.corrcoef(features.T)

# Annotated heatmap drawn on an explicit axes object.
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

PCA Analysis

In [None]:
from sklearn.decomposition import PCA

import numpy as np

# Project the features onto their first two principal components.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features)

# scatter's `c` argument needs numbers (or colour specs). Encoding the labels
# as integer codes makes the plot work for string labels as well as numeric ones.
# NOTE(review): presumably encapsulated_protocols holds one category label per
# row of `features` (the bar-chart cell counts them by value) - confirm.
_, protocol_codes = np.unique(np.asarray(encapsulated_protocols), return_inverse=True)

plt.scatter(pca_features[:,0], pca_features[:,1], c=protocol_codes)
plt.title('PCA Scatterplot of Encapsulated Protocols')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Assuming 'model' is your trained model and 'X_test' and 'y_test' are your test set
# NOTE(review): `model` must expose a scikit-learn style predict(X) method and
# `X_test` / `y_test` must already be defined; none of the three is created
# for this purpose in the visible notebook - confirm where they come from.
y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts as-is; the default annotation format
# switches to scientific notation for counts of 100 and above.
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='coolwarm')
plt.title('Confusion Matrix of Test Set')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


## SHAP implementation to gain insights from our outputs

In [20]:
!pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.6/572.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [23]:
import shap

class TreeExplainer:
    """Wrapper that builds a ``shap.Explainer`` for a model and precomputes SHAP values.

    Attributes set at construction: ``model``, ``data``, ``explainer`` (the
    ``shap.Explainer`` built from ``model``) and ``shap_values`` (the result
    of calling that explainer on ``data``).
    """

    def __init__(self, model, data):
        self.model, self.data = model, data
        self.explainer = shap.Explainer(self.model)
        self.shap_values = self.explainer(self.data)

    def visualize(self, idx):
        """Draw a SHAP waterfall plot for the explanation at index ``idx``."""
        selected = self.shap_values[idx]
        shap.plots.waterfall(selected)

# create an explainer object for the trained model
# NOTE(review): this uses shap's own TreeExplainer, not the TreeExplainer wrapper
# class defined above in this cell (which is never instantiated here).
# shap.TreeExplainer is presumably meant for tree-based models - confirm that
# `model` is one at this point.
explainer = shap.TreeExplainer(model)

# get a sample of the test data to compute SHAP values for
# NOTE(review): `test_data` is not defined anywhere in the visible notebook;
# a NameError is recorded in this cell's output.
sample = test_data[0:100]

# compute the SHAP values for the sample data
shap_values = explainer.shap_values(sample)

# summarize the SHAP values for the first sample instance
# NOTE(review): the trailing `...` is a literal Ellipsis placeholder inside the
# list - replace the list with the real feature names before running.
shap.summary_plot(shap_values[0], feature_names=['feat1', 'feat2', 'feat3', ...])


NameError: ignored