# Functions to Save and Read Data in JSON Format Using Python

In [1]:
import json

In [2]:
def save_to_json(data, file_path):
    """
    Save data to a JSON file.
    
    Args:
    - data: Python object (e.g., dict or list) to save.
    - file_path: Path to the JSON file.
    """
    try:
        with open(file_path, "w") as json_file:
            json.dump(data, json_file, indent=4)  # Save with pretty formatting
        print(f"Data successfully saved to {file_path}")
    except Exception as e:
        print(f"Error saving data to JSON: {e}")

In [3]:
def read_from_json(file_path):
    """
    Read data from a JSON file.
    
    Args:
    - file_path: Path to the JSON file.
    
    Returns:
    - The Python object (e.g., dict or list) loaded from the JSON file.
    """
    try:
        with open(file_path, "r") as json_file:
            data = json.load(json_file)
        print(f"Data successfully loaded from {file_path}")
        return data
    except Exception as e:
        print(f"Error reading data from JSON: {e}")
        return None

---

# Tokenization and Feature Extraction for Dataset Processing with RoBERTa Model

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

In [5]:
def tokenize_datasets_from_dict(dataset_dict, tokenizer_name="roberta-base", padding_strategy="max_length", batch_size=16):
    """
    Tokenizes multiple datasets given as dictionaries and returns tokenized outputs with consistent padding.

    Args:
        dataset_dict (dict): A dictionary with dataset names as keys (e.g., 'train', 'dev', 'test')
                            and lists of dictionaries as values. Each dictionary contains text, labels, and metadata.
        tokenizer_name (str): Name of the tokenizer to load (default: "roberta-base").
        padding_strategy (str): Padding strategy, default is "max_length".
                                Options: "max_length", "longest" (custom longest sequence).
        batch_size (int): Size of the batch to process at a time.

    Returns:
        dict: A dictionary with tokenized datasets.
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    max_model_length = tokenizer.model_max_length
    print(f"Maximum model length for {tokenizer_name}: {max_model_length}")

    # Compute the maximum sequence length across all datasets
    max_sequence_length = 0
    for dataset_name, data_list in dataset_dict.items():
        for data_entry in data_list:
            # Assuming the text is in the first key of the dictionary
            text_key = next(iter(data_entry))  # Take the first key in each entry
            text = data_entry[text_key]
            tokens = tokenizer(text, truncation=False, add_special_tokens=True)
            sequence_length = len(tokens["input_ids"])
            max_sequence_length = max(max_sequence_length, sequence_length)

    print(f"Max sequence length in datasets: {max_sequence_length}")

    # Tokenize all datasets with consistent padding
    tokenized_datasets = {}
    for dataset_name, data_list in dataset_dict.items():
        tokenized_list = []
        for i in range(0, len(data_list), batch_size):
            batch = [data_entry[next(iter(data_entry))] for data_entry in data_list[i:i+batch_size]]
            tokenized = tokenizer(
                batch,
                padding=padding_strategy,
                max_length=max_sequence_length if padding_strategy == "max_length" else None,
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt"  # Outputs PyTorch tensors
            )
            tokenized_list.append(tokenized)

        # Combine all tokenized batches into one large dataset
        tokenized_datasets[dataset_name] = {
            'input_ids': torch.cat([t['input_ids'] for t in tokenized_list], dim=0),
            'attention_mask': torch.cat([t['attention_mask'] for t in tokenized_list], dim=0),
            'token_type_ids': torch.cat([t['token_type_ids'] for t in tokenized_list], dim=0) if 'token_type_ids' in tokenized_list[0] else None
        }
        print(f"{dataset_name} tokenized shape: {tokenized_datasets[dataset_name]['input_ids'].shape}")

    return tokenized_datasets

In [6]:
def extract_features_from_datasets(tokenized_datasets, dataset_dict, model_name="roberta-base", batch_size=16):
    """
    Extracts features from tokenized datasets using the pre-trained model.

    Args:
        tokenized_datasets (dict): A dictionary with tokenized datasets (output from the tokenizer function).
        dataset_dict (dict): Original input dataset to align features with labels.
        model_name (str): The model name to load (default: "roberta-base").
        batch_size (int): Size of the batch to process at a time.

    Returns:
        dict: A dictionary with extracted features and corresponding labels for each dataset.
    """
    # Determine the device to use (GPU if available, else CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the pre-trained model and move it to the device
    model = AutoModel.from_pretrained(model_name).to(device)

    # Extract features for each dataset
    features_datasets = {
        "train": [],
        "dev": [],
        "test": []
    }

    for dataset_name, tokenized in tokenized_datasets.items():
        with torch.no_grad():
            # Process in smaller batches to reduce memory consumption
            for i in range(0, tokenized['input_ids'].shape[0], batch_size):
                # Move each batch to the device
                batch_input_ids = tokenized['input_ids'][i:i+batch_size].to(device)
                batch_attention_mask = tokenized['attention_mask'][i:i+batch_size].to(device)

                # Pass the tokenized inputs through the model
                outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)

                # Extract the hidden states (the last hidden state is commonly used for features)
                hidden_states = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

                # Use the [CLS] token's hidden state as the feature vector
                cls_features = hidden_states[:, 0, :]  # Shape: (batch_size, hidden_size)

                # Extract features for each entry in the batch
                for idx, data_entry in enumerate(dataset_dict[dataset_name][i:i+batch_size]):
                    text_key = next(iter(data_entry))  # Extract the first key (text identifier, e.g., "0_0")
                    text = data_entry.get(f"{text_key}", None)
                    label = data_entry.get("label", None)
                    y = data_entry.get("y", None)

                    temp_dict = {
                        text_key: text,
                        f"{text_key}_RoBERTa": cls_features[idx].cpu().tolist(),  # Move tensor to CPU before converting to list
                        "y": y,
                        "label": label
                    }

                    features_datasets[dataset_name].append(temp_dict)

        print(f"{dataset_name} features shape: {cls_features.shape}")

    return features_datasets

---

# Example Usage of Tokenization and Feature Extraction with RoBERTa Model

In [7]:
# Example usage
if __name__ == "__main__":
    # Example input data
    text_data = {
    "train": [
        {
            "0_0": "also I was the point person on my company\u0092s transition from the KL-5 to GR-6 system.",
            "y": 0,
            "label": "neutral"
        },
        {
            "0_1": "You must\u0092ve had your hands full.",
            "y": 0,
            "label": "neutral"
        },
        {
            "0_2": "That I did. That I did.",
            "y": 0,
            "label": "neutral"
        }

        ],
    
    "dev": [
        {
            "0_0": "Oh my God, he\u0092s lost it. He\u0092s totally lost it.",
            "y": 3,
            "label": "sadness"
        },
        {
            "0_1": "What?",
            "y": 1,
            "label": "surprise"
        },
        {
            "1_0": "Or! Or, we could go to the bank, close our accounts and cut them off at the source.",
            "y": 0,
            "label": "neutral"
        }

        ],
    
    "test": [
        {
            "0_0": "Why do all you\u0092re coffee mugs have numbers on the bottom?",
            "y": 1,
            "label": "surprise"
        },
        {
            "0_1": "Oh. That\u0092s so Monica can keep track. That way if one on them is missing, she can be like, \u0091Where\u0092s number 27?!\u0092",
            "y": 6,
            "label": "anger"
        },
        {
            "0_2": "Y'know what?",
            "y": 0,
            "label": "neutral"
        }
    ]
        
    }


    tokenizer_name = "roberta-base"
    tokenized_datasets = tokenize_datasets_from_dict(text_data, tokenizer_name)
    features_datasets = extract_features_from_datasets(tokenized_datasets, text_data, tokenizer_name)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Maximum model length for roberta-base: 512
Max sequence length in datasets: 39
train tokenized shape: torch.Size([3, 39])
dev tokenized shape: torch.Size([3, 39])
test tokenized shape: torch.Size([3, 39])
Using device: cuda


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train features shape: torch.Size([3, 768])
dev features shape: torch.Size([3, 768])
test features shape: torch.Size([3, 768])


---

# Tokenization and Feature Extraction for MELD Text Data with RoBERTa Model

In [8]:
# Example Usage
if __name__ == "__main__":
    # Filepath
    MELD_textual_data_file_path = "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Final Format/MELD_Textual_Data_Cleaned_Processed.json"

    # Read data from JSON
    MELD_textual_data = read_from_json(MELD_textual_data_file_path)

    tokenizer_name = "roberta-base"
    tokenized_MELD_textual_data = tokenize_datasets_from_dict(MELD_textual_data, tokenizer_name)
    textual_features_roberta = extract_features_from_datasets(tokenized_MELD_textual_data, MELD_textual_data, tokenizer_name)

Data successfully loaded from /kaggle/input/meld-emotion-recognition/JSON files/JSON files/Final Format/MELD_Textual_Data_Cleaned_Processed.json
Maximum model length for roberta-base: 512
Max sequence length in datasets: 92
train tokenized shape: torch.Size([9988, 92])
dev tokenized shape: torch.Size([1108, 92])


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


test tokenized shape: torch.Size([2610, 92])
Using device: cuda
train features shape: torch.Size([4, 768])
dev features shape: torch.Size([4, 768])
test features shape: torch.Size([2, 768])


---

# Save the Textual Features Extracted with RoBERTa

In [9]:
# Example Usage
if __name__ == "__main__":
    # Filepath
    textual_features_roberta_file_path = "/kaggle/working/Textual_Features_RoBERTa.json"

    # Save data to JSON
    save_to_json(textual_features_roberta, textual_features_roberta_file_path)

Data successfully saved to /kaggle/working/Textual_Features_RoBERTa.json


---

# Structuring Extracted Textual Features

In [10]:
# Extracted Features from Text Using RoBERTa (Textual_Features_RoBERTa.json)

"""

Textual_Features_RoBERTa = {
    "train": [
        {
            "0_0": "also I was the point person on my company\u0092s transition from the KL-5 to GR-6 system.",
            
            "0_0_RoBERTa": [
            
                -0.09924783557653427, 
                
                .................................................. (so on)
                
            ],
            
            "y": 0,
            
            "label": "neutral"
            
        },
        {
            "0_1": "You must\u0092ve had your hands full.",
            
            "0_1_RoBERTa": [
            
                -0.13033033907413483,
                
               .................................................. (so on)
               
            ],
            
            "y": 0,
            
            "label": "neutral"
            
        },
        {
            "0_2": "That I did. That I did.",
            "0_2_RoBERTa": [
                -0.11354167014360428,

              .................................................. (so on)
              
            ],
            
            "y": 0,
            
            "label": "neutral"
            
        },

        .................................................. (so on)

        
    ],
    "dev": [
        {
            "0_0": "Oh my God, he\u0092s lost it. He\u0092s totally lost it.",
            
            "0_0_RoBERTa": [
            
                -0.1152895987033844,

               .................................................. (so on)
               
            ],
            
            "y": 3,
            
            "label": "sadness"
        },
        {
            "0_1": "What?",
            
            "0_1_RoBERTa": [
            
                -0.08149679750204086,

                .................................................. (so on)
                
            ],
            
            "y": 1,
            
            "label": "surprise"
            
        },
        {
            "1_0": "Or! Or, we could go to the bank, close our accounts and cut them off at the source.",
            
            "1_0_RoBERTa": [
            
                -0.06942201405763626,

                .................................................. (so on)
                
            ],
            
            "y": 0,
            
            "label": "neutral"
            
        },

        .................................................. (so on)

        
    ],
    "test": [
        {
            "0_0": "Why do all you\u0092re coffee mugs have numbers on the bottom?",
            
            "0_0_RoBERTa": [
            
                -0.05271094664931297,

                .................................................. (so on)
                
            ],
            
            "y": 1,
            
            "label": "surprise"
            
        },
        {
            "0_1": "Oh. That\u0092s so Monica can keep track. That way if one on them is missing, she can be like, \u0091Where\u0092s number 27?!\u0092",
            
            "0_1_RoBERTa": [
            
                -0.11013950407505035,

                .................................................. (so on)
                
            ],
            
            "y": 6,
            
            "label": "anger"
            
        },
        {
            "0_2": "Y'know what?",
            
            "0_2_RoBERTa": [
            
                -0.06442241370677948,

                .................................................. (so on)
                
            ],
            
            "y": 0,
            
            "label": "neutral"
            
        },

        .................................................. (so on)

        
    ]
}

"""

'\n\nTextual_Features_RoBERTa = {\n    "train": [\n        {\n            "0_0": "also I was the point person on my company\x92s transition from the KL-5 to GR-6 system.",\n            \n            "0_0_RoBERTa": [\n            \n                -0.09924783557653427, \n                \n                .................................................. (so on)\n                \n            ],\n            \n            "y": 0,\n            \n            "label": "neutral"\n            \n        },\n        {\n            "0_1": "You must\x92ve had your hands full.",\n            \n            "0_1_RoBERTa": [\n            \n                -0.13033033907413483,\n                \n               .................................................. (so on)\n               \n            ],\n            \n            "y": 0,\n            \n            "label": "neutral"\n            \n        },\n        {\n            "0_2": "That I did. That I did.",\n            "0_2_RoBERTa": [\n      