## Please install
pip install transformers sentencepiece datasets torch scikit-learn tqdm numpy

In [1]:
def display_dataset_info(dataset):
    """Print a short summary of a dataset: its name, splits and features.

    Args:
        dataset: Object exposing an ``info`` attribute with ``dataset_name``,
            ``splits`` (mapping of split name to an object carrying
            ``num_examples``) and ``features`` (mapping of feature name to
            its description).

    Returns:
        None. The summary is written to standard output.
    """
    meta = dataset.info
    # Read all three attributes before printing, so an incomplete ``info``
    # object fails before any output is produced.
    name, splits, schema = meta.dataset_name, meta.splits, meta.features

    print(f"Dataset Name: {name}")

    print("Splits Info:")
    for split, details in splits.items():
        print(f" - Split: {split}, Num Examples: {details.num_examples}")

    print("Features:")
    for column, description in schema.items():
        print(f" - {column}: {description}")

In [2]:
from datasets import load_dataset
import os

def save_dataset(random_sample_size=0, save_path='sampled_datasets'):
    """Load the IMDB dataset and save its train and test splits to disk.

    Args:
        random_sample_size: Number of examples to keep from each split after
            a seeded shuffle (seed 42). ``0`` keeps both splits whole and
            unshuffled. A value larger than a split is capped at that
            split's size instead of failing on an out-of-range index.
        save_path: Root output directory. The dataset id contains a "/", so
            the splits are written below a ``stanfordnlp`` subdirectory,
            e.g. ``<save_path>/stanfordnlp/imdb_train_<random_sample_size>``.

    Raises:
        ValueError: If ``random_sample_size`` is negative.
    """
    # Reject nonsense sizes before downloading anything; a negative value
    # would otherwise select an empty range and save empty splits.
    if random_sample_size < 0:
        raise ValueError(
            f"random_sample_size must be >= 0, got {random_sample_size}"
        )

    # Load the dataset
    dataset_name = 'stanfordnlp/imdb'
    dataset = load_dataset(dataset_name)
    display_dataset_info(dataset['test'])
    # Access the train, test splits
    train_dataset = dataset['train']
    test_dataset = dataset['test']

    # Random sample the dataset, only use random_sample_size
    if random_sample_size != 0:
        # Cap per split: the two splits need not have the same length.
        train_size = min(random_sample_size, len(train_dataset))
        test_size = min(random_sample_size, len(test_dataset))
        train_dataset = train_dataset.shuffle(seed=42).select(range(train_size))
        test_dataset = test_dataset.shuffle(seed=42).select(range(test_size))

    # The directory name records the requested sample size (0 = full split).
    train_save_path = os.path.join(save_path, f"{dataset_name}_train_{random_sample_size}")
    test_save_path = os.path.join(save_path, f"{dataset_name}_test_{random_sample_size}")

    train_dataset.save_to_disk(train_save_path)
    test_dataset.save_to_disk(test_save_path)

# Save the full train and test splits: the default random_sample_size=0 skips sampling.
save_dataset()

Dataset Name: imdb
Splits Info:
 - Split: train, Num Examples: 25000
 - Split: test, Num Examples: 25000
 - Split: unsupervised, Num Examples: 50000
Features:
 - text: Value(dtype='string', id=None)
 - label: ClassLabel(names=['neg', 'pos'], id=None)


Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [3]:
from datasets import load_from_disk
import textwrap
def load_dataset_from_disk(save_path='sampled_datasets/stanfordnlp', train_path='imdb_train_0', test_path='imdb_test_0'):
    """Load previously saved train and test splits and print one sample of each.

    Args:
        save_path: Directory that contains the saved split folders.
        train_path: Folder name of the saved train split inside ``save_path``.
        test_path: Folder name of the saved test split inside ``save_path``.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` as returned by ``load_from_disk``.

    Raises:
        Exception: Whatever building the paths or ``load_from_disk`` raises.
            The error is reported on stdout and then re-raised, so the caller
            sees the real cause rather than an ``UnboundLocalError`` from
            returning names that were never assigned.
    """
    try:
        train_save_path = os.path.join(save_path, train_path)
        test_save_path = os.path.join(save_path, test_path)

        train_dataset = load_from_disk(train_save_path)
        test_dataset = load_from_disk(test_save_path)
    except Exception as e:
        print(f"An error occurred while loading the datasets: {e}")
        # Nothing usable to return here; propagate the original failure.
        raise

    # The preview is best-effort: once both splits are loaded, a failure while
    # printing a sample (e.g. an empty split) is reported but must not stop
    # the caller from receiving the datasets.
    try:
        print(">>Train Dataset loaded<<")
        # Print a sample from the loaded datasets to verify
        print("Text:", textwrap.fill(train_dataset[0]["text"], width=60), "\nLabel:", train_dataset[0]["label"])

        print("\n>>Test Dataset loaded<<")
        # Print a sample from the loaded test datasets to verify
        print("Text:", textwrap.fill(test_dataset[0]["text"], width=60), "\nLabel:", test_dataset[0]["label"])
    except Exception as e:
        print(f"An error occurred while loading the datasets: {e}")
    return train_dataset, test_dataset

# Reload the splits from the default location, sampled_datasets/stanfordnlp/imdb_{train,test}_0.
train_dataset, test_dataset = load_dataset_from_disk()

>>Train Dataset loaded<<
Text: I rented I AM CURIOUS-YELLOW from my video store because of
all the controversy that surrounded it when it was first
released in 1967. I also heard that at first it was seized
by U.S. customs if it ever tried to enter this country,
therefore being a fan of films considered "controversial" I
really had to see this for myself.<br /><br />The plot is
centered around a young Swedish drama student named Lena who
wants to learn everything she can about life. In particular
she wants to focus her attentions to making some sort of
documentary on what the average Swede thought about certain
political issues such as the Vietnam War and race issues in
the United States. In between asking politicians and
ordinary denizens of Stockholm about their opinions on
politics, she has sex with her drama teacher, classmates,
and married men.<br /><br />What kills me about I AM
CURIOUS-YELLOW is that 40 years ago, this was considered
pornographic. Really, the sex and nudity scen

In [4]:
import torch
from transformers import pipeline
# Device index handed to the pipeline below: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Feature-extraction pipeline over the "google-t5/t5-base" checkpoint.
# NOTE(review): `model_T5` is not referenced again in the visible cells (process_dataset
# builds its own tokenizer and model) — confirm whether this pipeline is still needed.
model_T5 = pipeline("feature-extraction", model="google-t5/t5-base", device=device)

In [18]:
from transformers import T5Tokenizer, T5Model
import torch
from tqdm import tqdm
import numpy as np
from sklearn.svm import SVC

def process_dataset(dataset, model_name, key="text", truncation=True, padding=True, max_length=512, use_mean_pooling=True):
    """Embed every example of ``dataset`` with a T5 encoder, one vector per example.

    Examples are encoded one at a time. The tokenizer and model are created
    inside this function from ``model_name`` on every call.

    Args:
        dataset: Iterable of mapping-like examples (must support ``[key]``
            and ``.get("label")``).
        model_name: Checkpoint name passed to ``T5Tokenizer.from_pretrained``
            and ``T5Model.from_pretrained``.
        key: Field of each example that holds the text to encode.
        truncation: Forwarded to the tokenizer.
        padding: Forwarded to the tokenizer.
        max_length: Forwarded to the tokenizer.
        use_mean_pooling: When true, average the encoder's last hidden state
            over the sequence axis, weighted by the attention mask; otherwise
            take the hidden state at sequence position 0.

    Returns:
        Tuple ``(embeddings, labels)`` of NumPy arrays. ``embeddings`` holds
        one flattened vector per example; ``labels`` holds the ``"label"``
        value of each example that has one (examples without a label are
        skipped in ``labels`` only).
    """
    # Prefer the GPU when one is available, otherwise run on the CPU.
    target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the tokenizer and move the model to the chosen device.
    tok = T5Tokenizer.from_pretrained(model_name)
    t5 = T5Model.from_pretrained(model_name).to(target)

    vectors, targets = [], []

    for row in tqdm(dataset, desc="Encoding text"):
        sentence = row[key]
        gold = row.get("label", None)

        # Tokenize, then move every returned tensor next to the model.
        encoded = tok(sentence, truncation=truncation, padding=padding, max_length=max_length, return_tensors="pt")
        encoded = {name: tensor.to(target) for name, tensor in encoded.items()}
        token_ids = encoded["input_ids"]
        mask = encoded["attention_mask"]

        # Only the encoder stack is run; gradients are not needed.
        with torch.no_grad():
            hidden = t5.encoder(input_ids=token_ids, attention_mask=mask).last_hidden_state

        if not use_mean_pooling:
            # Keep the hidden state of the first sequence position only.
            pooled = hidden[:, 0, :]
        else:
            # Mask-weighted mean over the sequence axis; the clamp guards
            # against division by zero for an all-zero mask.
            weights = mask.unsqueeze(-1).expand(hidden.size()).float()
            weighted_sum = torch.sum(hidden * weights, 1)
            token_count = torch.clamp(weights.sum(1), min=1e-9)
            pooled = weighted_sum / token_count

        vectors.append(pooled.cpu().numpy().flatten())  # 1-D vector per example
        if gold is not None:
            targets.append(gold)

    return np.array(vectors), np.array(targets)

# Usage: embed the train and test splits with identical settings
# (mean pooling, inputs truncated to 512 tokens). Each call creates its own
# tokenizer and model inside process_dataset.
train_embeddings, train_labels = process_dataset(dataset=train_dataset, model_name="t5-base", key="text", 
                                                 truncation=True, padding=True, max_length=512, use_mean_pooling=True)

test_embeddings, test_labels = process_dataset(dataset=test_dataset, model_name="t5-base", key="text", 
                                               truncation=True, padding=True, max_length=512, use_mean_pooling=True)




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Encoding text: 100%|██████████| 25000/25000 [02:50<00:00, 146.39it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Encoding text: 100%|██████████| 25000/25000 [02:48<00:00, 148.25it/s]


In [19]:
import textwrap

def modify_dataset(example, prefix=""):
    """Prepend ``prefix`` to the ``'text'`` field of a single example.

    With the default empty prefix the text is left unchanged, which matches
    the previous hard-coded behaviour. Pass a different prefix through the
    mapping call, e.g.
    ``train_dataset.map(modify_dataset, fn_kwargs={"prefix": "sentiment: "})``.

    Args:
        example: Mutable mapping with a string ``'text'`` entry. It is
            updated in place.
        prefix: String placed in front of the text.

    Returns:
        The same ``example`` object, with its ``'text'`` entry updated.

    Raises:
        TypeError: If ``example['text']`` cannot be concatenated to a string.
    """
    example['text'] = prefix + example['text']
    return example

# Apply the function to the dataset
train_dataset_aug = train_dataset.map(modify_dataset)

# Check the first review. Index the row first so only one example is read,
# instead of loading the whole 'text' column just to take its first element.
example = train_dataset_aug[0]['text']
print(textwrap.fill(example, width=60))

I rented I AM CURIOUS-YELLOW from my video store because of
all the controversy that surrounded it when it was first
released in 1967. I also heard that at first it was seized
by U.S. customs if it ever tried to enter this country,
therefore being a fan of films considered "controversial" I
really had to see this for myself.<br /><br />The plot is
centered around a young Swedish drama student named Lena who
wants to learn everything she can about life. In particular
she wants to focus her attentions to making some sort of
documentary on what the average Swede thought about certain
political issues such as the Vietnam War and race issues in
the United States. In between asking politicians and
ordinary denizens of Stockholm about their opinions on
politics, she has sex with her drama teacher, classmates,
and married men.<br /><br />What kills me about I AM
CURIOUS-YELLOW is that 40 years ago, this was considered
pornographic. Really, the sex and nudity scenes are few and
far between, eve

In [20]:
from datetime import datetime
import os, json
import numpy as np
def save_embeddings(embeddings, model_name, save_path="data"):
    """Save embeddings as ``.npy`` together with a small JSON metadata file.

    Writes ``<save_path>/<model_name>_embeddings.npy`` and
    ``<save_path>/<model_name>_metadata.json``.

    Args:
        embeddings: Sequence (or array) of per-example embedding arrays.
        model_name: Label used in both file names and stored in the
            metadata. It may contain a path separator (for example
            ``"google-t5/t5-base"``); the files then land in the matching
            subdirectory of ``save_path``, which is created as needed.
        save_path: Output directory, created if it does not exist.

    Returns:
        None. A confirmation line is printed once both files are written.
    """
    timestamp = datetime.now().strftime("%m-%d_%H:%M")
    # Calculate the average shape of tensors
    tensor_shapes = [tensor.shape for tensor in embeddings]
    # With no embeddings there is nothing to average; store an empty shape
    # rather than the NaN that np.mean produces for an empty list.
    avg_shape = np.mean(tensor_shapes, axis=0).tolist() if tensor_shapes else []

    embedding_info = {
        'model_name': model_name,
        'num_embeddings': len(embeddings),
        'avg_embedding_shape': avg_shape,
        'created_at': timestamp
    }

    embedding_file = os.path.join(save_path, f"{model_name}_embeddings.npy")
    metadata_file = os.path.join(save_path, f"{model_name}_metadata.json")

    # Create the directory that will actually hold the files. This is
    # save_path itself for a plain model_name, and a subdirectory of it when
    # model_name contains a path separator.
    target_dir = os.path.dirname(embedding_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    np.save(embedding_file, embeddings)

    with open(metadata_file, 'w') as f:
        json.dump(embedding_info, f)

    print(f"Embeddings and metadata saved for {model_name} at {timestamp}")

In [21]:
import numpy as np
def load_embeddings(embedding_file, metadata_file):
    """Load saved embeddings and print the metadata stored next to them.

    Args:
        embedding_file: Path to the ``.npy`` file holding the embeddings.
        metadata_file: Path to the JSON file with the keys ``model_name``,
            ``num_embeddings``, ``avg_embedding_shape`` and ``created_at``.

    Returns:
        The array read from ``embedding_file``.

    Raises:
        KeyError: If the metadata lacks one of the expected keys.
    """
    vectors = np.load(embedding_file)

    with open(metadata_file, 'r') as handle:
        details = json.load(handle)

    # Report the provenance of what was just loaded, one field per line.
    print(f"Loaded embeddings from model: {details['model_name']}")
    print(f"Number of embeddings: {details['num_embeddings']}")
    print(f"Average embedding shape: {details['avg_embedding_shape']}")
    print(f"Created at: {details['created_at']}")

    return vectors

In [22]:
# Write the training-split embeddings to data/, using the label "T5" in both file names.
save_path="data"
model_name="T5"
save_embeddings(train_embeddings, model_name, save_path)

Embeddings and metadata saved for T5 at 05-28_10:09


In [23]:
# Rebuild the file names that save_embeddings writes, then reload them to check the round trip.
save_path="data"
model_name="T5"

embedding_file = os.path.join(save_path, f"{model_name}_embeddings.npy")
metadata_file = os.path.join(save_path, f"{model_name}_metadata.json")
# The returned array is the cell's last expression, so the notebook displays it.
load_embeddings(embedding_file,metadata_file)

Loaded embeddings from model: T5
Number of embeddings: 25000
Average embedding shape: [768.0]
Created at: 05-28_10:09


array([[-0.13081071, -0.01385116, -0.10628047, ..., -0.02006963,
        -0.04306481, -0.05588393],
       [-0.17791957, -0.07671215, -0.07560244, ..., -0.03279287,
         0.01519491, -0.11346459],
       [-0.20971401,  0.04312025, -0.16568156, ..., -0.11546075,
        -0.05622438, -0.06795987],
       ...,
       [-0.18125014,  0.01628946, -0.17004627, ..., -0.08820023,
        -0.07438255, -0.06964385],
       [-0.16426188,  0.0124423 , -0.02539338, ...,  0.03038486,
        -0.04227945, -0.08596442],
       [-0.21023269, -0.00792499,  0.00213612, ..., -0.07231782,
        -0.11123351, -0.09665132]], dtype=float32)

In [24]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Train an SVM model: a linear-kernel SVC fitted on the training-split
# embeddings and their labels.
svm_model = SVC(kernel='linear')
svm_model.fit(train_embeddings, train_labels)

In [25]:
# Predict and evaluate: classify the test-split embeddings, then print the
# per-class precision, recall and F1 against the true test labels.
predicted_labels = svm_model.predict(test_embeddings)
print(classification_report(y_true=test_labels, y_pred=predicted_labels))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92     12500
           1       0.92      0.92      0.92     12500

    accuracy                           0.92     25000
   macro avg       0.92      0.92      0.92     25000
weighted avg       0.92      0.92      0.92     25000

