In [1]:
def display_dataset_info(dataset):
    info = dataset.info
    dataset_name = info.dataset_name
    splits_info = info.splits
    features = info.features
    print(f"Dataset Name: {dataset_name}")
    print("Splits Info:")
    for split_name, split_info in splits_info.items():
        num_examples = split_info.num_examples
        print(f" - Split: {split_name}, Num Examples: {num_examples}")
    print("Features:")
    for feature_name, feature_info in features.items():
        print(f" - {feature_name}: {feature_info}")

In [2]:
from datasets import load_dataset
import os

def save_dataset(random_sample_size=0, save_path='sampled_datasets'):  
    # Load the dataset
    dataset_name = 'stanfordnlp/imdb'
    dataset = load_dataset(dataset_name)
    display_dataset_info(dataset['test'])
    # Access the train, test splits
    train_dataset = dataset['train']
    test_dataset = dataset['test']

    # Random sample the dataset, only use random_sample_size
    if(random_sample_size != 0):
        train_dataset = train_dataset.shuffle(seed=42).select(range(random_sample_size))
        test_dataset = test_dataset.shuffle(seed=42).select(range(random_sample_size))

    train_save_path = os.path.join(save_path, f"{dataset_name}_train_{random_sample_size}")
    test_save_path = os.path.join(save_path, f"{dataset_name}_test_{random_sample_size}")

    train_dataset.save_to_disk(train_save_path)
    test_dataset.save_to_disk(test_save_path)

save_dataset()

Dataset Name: imdb
Splits Info:
 - Split: train, Num Examples: 25000
 - Split: test, Num Examples: 25000
 - Split: unsupervised, Num Examples: 50000
Features:
 - text: Value(dtype='string', id=None)
 - label: ClassLabel(names=['neg', 'pos'], id=None)


Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [3]:
from datasets import load_from_disk
import textwrap
def load_dataset_from_disk(save_path='sampled_datasets/stanfordnlp', train_path='imdb_train_0', test_path='imdb_test_0'):
    try:
        train_save_path = os.path.join(save_path, train_path)
        test_save_path = os.path.join(save_path, test_path)

        train_dataset = load_from_disk(train_save_path)
        test_dataset = load_from_disk(test_save_path)
        
        print(">>Train Dataset loaded<<")
        # Print a sample from the loaded datasets to verify
        print("Text:", textwrap.fill(train_dataset[0]["text"], width=60), "\nLabel:", train_dataset[0]["label"])
        
        print("\n>>Test Dataset loaded<<")
        # Print a sample from the loaded test datasets to verify
        print("Text:", textwrap.fill(test_dataset[0]["text"], width=60), "\nLabel:", test_dataset[0]["label"])
        return train_dataset, test_dataset
    except Exception as e:
        print(f"An error occurred while loading the datasets: {e}")
    return train_dataset, test_dataset

train_dataset, test_dataset = load_dataset_from_disk()

>>Train Dataset loaded<<
Text: I rented I AM CURIOUS-YELLOW from my video store because of
all the controversy that surrounded it when it was first
released in 1967. I also heard that at first it was seized
by U.S. customs if it ever tried to enter this country,
therefore being a fan of films considered "controversial" I
really had to see this for myself.<br /><br />The plot is
centered around a young Swedish drama student named Lena who
wants to learn everything she can about life. In particular
she wants to focus her attentions to making some sort of
documentary on what the average Swede thought about certain
political issues such as the Vietnam War and race issues in
the United States. In between asking politicians and
ordinary denizens of Stockholm about their opinions on
politics, she has sex with her drama teacher, classmates,
and married men.<br /><br />What kills me about I AM
CURIOUS-YELLOW is that 40 years ago, this was considered
pornographic. Really, the sex and nudity scen

In [8]:
import torch
from transformers import GPT2Tokenizer, GPT2Model
from torch.utils.data import DataLoader

def process_dataset(dataset, model_name="gpt2", key="text", truncation=True, padding=True, max_length=512, use_mean_pooling=True, batch_size=16):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    
    # Set pad_token to eos_token
    tokenizer.pad_token = tokenizer.eos_token

    model = GPT2Model.from_pretrained(model_name)
    model.eval()

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    def embed_texts(batch):
        inputs = tokenizer(batch, truncation=truncation, padding=padding, max_length=max_length, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        if use_mean_pooling:
            embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        else:
            embeddings = outputs.last_hidden_state[:, 0, :]  # Use CLS token (if applicable)
        return embeddings.cpu().numpy()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    
    all_embeddings = []
    all_labels = []

    for batch in dataloader:
        texts = batch[key]
        embeddings = embed_texts(texts)
        all_embeddings.extend(embeddings)
        all_labels.extend(batch['label'].numpy())
    
    return all_embeddings, all_labels

train_embeddings, train_labels = process_dataset(dataset=train_dataset, model_name="gpt2", key="text", 
                                                 truncation=True, padding=True, max_length=512, use_mean_pooling=True)
test_embeddings, test_labels = process_dataset(dataset=test_dataset, model_name="gpt2", key="text", 
                                               truncation=True, padding=True, max_length=512, use_mean_pooling=True)

In [9]:
import textwrap

def modify_dataset(example):
    example['text'] = "" + example['text']
    return example

# Apply the function to the dataset
train_dataset_aug = train_dataset.map(modify_dataset)

# Check the first review
example = train_dataset_aug['text'][0]
print(textwrap.fill(example, width=60))

I rented I AM CURIOUS-YELLOW from my video store because of
all the controversy that surrounded it when it was first
released in 1967. I also heard that at first it was seized
by U.S. customs if it ever tried to enter this country,
therefore being a fan of films considered "controversial" I
really had to see this for myself.<br /><br />The plot is
centered around a young Swedish drama student named Lena who
wants to learn everything she can about life. In particular
she wants to focus her attentions to making some sort of
documentary on what the average Swede thought about certain
political issues such as the Vietnam War and race issues in
the United States. In between asking politicians and
ordinary denizens of Stockholm about their opinions on
politics, she has sex with her drama teacher, classmates,
and married men.<br /><br />What kills me about I AM
CURIOUS-YELLOW is that 40 years ago, this was considered
pornographic. Really, the sex and nudity scenes are few and
far between, eve

In [10]:
from datetime import datetime
import os, json
import numpy as np
def save_embeddings(embeddings, model_name, save_path="data"):
    timestamp = datetime.now().strftime("%m-%d_%H:%M")
    # Calculate the average shape of tensors
    tensor_shapes = [tensor.shape for tensor in embeddings]
    avg_shape = np.mean(tensor_shapes, axis=0).tolist()
    
    embedding_info = {
        'model_name': model_name,
        'num_embeddings': len(embeddings),
        'avg_embedding_shape': avg_shape,
        'created_at': timestamp
    }
    
    os.makedirs(save_path, exist_ok=True)
    embedding_file = os.path.join(save_path, f"{model_name}_embeddings.npy")
    metadata_file = os.path.join(save_path, f"{model_name}_metadata.json")
    
    np.save(embedding_file, embeddings)

    with open(metadata_file, 'w') as f:
        json.dump(embedding_info, f)
    
    print(f"Embeddings and metadata saved for {model_name} at {timestamp}")

In [11]:
import numpy as np
def load_embeddings(embedding_file, metadata_file):
    embeddings = np.load(embedding_file)
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)
    
    print(f"Loaded embeddings from model: {metadata['model_name']}")
    print(f"Number of embeddings: {metadata['num_embeddings']}")
    print(f"Average embedding shape: {metadata['avg_embedding_shape']}")
    print(f"Created at: {metadata['created_at']}")
    
    return embeddings

In [12]:
save_path="data"
model_name="GPT2"
save_embeddings(train_embeddings, model_name, save_path)

Embeddings and metadata saved for GPT2 at 05-28_21:29


In [13]:
save_path="data"
model_name="GPT2"

embedding_file = os.path.join(save_path, f"{model_name}_embeddings.npy")
metadata_file = os.path.join(save_path, f"{model_name}_metadata.json")
load_embeddings(embedding_file,metadata_file)

Loaded embeddings from model: GPT2
Number of embeddings: 25000
Average embedding shape: [768.0]
Created at: 05-28_21:29


array([[ 0.01628887,  0.16235308, -0.36071613, ...,  0.04537376,
        -0.10686247, -0.02329521],
       [ 0.13334286,  0.18759903, -0.33198926, ...,  0.10624611,
        -0.03493104,  0.04586201],
       [ 0.05417421, -0.09070875, -0.07028615, ...,  0.0975593 ,
        -0.09904678,  0.01274938],
       ...,
       [-0.0116824 ,  0.00273553, -0.05081039, ..., -0.02027671,
        -0.13268858, -0.00678555],
       [-0.0541323 ,  0.04904725, -0.20206001, ..., -0.03556992,
        -0.10749355, -0.08424992],
       [-0.02461178, -0.09264515,  0.04751003, ...,  0.11467135,
        -0.14804234,  0.06516315]], dtype=float32)

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Train an SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(train_embeddings, train_labels)

In [15]:
# Predict and evaluate
predicted_labels = svm_model.predict(test_embeddings)
print(classification_report(y_true=test_labels, y_pred=predicted_labels))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90     12500
           1       0.90      0.90      0.90     12500

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000

