In [4]:
from datasets import load_dataset 

data_dir = '/data/postgraduate/wyb/codes/LLMs/data/cornell-movie-review-data/rotten_tomatoes' 
data = load_dataset(data_dir) 
data  

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [None]:
data['train'].shape # (8350, 2) 
print(data['train'][0]) 
print(data['train'][1]) 
print(data['train'][0, 1]) # 这种方式就会同时返回0号和1号的信息 返回的字典里键对应的就是列表了 

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
{'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', 'label': 1}
{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'], 'label': [1, 1]}


In [None]:
import os 
os.environ['cuDA_VISIBLE_DEVICES'] = '0' 

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, pipeline 

model_path = '/data/postgraduate/wyb/codes/LLMs/weights/cardiffnlp/twitter-roberta-base-sentiment-latest' 

tokenizer = AutoTokenizer.from_pretrained(model_path) 
config = AutoConfig.from_pretrained(model_path) 
model = AutoModelForSequenceClassification.from_pretrained(model_path) 

pipe = pipeline(
    "sentiment-analysis", # 要在这加上这个pipeline是针对什么类型的任务 
    model=model,
    tokenizer=tokenizer,  
    return_all_scores=True,
    device='cuda:0'
) 

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file /data/postgraduate/wyb/codes/LLMs/weights/cardiffnlp/twitter-roberta-base-sentiment-latest/config.json
Model config RobertaConfig {
  "_name_or_path": "/data/postgraduate/wyb/codes/LLMs/weights/cardiffnlp/twitter-roberta-base-sentiment-latest",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_l

In [20]:
# run forward in the test dataset 
import numpy as np 
from tqdm import tqdm 
from transformers.pipelines.pt_utils import KeyDataset 

y_pred = [] 
for output in tqdm(pipe(KeyDataset(data['test'], 'text')), total=len(data['test'])): 
    negative_score = output[0]['score'] 
    positive_score = output[2]['score'] 
    assignment = np.argmax([negative_score, positive_score]) 
    y_pred.append(assignment)  

Disabling tokenizer parallelism, we're using DataLoader multithreading already
100%|██████████| 1066/1066 [00:13<00:00, 78.48it/s]


In [23]:
from sklearn.metrics import classification_report 

def evaluate_performance(y_true, y_pred):
    performance = classification_report(
        y_true, y_pred, target_names=['Negative Reviews', 'Positive Reviews']
    )
    print(performance) 

evaluate_performance(data['test']['label'], y_pred)  

                  precision    recall  f1-score   support

Negative Reviews       0.76      0.88      0.81       533
Positive Reviews       0.86      0.72      0.78       533

        accuracy                           0.80      1066
       macro avg       0.81      0.80      0.80      1066
    weighted avg       0.81      0.80      0.80      1066



In [8]:
import torch
import torch.nn.functional as F

from tqdm import tqdm 
from transformers import AutoTokenizer, AutoModel 

import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# load model from the local disk 
model_dir = '/data/postgraduate/wyb/codes/LLMs/weights/sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_dir) 
model = AutoModel.from_pretrained(model_dir) 

# to GPU 
# tokenizer = tokenizer.to('cuda') 
model = model.to('cuda')

# create the embeddings 
sentences_list = [data['train']['text'], data['test']['text']] 
embeddings_list = [] 
for sentences in sentences_list: 
    # tokenize the sentences 
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') 
    encoded_input = encoded_input.to('cuda') 

    # compute the token embeddings 
    with torch.no_grad(): 
        model_output = model(**encoded_input) 

    # perform pooling 
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) 

    # normalize embeddings 
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) 

    # append the embeddings to the list
    embeddings_list.append(sentence_embeddings)

train_embeddings, test_embeddings = embeddings_list 

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.90 GiB. GPU 0 has a total capacty of 10.74 GiB of which 1.09 GiB is free. Including non-PyTorch memory, this process has 9.14 GiB memory in use. Of the allocated memory 8.87 GiB is allocated by PyTorch, and 83.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF