In [None]:
%pip install datasets
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('yelp_review_full')

In [None]:
# Summary of the loaded dataset object
print(dataset)

# Bind the training split once instead of indexing `dataset` repeatedly
train_split = dataset['train']

# First five training records
print(train_split[:5])

# Same split as a pandas DataFrame, previewing only its leading rows
df = train_split.to_pandas()
print(df.head())

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

# Hub checkpoint that provides both the tokenizer and the classifier
LIYUAN_CHECKPOINT = "LiYuan/amazon-review-sentiment-analysis"

# Maximum number of tokens (special tokens included) fed to the model per
# review. 512 is the limit the original comments referred to.
# NOTE(review): confirm against the checkpoint's config before changing.
MAX_TOKENS = 512

tokenizer = AutoTokenizer.from_pretrained(LIYUAN_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(LIYUAN_CHECKPOINT)

LiYuan_pipeline = pipeline(model=model, tokenizer=tokenizer, task="sentiment-analysis")

# Materialise the review texts once so the predictions and the "text" column
# are guaranteed to come from the same sequence in the same order
reviews = list(dataset['train']['text'])

# Let the pipeline's own tokenizer do the truncation. The previous manual
# encode -> slice -> decode round trip wrote the special tokens back into the
# text as literal words, and the decoded text could re-tokenize to a different
# length. Passing the whole list also yields a flat list with one result dict
# per review instead of a list of one-element lists.
LiYuan_output = LiYuan_pipeline(reviews, truncation=True, max_length=MAX_TOKENS)

# One row per review: the prediction fields plus the original text
df = pd.DataFrame(LiYuan_output)
df["text"] = reviews
print(df)


In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
import pandas as pd

# Hub checkpoint that provides both the tokenizer and the classifier
LIYUAN_CHECKPOINT = "LiYuan/amazon-review-sentiment-analysis"

# Maximum number of tokens (special tokens included) fed to the model per
# review. 512 is the limit the original comments referred to.
# NOTE(review): confirm against the checkpoint's config before changing.
MAX_TOKENS = 512

# If a GPU is available, move the model to the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(LIYUAN_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(LIYUAN_CHECKPOINT).to(device)

LiYuan_pipeline = pipeline(model=model, tokenizer=tokenizer, task="sentiment-analysis", device=0 if torch.cuda.is_available() else -1)

# Accumulates one result dict per review, in dataset order
LiYuan_output = []

# Number of reviews sent through the model in one forward pass
batch_size = 16

# Materialise the review texts once so the predictions and the "text" column
# are guaranteed to come from the same sequence in the same order
reviews = list(dataset['train']['text'])

# The DataLoader only chunks the texts; shuffling is left off so the output
# order matches `reviews`
data_loader = DataLoader(reviews, batch_size=batch_size)

# Classify one whole batch per pipeline call. Previously each batch was
# unpacked and run through the model one review at a time, which gave no
# batching benefit, and truncation went through an encode -> slice -> decode
# round trip that wrote the special tokens back into the text as literal words.
# Truncation is now done by the pipeline's tokenizer.
for batch_reviews in data_loader:
    LiYuan_output.extend(
        LiYuan_pipeline(
            list(batch_reviews),
            truncation=True,
            max_length=MAX_TOKENS,
            batch_size=batch_size,
        )
    )

# One row per review: the prediction fields plus the original text
df = pd.DataFrame(LiYuan_output)
df["text"] = reviews
print(df)


In [None]:
from transformers import pipeline
import pandas as pd

# Maximum number of tokens (special tokens included) fed to the model per
# review. 512 is the limit the original comments referred to.
# NOTE(review): confirm against the checkpoint's config before changing.
MAX_TOKENS = 512

# Create the model pipeline
model_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")

# Yelp reviews, materialised once so predictions and the "text" column line up
yelp_reviews = list(dataset['train']['text'])

# Truncate with the pipeline's own tokenizer. The earlier version relied on a
# `tokenizer` variable left over from a previous cell (a different checkpoint),
# so this cell failed on a fresh kernel and otherwise measured length with the
# wrong vocabulary. Passing the whole list also yields a flat list with one
# result dict per review.
model_output = model_pipeline(yelp_reviews, truncation=True, max_length=MAX_TOKENS)

# Convert the output to a dataframe including the text
df = pd.DataFrame(model_output)
df["text"] = yelp_reviews
print(df)


In [None]:
# --------------------
# Imports
# --------------------
from transformers import pipeline
import pandas as pd

# Maximum number of tokens (special tokens included) per review.
# NOTE(review): assumes both checkpoints below accept up to 512 tokens - confirm
MAX_TOKENS = 512

# --------------------
# Create the task pipeline
# --------------------
# No checkpoint is named here, so the library picks its default model for the
# sentiment-analysis task
task_pipeline = pipeline(task="sentiment-analysis")

# --------------------
# Create the model pipeline
# --------------------
model_pipeline = pipeline(model="siebert/sentiment-roberta-large-english")

# Yelp reviews, materialised once so predictions and the "text" column line up
yelp_reviews = list(dataset['train']['text'])

# Predict the sentiment of every review with both pipelines. Truncation is
# enabled because the raw reviews were previously passed through unmodified,
# which fails for any review longer than the model's maximum sequence length.
task_output = task_pipeline(yelp_reviews, truncation=True, max_length=MAX_TOKENS)
model_output = model_pipeline(yelp_reviews, truncation=True, max_length=MAX_TOKENS)

# Convert the task pipeline's output to a dataframe including the text;
# `model_output` is left as a plain list for separate inspection
df = pd.DataFrame(task_output)
df["text"] = yelp_reviews
print(df)