In [18]:
pip install transformers torch pandas tqdm matplotlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install transformers[torch]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [20]:
pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [21]:
import accelerate
import torch

# Print the installed accelerate and torch versions, in that order.
for library in (accelerate, torch):
    print(library.__version__)

1.6.0
2.4.0


In [None]:
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.cuda.amp as amp
import torch.nn as nn
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction

# --- Input and output locations -------------------------------------------
infile = "preprocessed_combined.tsv"
outfile = "combined_out.csv"
plot_bar = "combined_fine_top_five_products_bar.png"
plot_scatter = "combined_fine_reviews_vs_positive_percentage.png"

# --- Compute device: CUDA when it is available, otherwise the CPU ---------
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- Pretrained checkpoint and its tokenizer, placed on the chosen device --
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

print("Reading preprocessed dataset...")
wanted_columns = ["product_id", "product_title", "review_body", "star_rating"]
try:
    df = pd.read_csv(infile, sep='\t', usecols=wanted_columns)
except FileNotFoundError:
    print(f"Error: {infile} not found. Please ensure the file exists in the correct directory.")
    sys.exit(1)
except Exception as e:
    print(f"Error reading file: {e}")
    sys.exit(1)


def _stars_to_sentiment(stars):
    """Map a star rating to "Positive" (>= 4), "Negative" (<= 2) or None otherwise."""
    if stars >= 4:
        return "Positive"
    if stars <= 2:
        return "Negative"
    return None


# Ratings that map to None get no sentiment label and are dropped.
df['sentiment_label'] = df['star_rating'].apply(_stars_to_sentiment)
df = df.dropna(subset=['sentiment_label'])
df['label'] = df['sentiment_label'].map({"Positive": 1, "Negative": 0})

# Missing review bodies become empty strings so every row holds a str.
df["review_body"] = df["review_body"].fillna("").astype(str)

print("Analyzing review lengths...")
sample_df = df.sample(10000, random_state=42)


def _token_count(text):
    """Return how many token ids the tokenizer produces for text, special tokens included."""
    return len(tokenizer.encode(text, add_special_tokens=True))


review_lengths = sample_df["review_body"].apply(_token_count)
median_length = review_lengths.median()
p95_length = review_lengths.quantile(0.95)
print(f"Median review length: {median_length}")
print(f"95th percentile: {p95_length}")

# Tokenizer sequence cap and the sizes of the fine-tuning / validation subsets.
max_length = 512
train_samples = 100000
val_samples = 20000

print("Splitting dataset...")
# Fixed-size, label-stratified train/validation subsets with a fixed seed.
split_options = dict(
    train_size=train_samples,
    test_size=val_samples,
    stratify=df['label'],
    random_state=42,
)
train_df, val_df = train_test_split(df, **split_options)
print(f"Training set size: {len(train_df)}, Validation set size: {len(val_df)}")

# Only the review text and its integer label are handed to the datasets library.
model_columns = ['review_body', 'label']
train_dataset = Dataset.from_pandas(train_df[model_columns])
val_dataset = Dataset.from_pandas(val_df[model_columns])

def tokenize_function(examples):
    """Tokenize one batch of reviews for fine-tuning.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(..., batched=True)``;
            only its ``"review_body"`` entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated and
        padded to ``max_length`` tokens.
    """
    # Coerce each entry to str so non-string values cannot reach the tokenizer.
    texts = [str(text) for text in examples["review_body"]]
    # Use the shared max_length setting rather than a second hard-coded 512, so
    # training and inference tokenization cannot drift apart.
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)

def _to_trainer_format(split):
    """Tokenize a split, rename "label" to "labels" and expose the model columns as torch tensors."""
    encoded = split.map(tokenize_function, batched=True)
    encoded = encoded.rename_column("label", "labels")
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return encoded


print("Tokenizing training and validation datasets...")
tokenized_train = _to_trainer_format(train_dataset)
tokenized_val = _to_trainer_format(val_dataset)

def compute_metrics(p: EvalPrediction):
    """Compute classification accuracy for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores along the last
            axis) and ``label_ids`` (the reference class indices).

    Returns:
        dict: ``{"accuracy": fraction of examples whose highest-scoring class
        equals the reference label}``.
    """
    predicted_classes = p.predictions.argmax(-1)
    hits = predicted_classes == p.label_ids
    return {"accuracy": hits.mean()}

# fp16 mixed precision needs a CUDA device. Tie it to the same availability
# check that selects `device`, so the CPU fallback can still build the trainer.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=use_fp16,
    # The datasets already expose only input_ids / attention_mask / labels via
    # set_format, so the Trainer is told not to prune columns itself.
    remove_unused_columns=False,
)

# Fine-tune on the tokenized training split and evaluate on the validation
# split once per epoch, reporting accuracy through compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

print("Fine-tuning the model...")
trainer.train()

# Score the fine-tuned model on the validation split and report both metrics.
eval_results = trainer.evaluate()
val_loss = eval_results['eval_loss']
print(f"Validation Loss: {val_loss:.4f}")
val_accuracy = eval_results['eval_accuracy']
print(f"Validation Accuracy: {val_accuracy:.4f}")

print("Pre-tokenizing all reviews...")
all_texts = df["review_body"].astype(str).tolist()
full_dataset = Dataset.from_dict({"review_body": all_texts})


def tokenize_for_inference(examples):
    """Tokenize one batch of reviews, truncated and padded to max_length tokens.

    Named separately from the training-time tokenize_function so that defining
    it does not silently replace that function.
    """
    return tokenizer(examples["review_body"], padding="max_length", truncation=True, max_length=max_length)


tokenized_full = full_dataset.map(tokenize_for_inference, batched=True)
tokenized_full.set_format("torch", columns=["input_ids", "attention_mask"])

# All tokenization is finished at this point. Declaring the setting explicitly
# before the DataLoader forks its workers stops huggingface/tokenizers from
# printing its "process just got forked" warning once per worker. setdefault
# keeps any value the user has already configured.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

batch_size = 512
# shuffle=False keeps predictions in the same order as the rows of df.
dataloader = DataLoader(tokenized_full, batch_size=batch_size, shuffle=False, num_workers=4)

# Spread inference batches across every visible GPU when there is more than one.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"Using {gpu_count} GPUs for inference")
    model = nn.DataParallel(model)
model = model.to(device)

def classify_sentiment_dataloader(dataloader):
    """Predict a sentiment label for every example served by ``dataloader``.

    Runs the module-level ``model`` on the module-level ``device``.

    Args:
        dataloader: Iterable of batches; each batch must provide
            ``"input_ids"`` and ``"attention_mask"`` tensors.

    Returns:
        list[str]: ``"Negative"`` for predicted class 0 and ``"Positive"`` for
        any other class, one entry per example in iteration order.
    """
    model.eval()
    # Mixed precision is only requested on CUDA. On the CPU fallback autocast is
    # switched off instead of being asked for a device that is not present.
    use_amp = device.type == "cuda"
    results = []
    with torch.inference_mode():
        for batch in tqdm(dataloader, desc="Classifying batches"):
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device)
            }
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
            with torch.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(**inputs)
            # Highest-scoring class per example, moved to host memory as plain ints.
            predictions = torch.argmax(outputs.logits, dim=-1).cpu().tolist()
            results.extend("Negative" if pred == 0 else "Positive" for pred in predictions)
    return results

# Run the classifier over every review and store the labels next to the source rows
print("Classifying all reviews...")
sentiments = classify_sentiment_dataloader(dataloader)
df["predicted_sentiment"] = sentiments

# Keep only the rows that actually received a prediction
has_prediction = df["predicted_sentiment"].notna()
df = df[has_prediction]

# Count predicted sentiments per product. The reindex pins both sentiment
# columns, in a fixed order, even when one of them was never predicted; without
# it the positional rename below fails with a length mismatch.
sentiment_counts = (
    df.groupby(["product_id", "product_title", "predicted_sentiment"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=["Negative", "Positive"], fill_value=0)
)
sentiment_counts.columns = ["Negative_Count", "Positive_Count"]
product_counts = sentiment_counts.reset_index()

# Per-product totals and the share of reviews predicted positive, in percent.
product_counts["Total_Reviews"] = product_counts["Negative_Count"] + product_counts["Positive_Count"]
product_counts["Positive_Percentage"] = (product_counts["Positive_Count"] / product_counts["Total_Reviews"]) * 100

print(f"Saving results to {outfile}...")
product_counts.to_csv(outfile, index=False)
print("Done!")

# The five products with the most reviews, largest first
ranked_by_volume = product_counts.sort_values("Total_Reviews", ascending=False)
top_five = ranked_by_volume.head(5)
report_columns = [
    'product_id',
    'product_title',
    'Negative_Count',
    'Positive_Count',
    'Total_Reviews',
    'Positive_Percentage',
]
print("\nTop 5 products by total reviews:")
print(top_five[report_columns])

print("\nSample of 20 reviews for manual evaluation:")
# Seeded like the other sampling steps so the same reviews come up on every run,
# and capped at the number of available rows so a small dataset cannot raise.
sample_reviews = df.sample(min(20, len(df)), random_state=42)
for index, row in sample_reviews.iterrows():
    # Only the first 100 characters of each review are shown.
    print(f"Review: {row['review_body'][:100]}...")
    print(f"Predicted Sentiment: {row['predicted_sentiment']}, Original Label: {row['sentiment_label']}")
    print()

print("Generating bar chart for top five products...")
top_five_plot = top_five.set_index('product_title')
stacked_counts = top_five_plot[['Negative_Count', 'Positive_Count']]
# One stacked bar per product: negative and positive counts on top of each other.
bar_ax = stacked_counts.plot(kind='bar', stacked=True, figsize=(10, 6))
bar_ax.set_title('Top 5 Products by Total Reviews')
bar_ax.set_xlabel('Product Title')
bar_ax.set_ylabel('Number of Reviews')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(plot_bar)
plt.close()
print(f"Bar chart saved to {plot_bar}")

print("Generating scatter plot for top 1000 products...")
top_1000 = product_counts.sort_values("Total_Reviews", ascending=False).head(1000)
# Review volume (log-scaled x axis) against the share of positive predictions.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(top_1000['Total_Reviews'], top_1000['Positive_Percentage'], alpha=0.5)
scatter_ax.set_title('Total Reviews vs. Positive Percentage')
scatter_ax.set_xlabel('Total Reviews')
scatter_ax.set_ylabel('Positive Percentage')
scatter_ax.set_xscale('log')
scatter_ax.grid(True)
scatter_fig.savefig(plot_scatter)
plt.close(scatter_fig)
print(f"Scatter plot saved to {plot_scatter}")

# Share of all reviews that were predicted positive, in percent.
overall_positive_percentage = (df['predicted_sentiment'] == 'Positive').mean() * 100
print(f"\nOverall positive review percentage: {overall_positive_percentage:.2f}%")

# Product row with the largest positive share (the first one when several tie).
highest_positive = product_counts.loc[product_counts['Positive_Percentage'].idxmax()]
print(f"Product with highest positive percentage: {highest_positive['product_title']} ({highest_positive['Positive_Percentage']:.2f}%)")

# Agreement between the prediction and the star-rating-derived label.
is_correct = df['predicted_sentiment'] == df['sentiment_label']
accuracy = is_correct.mean()
print(f"Overall accuracy on the dataset: {accuracy:.2f}")

# The figure above also scores the rows the model was fine-tuned on. Report the
# rows outside train_df separately so the two numbers are not confused.
held_out = ~df.index.isin(train_df.index)
if held_out.any():
    held_out_accuracy = is_correct[held_out].mean()
    print(f"Accuracy excluding fine-tuning rows: {held_out_accuracy:.2f}")

Using device: cuda
Reading preprocessed dataset...
Analyzing review lengths...


Token indices sequence length is longer than the specified maximum sequence length for this model (716 > 512). Running this sequence through the model will result in indexing errors


Median review length: 37.0
95th percentile: 232.0
Splitting dataset...
Training set size: 100000, Validation set size: 20000
Tokenizing training and validation datasets...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Fine-tuning the model...


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1183,0.146307,0.96165
2,0.2143,0.422624,0.96455
3,0.1564,0.47491,0.9651


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

Validation Loss: 0.4749
Validation Accuracy: 0.9651
Pre-tokenizing all reviews...


Map:   0%|          | 0/16314472 [00:00<?, ? examples/s]

Using 2 GPUs for inference
Classifying all reviews...


Classifying batches:   0%|          | 0/31865 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread

Saving results to combined_out.csv...
Done!

Top 5 products by total reviews:
       product_id                                  product_title  \
12038  B00FAPF5U0                               Candy Crush Saga   
11689  B00E8KLWB4           The Secret Society® - Hidden Mystery   
9428   B00992CF6W                                      Minecraft   
11480  B00DR0PDNE  Google Chromecast HDMI Streaming Media Player   
9771   B009UX2YAC                                 Subway Surfers   

       Negative_Count  Positive_Count  Total_Reviews  Positive_Percentage  
12038            3332           40992          44324            92.482628  
11689            2410           31315          33725            92.853966  
9428             2750           30213          32963            91.657313  
11480            5925           24423          30348            80.476473  
9771              983           28430          29413            96.657940  

Sample of 20 reviews for manual evaluation:
Review: The 