In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [1]:
# 1. Data Loading and Preprocessing
import pandas as pd # import the pandas library and assign it to the alias 'pd'
try:
    df = pd.read_csv("product_info.csv", encoding='utf-8')
except FileNotFoundError:
    print("Error: product_info.csv not found.")
    exit()

# Print the actual column names in your DataFrame
print(df.columns)

#remove duplicate reviews
# Replace 'Review Text' with the actual column name from the output above
df.drop_duplicates(subset = ['reviews'], inplace = True, ignore_index=True)

#fill missing Review Text with Review Title
# Replace 'Review Text' and 'Review Title' with the actual column names
df['reviews'].fillna(df['reviews'], inplace = True)

#drop rows with missing Review Text
# Replace 'Review Text' with the actual column name
df.dropna(subset = ['reviews'], inplace = True)

# Ensure the 'reviews' column is of string type before applying string operations
df['reviews'] = df['reviews'].astype(str)  # Convert to string type

# Remove non-breaking space characters
df['reviews'] = df['reviews'].str.replace('\xa0', ' ', regex=False)
df['reviews'] = df['reviews'].str.replace('\u200b', '', regex=False)
df['reviews'] = df['reviews'].str.replace('\u2009', '', regex=False)




Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')


In [10]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [12]:
# Simplified text cleaning (keep it relatively simple for BERT)
import re #Import the 're' module for regular expressions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#Import train_test_split here
from sklearn.model_selection import train_test_split

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Keep punctuation more than before. BERT needs more context
    text = re.sub(r'\n', '', text)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(filtered_sentence)
    return text

df['cleaned_review'] = df['reviews'].apply(clean_text)

# Map sentiment to numerical labels (required for BERT)
def map_sentiment_to_label(sentiment):
    if sentiment == 'positive':
        return 2
    elif sentiment == 'neutral':
        return 1
    elif sentiment == 'negative':
        return 0
    else:
        return None  # Handle unexpected values

#This function uses the existing sentiments to train bert. If you dont have it use a library like VADER to determine it first
def get_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    compound_score = scores['compound']

    if compound_score >= 0.05:
        return "positive"
    elif compound_score <= -0.05:
        return "negative"
    else:
        return "neutral"

#If sentiment isn't available determine it using VADER
df['sentiment'] = df['cleaned_review'].apply(get_sentiment) #apply sentiment after cleaning

df['label'] = df['sentiment'].apply(map_sentiment_to_label)
df = df.dropna(subset=['label'])  # Drop rows with invalid labels


# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['cleaned_review'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)



In [15]:
# 2. Tokenization and Data Preparation for BERT

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

model_name = "bert-base-uncased"  # Or "roberta-base", "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


# Define a PyTorch Dataset
class SkincareDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = SkincareDataset(train_encodings, train_labels)
val_dataset = SkincareDataset(val_encodings, val_labels)



In [16]:
# 3. Model Definition and Training
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 labels: negative, neutral, positive

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=2,              # Number of training epochs (adjust as needed)
    per_device_train_batch_size=16,   # Batch size (adjust based on GPU memory)
    per_device_eval_batch_size=64,    # Eval batch size
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps",       # Evaluate every `logging_steps`
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)

trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
    compute_metrics=compute_metrics
)

trainer.train()



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msakhamuri-bandhavi[0m ([33msakhamuri-bandhavi-solarwinds[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.4726,0.271637,1.0,1.0,1.0,1.0
100,0.0143,0.007843,1.0,1.0,1.0,1.0
150,0.0037,0.002376,1.0,1.0,1.0,1.0


TrainOutput(global_step=156, training_loss=0.3141270015651408, metrics={'train_runtime': 174.2865, 'train_samples_per_second': 14.275, 'train_steps_per_second': 0.895, 'total_flos': 6392833821360.0, 'train_loss': 0.3141270015651408, 'epoch': 2.0})

In [18]:
# 4. Evaluation
print(trainer.evaluate())


# 5. Example Prediction (Optional)
example_text = "This product is amazing! It really helped my skin."
tokenized_example = tokenizer(example_text, truncation=True, padding=True, return_tensors='pt')

# Move tokenized_example to the same device as the model
if torch.cuda.is_available():
    device = torch.device('cuda:0')  # Use the first GPU
    tokenized_example = {k: v.to(device) for k, v in tokenized_example.items()}

with torch.no_grad():
    outputs = model(**tokenized_example)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(predictions).item()  # Get the index of the predicted class

sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Example Text: {example_text}")
print(f"Predicted Sentiment: {predicted_sentiment}")

{'eval_loss': 0.2716365456581116, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 0.2774, 'eval_samples_per_second': 1124.805, 'eval_steps_per_second': 18.026, 'epoch': 2.0}
Example Text: This product is amazing! It really helped my skin.
Predicted Sentiment: positive
