# Step 1: Prepare Aspect–Sentiment Pairs

In [3]:
import pandas as pd
import spacy
from textblob import TextBlob
import nltk
import re
from tqdm import tqdm

tqdm.pandas()
nlp = spacy.load("en_core_web_lg")

# Load cleaned data
df = pd.read_csv("final_extracted_aspects.csv")

# Check if the 'aspect_candidates' column or similar is used in your setup, adjust accordingly
reviews = df['aspect_candidates'].dropna().tolist()

# Define frequent aspects to search for
aspects = ["video", "quiz", "assignment", "teacher", "course", "platform", "content"]


# Step 2: Extract Aspect–Opinion Sentences

In [5]:
def extract_aspect_opinion_pairs(text, aspects):
    doc = nlp(text)
    pairs = []
    for sent in doc.sents:
        for aspect in aspects:
            if aspect in sent.text.lower():
                polarity = TextBlob(sent.text).sentiment.polarity
                if polarity > 0.1:
                    sentiment = 'positive'
                elif polarity < -0.1:
                    sentiment = 'negative'
                else:
                    sentiment = 'neutral'
                pairs.append((aspect, sent.text.strip(), sentiment))
    return pairs

all_pairs = []

for text in tqdm(reviews):
    pairs = extract_aspect_opinion_pairs(text, aspects)
    all_pairs.extend(pairs)

aspect_df = pd.DataFrame(all_pairs, columns=["aspect", "sentence", "sentiment"])
aspect_df.to_csv("aspect_sentiment_dataset.csv", index=False)
aspect_df.head()

100%|██████████| 519886/519886 [48:58<00:00, 176.91it/s]  


Unnamed: 0,aspect,sentence,sentiment
0,course,"['many test take', 'dry able pass complete wat...",positive
1,video,"['text', 'well experience video screen shot sh...",neutral
2,video,"able view slide instructor show', 'video scree...",positive
3,video,['information perfect program little annoying ...,positive
4,quiz,['information perfect program little annoying ...,positive


# Step 3: Model Training – Traditional ML (SVM, Logistic Regression, etc..)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Load dataset
df = pd.read_csv("aspect_sentiment_dataset.csv")

# Vectorize sentences with fewer features
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)
X = tfidf.fit_transform(df['sentence'])
y = df['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE with automatic balancing (up to majority class size)
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Subset for LDA and QDA to manage memory
subset_size = 5000
subset_indices = np.random.choice(X_train_smote.shape[0], subset_size, replace=False)
X_train_subset = X_train_smote[subset_indices].toarray()
y_train_subset = y_train_smote[subset_indices]
X_test_subset = X_test.toarray()

# List of models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": LinearSVC(),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier()
}

# Train and evaluate sparse-compatible models
results = []
for name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred, average='weighted') * 100
    roc_auc = None
    if hasattr(model, 'predict_proba'):
        y_prob = model.predict_proba(X_test)
        roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted') * 100
    results.append({
        "Model": name,
        "Accuracy (%)": acc,
        "F1-Score (Weighted, %)": f1,
        "AUC-ROC (Weighted, %)": roc_auc if roc_auc is not None else "N/A"
    })

# Train LDA and QDA on the subset
for name, model in [("Linear Discriminant Analysis", LinearDiscriminantAnalysis()), 
                    ("Quadratic Discriminant Analysis", QuadraticDiscriminantAnalysis())]:
    model.fit(X_train_subset, y_train_subset)
    y_pred = model.predict(X_test_subset)
    acc = accuracy_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred, average='weighted') * 100
    roc_auc = None
    if hasattr(model, 'predict_proba'):
        y_prob = model.predict_proba(X_test_subset)
        roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted') * 100
    results.append({
        "Model": name,
        "Accuracy (%)": acc,
        "F1-Score (Weighted, %)": f1,
        "AUC-ROC (Weighted, %)": roc_auc if roc_auc is not None else "N/A"
    })

# Display results
results_df = pd.DataFrame(results).sort_values(by="F1-Score (Weighted, %)", ascending=False)
print("\nModel Comparison (Sorted by F1-Score):\n")
print(results_df.to_string(index=False))


Model Comparison (Sorted by F1-Score):

                          Model  Accuracy (%)  F1-Score (Weighted, %) AUC-ROC (Weighted, %)
                  Decision Tree     94.277944               94.397382             93.217173
            Logistic Regression     92.474000               92.902043             98.445365
         Support Vector Machine     91.912317               92.299194                   N/A
                    Naive Bayes     80.235879               83.116727             93.040727
   Linear Discriminant Analysis     70.628511               75.198439             85.404733
Quadratic Discriminant Analysis     48.068277               60.049723             69.533504


# Step 4: Bert-based Model Training (In Progress)

In [2]:
# Step 1: Install required packages
# pip install accelerate -U
# pip install transformers[torch] datasets sklearn

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load your dataset
df = pd.read_csv("aspect_sentiment_dataset.csv")

# Filter to only 3 classes
label_map = {"positive": 2, "neutral": 1, "negative": 0}
df['label'] = df['sentiment'].map(label_map)

# Print dataset info
print(f"Dataset shape: {df.shape}")
print(f"Label distribution: {df['label'].value_counts()}")

# Tokenize data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(
        examples["sentence"], 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df[["sentence", "label"]])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# OPTION 1: If you can install accelerate, use this code block
try:
    from transformers import Trainer, TrainingArguments
    
    # Define custom compute_metrics function
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average='weighted')
        
        return {
            "accuracy": accuracy,
            "f1": f1
        }
    
    # Load BERT model
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", 
        num_labels=3,
        id2label={0: "negative", 1: "neutral", 2: "positive"},
        label2id={"negative": 0, "neutral": 1, "positive": 2}
    )
    
    # Define Trainer with smaller batch size and gradient accumulation
    training_args = TrainingArguments(
        output_dir="./sentiment_results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        gradient_accumulation_steps=2,
        fp16=False,
        report_to="none"
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics
    )
    
    # Train the model
    print("Starting training...")
    trainer.train()
    
    # Evaluate on test set
    print("Evaluating model...")
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")
    
    # Get predictions on test set for detailed analysis
    predictions = trainer.predict(tokenized_dataset["test"])
    preds = np.argmax(predictions.predictions, axis=1)
    labels = predictions.label_ids
    
    model.save_pretrained("./sentiment_bert_model")
    tokenizer.save_pretrained("./sentiment_bert_model")
    print("Model saved to ./sentiment_bert_model")

except ImportError:
    print("Could not use Trainer API. Using manual PyTorch training instead.")
    # OPTION 2: Manual PyTorch training if accelerate cannot be installed
    from torch.utils.data import DataLoader
    from torch.optim import AdamW
    
    # Create dataloaders
    train_dataloader = DataLoader(
        tokenized_dataset["train"], 
        batch_size=8, 
        shuffle=True
    )
    
    eval_dataloader = DataLoader(
        tokenized_dataset["test"], 
        batch_size=8
    )
    
    # Load model
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", 
        num_labels=3
    )
    
    # Setup training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=2e-5)
    num_epochs = 3
    
    # Training loop
    print("Starting manual training loop...")
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for batch in train_dataloader:
            # FIX: Extract labels from batch before moving to device
            labels = batch.pop("label").to(device)
            inputs = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass with labels as a separate parameter
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}, Train loss: {avg_train_loss:.4f}")
        
        # Evaluation
        model.eval()
        predictions = []
        references = []
        eval_loss = 0
        
        with torch.no_grad():
            for batch in eval_dataloader:
                # FIX: Store labels in a separate variable before popping
                label_ids = batch["label"].to(device)
                labels = batch.pop("label").to(device)  # Now pop labels from batch
                inputs = {k: v.to(device) for k, v in batch.items()}
                
                outputs = model(**inputs, labels=labels)
                eval_loss += outputs.loss.item()
                logits = outputs.logits
                
                preds = torch.argmax(logits, dim=-1).cpu().numpy()
                labels_cpu = label_ids.cpu().numpy()  # Use the saved labels
                
                predictions.extend(preds)
                references.extend(labels_cpu)
        
        avg_eval_loss = eval_loss / len(eval_dataloader)
        accuracy = accuracy_score(references, predictions)
        f1 = f1_score(references, predictions, average="weighted")
        
        print(f"Epoch {epoch+1}, Eval loss: {avg_eval_loss:.4f}")
        print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
    
    # Print final evaluation metrics
    print("\nFinal Evaluation Results:")
    print(classification_report(
        references, 
        predictions, 
        target_names=["negative", "neutral", "positive"]
    ))
    
    # Save the model
    model.save_pretrained("./sentiment_bert_model")
    tokenizer.save_pretrained("./sentiment_bert_model")
    print("Model saved to ./sentiment_bert_model")

# Example: How to use your trained model for inference (works with either approach)
def predict_sentiment(text, model_instance, tokenizer_instance):
    model_instance.eval()
    device = next(model_instance.parameters()).device
    inputs = tokenizer_instance(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model_instance(**inputs)
    
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    
    # Map prediction to label
    sentiment_classes = ["negative", "neutral", "positive"]
    return sentiment_classes[predicted_class], probabilities[0].cpu().tolist()

# Example usage
example_texts = [
    "I really love this product, it's amazing!",
    "It's okay, nothing special.",
    "This is terrible, I want my money back."
]

print("\nExample predictions:")
for text in example_texts:
    sentiment, probs = predict_sentiment(text, model, tokenizer)
    print(f"Text: {text}")
    print(f"Predicted sentiment: {sentiment}")
    print(f"Probabilities: {probs}")
    print("-" * 50)

  from .autonotebook import tqdm as notebook_tqdm


Dataset shape: (365861, 4)
Label distribution: label
2    298468
1     57333
0     10060
Name: count, dtype: int64


Map: 100%|██████████| 292688/292688 [03:16<00:00, 1487.99 examples/s]
Map: 100%|██████████| 73173/73173 [00:48<00:00, 1506.52 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: Accelerator.__init__() got an unexpected keyword argument 'dispatch_batches'