In [118]:
import concurrent.futures
import logging
import multiprocessing as mp
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import swifter
from tqdm.notebook import tqdm

from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler

from transformers import pipeline
from transformers import BertTokenizer



# Plot defaults for the whole notebook: seaborn's whitegrid theme first,
# then the figure size, so the theme call does not reset the size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': [12, 5]})


In [97]:
# Load the raw review data from the Excel workbook.
df = pd.read_excel(io="./Data/YelpReviews.xlsx")

# Text processing

In [98]:
# Remove the quotes from the review text
df['review_text'] = df['review_text'].str.strip(to_strip='"')

# Length of each (stripped) review in characters
df['text_character_length'] = df['review_text'].str.len()

# Number of whitespace-separated words in each review
df['text_word_count'] = df['review_text'].str.split().str.len()

## Sentiment analysis

In [99]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_pipeline = pipeline("sentiment-analysis",
                              model=model_name,
                              device='mps')
# Count tokens with the tokenizer the sentiment model itself uses. The previous
# BERT 'bert-base-uncased' tokenizer has a different vocabulary, so its counts
# did not reflect how many tokens this RoBERTa model sees for a review.
tokenizer = sentiment_pipeline.tokenizer

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps


Tokens are the smallest units of text that the model can process. They can be words, parts of words, or punctuation marks. For example, the word "tokenization" might be split into "token" and "##ization". This is how language models read text.


In [100]:
def count_tokens(text):
    """Return the number of tokens the module-level `tokenizer` produces for `text`."""
    pieces = tokenizer.tokenize(text)
    return len(pieces)

# One token count per review.
df['token_count'] = df['review_text'].map(count_tokens)

In [132]:
# Send INFO-and-above log records to sentiment_analysis.log, each line
# prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='sentiment_analysis.log',
)

output_file = 'sentiment_results.csv'

# save_result is called from several worker threads at once; this lock makes
# the "does the file need a header?" check and the append one atomic step.
_SAVE_LOCK = threading.Lock()

def save_result(result, idx, output_file=output_file):
    """Append a single result row to the CSV at `output_file`.

    Parameters
    ----------
    result : dict
        Column name -> value for the row (e.g. ``{"label": ..., "score": ...}``).
    idx : hashable
        Index label written as the first CSV column.
    output_file : str
        Destination CSV path. The header row is written only when the file
        does not exist yet or is empty.
    """
    row = pd.DataFrame([result], index=[idx])

    with _SAVE_LOCK:
        needs_header = (
            not os.path.exists(output_file)
            or os.path.getsize(output_file) == 0
        )
        # Always append: on a fresh file this creates it and writes the header.
        row.to_csv(output_file, mode='a', header=needs_header)

def analyze_long_text(text, idx, chunk_size=512):
    """Classify the sentiment of `text` chunk by chunk and persist the result.

    The text is tokenized with the sentiment pipeline's tokenizer, split into
    consecutive chunks, and each chunk is classified separately. The per-chunk
    predictions are then combined into one label.

    Parameters
    ----------
    text : str
        Review text to classify.
    idx : hashable
        Identifier of the review; used as the row index when saving.
    chunk_size : int
        Maximum model input length per chunk, special tokens included.

    Returns
    -------
    tuple
        ``(idx, {"label": ..., "score": ...})``. The same dict is passed to
        ``save_result`` and logged.

    Notes
    -----
    Aggregation: for each label, the scores of the chunks predicted as that
    label are averaged; the label with the strictly highest average wins and
    anything else (including ties and empty text) falls back to "neutral".
    NOTE(review): this ignores how many chunks voted for each label.
    """
    tokenizer = sentiment_pipeline.tokenizer

    # Reserve room for the special tokens the pipeline adds around each chunk,
    # so a full chunk still fits within `chunk_size` model positions.
    usable = max(1, chunk_size - tokenizer.num_special_tokens_to_add())

    tokens = tokenizer.tokenize(text)
    chunks = [tokens[i:i + usable] for i in range(0, len(tokens), usable)]

    results = []
    for chunk in chunks:
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        # Re-tokenizing the rebuilt string is not guaranteed to give the same
        # token count, so truncation is kept on as a safety net.
        prediction = sentiment_pipeline(
            chunk_text, truncation=True, max_length=chunk_size
        )[0]
        results.append(prediction)

    def _mean_score(label):
        """Average score of the chunks predicted as `label` (0 if none)."""
        scores = [r['score'] for r in results if r['label'] == label]
        return sum(scores) / len(scores) if scores else 0

    pos_avg = _mean_score('positive')
    neg_avg = _mean_score('negative')
    neu_avg = _mean_score('neutral')

    if pos_avg > neg_avg and pos_avg > neu_avg:
        result = {"label": "positive", "score": pos_avg}
    elif neg_avg > pos_avg and neg_avg > neu_avg:
        result = {"label": "negative", "score": neg_avg}
    else:
        result = {"label": "neutral", "score": neu_avg}

    # Persist immediately so an interrupted run can resume from the CSV.
    save_result(result, idx)
    logging.info(f"Processed review {idx} with sentiment: {result['label']} (score: {result['score']:.4f})")
    return idx, result

def _load_results(path):
    """Read the results CSV at `path`, or return an empty frame if it is missing."""
    if os.path.exists(path):
        return pd.read_csv(path, index_col=0)
    return pd.DataFrame(columns=['label', 'score'])


def process_reviews(df, num_workers=4, output_file=output_file):
    """Run sentiment analysis on every review not yet present in `output_file`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'review_text' column; its index identifies each review.
    num_workers : int
        Number of worker threads (values below 1 are treated as 1).
    output_file : str
        CSV of previously saved results, used to skip finished reviews and
        read back for the return value.
        NOTE(review): workers save through ``analyze_long_text``, which calls
        ``save_result`` with its own default path, so passing a non-default
        `output_file` here does not redirect where new rows are written.

    Returns
    -------
    pandas.DataFrame
        Contents of `output_file` indexed by review index; an empty frame if
        the file does not exist.
    """
    # Load existing results if any
    if os.path.exists(output_file):
        existing_results = pd.read_csv(output_file, index_col=0)
        processed_indices = set(existing_results.index)
        logging.info(f"Found {len(processed_indices)} existing results")
    else:
        processed_indices = set()

    # Prepare unprocessed reviews
    reviews_to_process = [
        (text, idx) for idx, text in df['review_text'].items()
        if idx not in processed_indices
    ]

    if not reviews_to_process:
        logging.info("All reviews already processed")
        return _load_results(output_file)

    logging.info(f"Processing {len(reviews_to_process)} reviews")

    failures = 0
    with ThreadPoolExecutor(max_workers=max(1, num_workers)) as executor:
        future_to_idx = {
            executor.submit(analyze_long_text, text, idx): idx
            for text, idx in reviews_to_process
        }

        for future in tqdm(
            concurrent.futures.as_completed(future_to_idx),
            total=len(reviews_to_process),
            desc="Processing reviews"
        ):
            # Retrieve the result so worker exceptions are logged, not lost;
            # one bad review should not abort the rest of the run.
            try:
                future.result()
            except Exception:
                failures += 1
                logging.exception(f"Failed to process review {future_to_idx[future]}")

    if failures:
        logging.warning(f"{failures} of {len(reviews_to_process)} reviews failed; re-run to retry them")

    # Return final results
    return _load_results(output_file)

In [135]:
# Leave one CPU free, but never go below one worker: on a single-CPU machine
# cpu_count() - 1 is 0, which ThreadPoolExecutor rejects.
num_workers = max(1, mp.cpu_count() - 1)

# sentiment_results = process_reviews(df, num_workers)



Processing reviews:   0%|          | 0/21761 [00:00<?, ?it/s]

In [142]:
# Reload the saved predictions, order them by review index, and give the
# columns the names used in the main frame.
sentiment_results = (
    pd.read_csv('sentiment_results.csv', index_col=0)
    .sort_index()
    .rename(columns={'label': 'sentiment', 'score': 'sentiment_confidence'})
)
sentiment_results

Unnamed: 0,sentiment,sentiment_confidence
0,positive,0.984932
1,positive,0.871294
2,neutral,0.628548
3,positive,0.986272
4,neutral,0.685562
...,...,...
21761,positive,0.872460
21762,positive,0.806394
21763,positive,0.927083
21764,positive,0.979736


In [None]:
# Drop any sentiment columns left from a previous run of this cell before
# joining, so re-running it does not fail on overlapping column names.
df = df.drop(columns=sentiment_results.columns, errors='ignore').join(sentiment_results)


## Emotions

In [182]:
# Emotion classifier. top_k=None makes the pipeline return a score for every
# label rather than only the best one.
emotion_pipeline = pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
    device='mps',
)

def process_emotions(texts, max_length=512, batch_size=32):
    """Run the emotion classifier over a collection of texts in batches.

    Parameters
    ----------
    texts : iterable of str
        Texts to classify; materialised into a list before the call.
    max_length : int
        Maximum number of tokens per text; longer inputs are truncated.
        The default matches the 512 that was previously hard-coded in the
        call (the old 514 default was never actually used).
    batch_size : int
        Number of texts sent through the model at once.

    Returns
    -------
    Whatever ``emotion_pipeline`` returns for the list of texts.
    """
    return emotion_pipeline(
        list(texts),  # Convert to list for batch processing
        truncation=True,
        padding=True,
        max_length=max_length,
        batch_size=batch_size,
    )

# df.head(100)['review_text'].apply(emotion_pipeline)

Device set to use mps


In [191]:
# Full emotion score breakdown for the first review only.
emotion_pipeline(df['review_text'].head(1).tolist())

[[{'label': 'admiration', 'score': 0.7070626020431519},
  {'label': 'approval', 'score': 0.4086913764476776},
  {'label': 'joy', 'score': 0.08597956597805023},
  {'label': 'neutral', 'score': 0.07182684540748596},
  {'label': 'pride', 'score': 0.024678487330675125},
  {'label': 'relief', 'score': 0.0151624521240592},
  {'label': 'optimism', 'score': 0.01454014889895916},
  {'label': 'caring', 'score': 0.014052682556211948},
  {'label': 'realization', 'score': 0.0136788971722126},
  {'label': 'gratitude', 'score': 0.012014185078442097},
  {'label': 'excitement', 'score': 0.011641857214272022},
  {'label': 'annoyance', 'score': 0.006794773042201996},
  {'label': 'love', 'score': 0.005275102332234383},
  {'label': 'disapproval', 'score': 0.0043527353554964066},
  {'label': 'amusement', 'score': 0.002929864451289177},
  {'label': 'desire', 'score': 0.002471545012667775},
  {'label': 'disappointment', 'score': 0.0020226691849529743},
  {'label': 'anger', 'score': 0.0014279893366619945},
  {

In [184]:
# `results` was not defined in any remaining cell, so this cell failed on a
# fresh kernel. Rebuild it as the top-scoring emotion for each of the first
# five reviews.
# NOTE(review): reconstructed from the shape of the saved output - confirm it
# matches the original intent.
results = [
    max(scores, key=lambda s: s['score'])
    for scores in process_emotions(df['review_text'].head(5))
]
results

[{'label': 'admiration', 'score': 0.7070626616477966},
 {'label': 'admiration', 'score': 0.7006131410598755},
 {'label': 'admiration', 'score': 0.8200393319129944},
 {'label': 'admiration', 'score': 0.8702364563941956},
 {'label': 'joy', 'score': 0.41233178973197937}]