In [7]:
import pandas as pd
import torch
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import os


In [10]:
# --- Configuration ---
input_path = 'stratifiedSentimentData/socialMedia_stratified.csv'
text_column = 'attributes.search_data_fields.all_text'
output_dir = os.path.dirname(input_path)
output_filename = os.path.join(output_dir, 'socialMedia_with_advanced_sentiment.csv')



In [3]:
# --- 1. VADER Sentiment Analysis ---
print("Running VADER analysis...")
vader_analyzer = SentimentIntensityAnalyzer()

def get_vader_scores(text):
    if pd.isna(text) or not str(text).strip():
        return {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    return vader_analyzer.polarity_scores(str(text))

# Apply VADER
data = pd.read_csv(input_path)
data['vader_scores'] = data[text_column].apply(get_vader_scores)

# Extract VADER scores
data['vader_neg'] = data['vader_scores'].apply(lambda x: x['neg'])
data['vader_neu'] = data['vader_scores'].apply(lambda x: x['neu'])
data['vader_pos'] = data['vader_scores'].apply(lambda x: x['pos'])
data['vader_compound'] = data['vader_scores'].apply(lambda x: x['compound'])



Running VADER analysis...


  data = pd.read_csv(input_path)


In [4]:
# --- 2. RoBERTa-Twitter Sentiment Analysis ---
print("Initializing RoBERTa-Twitter...")
try:
    # Load model with error fallback
    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
    roberta = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# Batch processing for efficiency
def analyze_batch(texts, batch_size=8):
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch = texts[i:i+batch_size]
        try:
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                truncation=True,
                max_length=512,  # RoBERTa's 512-token limit
                padding="max_length"
            )
            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model(**inputs)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            
            batch_results = [
                {
                    'label': model.config.id2label[torch.argmax(prob).item()],
                    'score': torch.max(prob).item()
                }
                for prob in probs
            ]
            results.extend(batch_results)
        except Exception as e:
            print(f"Batch failed: {str(e)[:200]}...")
            results.extend([{'label': 'error', 'score': 0}] * len(batch))
    return results

# Apply RoBERTa
texts = data[text_column].fillna("").astype(str).tolist()
data['roberta_raw'] = analyze_batch(texts)

# Extract RoBERTa scores
data['roberta_label'] = data['roberta_raw'].apply(lambda x: x['label'])
data['roberta_score'] = data['roberta_raw'].apply(lambda x: x['score'])

# Map labels to numeric values for comparison
label_map = {'negative': -1, 'neutral': 0, 'positive': 1}
data['roberta_numeric'] = data['roberta_label'].map(label_map).fillna(0)



Initializing RoBERTa-Twitter...


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cpu
Processing batches: 100%|██████████| 4375/4375 [6:22:59<00:00,  5.25s/it]     


In [5]:
# --- Save Results ---
data.to_csv(output_filename, index=False)
print(f"Analysis complete. Results saved to:\n{output_filename}")

Analysis complete. Results saved to:
stratifiedSentimentData\socialMedia_with_advanced_sentiment.csv
