In [19]:
import logging
from collections import Counter

import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared sentiment back-ends, created once at import time and reused by
# every helper in this file.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Lexicon-based scorer.
vader = SentimentIntensityAnalyzer()

# Transformer classifier, configured to return a score for every label
# rather than only the top one.
# NOTE(review): `return_all_scores` appears to be deprecated in recent
# transformers releases in favour of `top_k=None` -- confirm the output
# shape before switching.
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model=model_name,
    return_all_scores=True,
)

def analyze_sentence(sentence):
    """Score one sentence by blending transformer and VADER sentiment.

    The transformer's per-label scores are weighted 0.6 and a VADER-derived
    score is weighted 0.4. The label with the highest blended score wins.

    Args:
        sentence: The text of a single sentence.

    Returns:
        A dict with keys ``'sentence'`` (the input), ``'sentiment'`` (the
        winning label), ``'confidence'`` (the winning blended score) and the
        three blended scores ``'positive'``, ``'negative'``, ``'neutral'``.
    """
    # VADER's compound score runs from -1 to 1; its sign carries polarity.
    vader_score = vader.polarity_scores(sentence)['compound']

    try:
        # truncation=True lets the tokenizer clip inputs that exceed the
        # model's maximum length instead of failing on them.
        result = sentiment_analyzer(sentence, truncation=True)[0]
        trans_scores = {item['label'].lower(): item['score'] for item in result}
    except Exception:
        # Best effort: keep the batch running, but record the failure so a
        # VADER-only score is never silently mistaken for a hybrid one.
        logging.getLogger(__name__).warning(
            "Transformer sentiment failed; using VADER only for: %.80r",
            sentence,
            exc_info=True,
        )
        trans_scores = {'positive': 0, 'negative': 0, 'neutral': 0}

    # Hybrid scoring (adjust weights as needed). The VADER compound score is
    # split into a positive part, a negative part, and a neutral part that
    # grows as the compound score approaches zero.
    # Presumably the model's labels lowercase to 'positive' / 'negative' /
    # 'neutral'; any label that is missing contributes 0.
    hybrid_scores = {
        'positive': 0.6 * trans_scores.get('positive', 0) + 0.4 * max(0, vader_score),
        'negative': 0.6 * trans_scores.get('negative', 0) + 0.4 * max(0, -vader_score),
        'neutral': 0.6 * trans_scores.get('neutral', 0) + 0.4 * (1 - abs(vader_score)),
    }

    label, score = max(hybrid_scores.items(), key=lambda pair: pair[1])
    return {
        'sentence': sentence,
        'sentiment': label,
        'confidence': score,
        **hybrid_scores,
    }

def _empty_transcript_result():
    """Return the all-zero summary for a transcript with no sentences."""
    # Built fresh on every call so callers never share the mutable list.
    return {
        'positive': 0,
        'negative': 0,
        'neutral': 0,
        'avg_score': 0,
        'sentence_count': 0,
        'sentence_details': [],
    }


def analyze_transcript(transcript):
    """Summarise the sentence-level sentiment of a full transcript.

    The transcript is split into sentences with ``sent_tokenize`` and each
    sentence is scored with ``analyze_sentence``.

    Args:
        transcript: The transcript text. Missing values (``None``/NaN) and
            blank strings are treated as empty.

    Returns:
        A dict with:
            ``'positive'``, ``'negative'``, ``'neutral'``: number of
                sentences whose winning label is that sentiment.
            ``'avg_score'``: mean over all sentences of the winning score,
                with neutral sentences counted as 0. Positive and negative
                sentences both add a non-negative value, so this is
                unsigned.
            ``'sentence_count'``: number of sentences analysed.
            ``'sentence_details'``: the per-sentence result dicts (an empty
                list when there is nothing to analyse).
    """
    if pd.isna(transcript) or not str(transcript).strip():
        return _empty_transcript_result()

    results = [analyze_sentence(sentence) for sentence in sent_tokenize(str(transcript))]
    if not results:
        return _empty_transcript_result()

    # Counter returns 0 for labels that never occurred.
    sentiment_counts = Counter(r['sentiment'] for r in results)
    avg_score = np.mean([
        r['confidence'] if r['sentiment'] in ('positive', 'negative') else 0
        for r in results
    ])

    return {
        'positive': sentiment_counts['positive'],
        'negative': sentiment_counts['negative'],
        'neutral': sentiment_counts['neutral'],
        'avg_score': avg_score,
        'sentence_count': len(results),
        'sentence_details': results,  # Optional: store individual sentence results
    }

def analyze_dataframe(df, text_column='text'):
    """Analyze every transcript in a DataFrame and append the results.

    Args:
        df: DataFrame holding one transcript per row.
        text_column: Name of the column containing the transcript text.

    Returns:
        A new DataFrame: ``df`` with its index reset, followed by the
        columns returned by ``analyze_transcript`` plus ``positive_prop``,
        ``negative_prop`` and ``neutral_prop`` (each count divided by
        ``sentence_count``). Rows with zero sentences get NaN proportions
        from the 0/0 division.
        NOTE(review): if ``df`` already has columns with the same names as
        the result columns, the output will contain duplicates -- confirm
        against the input schema.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    results = [
        analyze_transcript(transcript)
        for transcript in tqdm(df[text_column], total=len(df), desc="Analyzing transcripts")
    ]

    if results:
        result_cols = pd.DataFrame(results)
    else:
        # An empty input would otherwise give a column-less frame, and the
        # proportion step below would fail with KeyError.
        result_cols = pd.DataFrame(
            columns=['positive', 'negative', 'neutral', 'avg_score', 'sentence_count'],
            dtype=float,
        )

    # Share of sentences carrying each label.
    for label in ('positive', 'negative', 'neutral'):
        result_cols[f'{label}_prop'] = result_cols[label] / result_cols['sentence_count']

    # Reset the index so the positional result rows line up with the input.
    return pd.concat([df.reset_index(drop=True), result_cols], axis=1)

# Usage: score every transcript in the sample CSV and write the result
# next to it with a "_HYBRID" suffix.
input_path = 'stratifiedSentimentData/stratified_radio_sample_TextBlob.csv'  # Example input path
output_path = input_path.replace('.csv', '_HYBRID.csv')

data = pd.read_csv(input_path)
analyzed_data = analyze_dataframe(data)  # text_column defaults to 'text'

# Save results
analyzed_data.to_csv(output_path, index=False)
print(f"Analysis complete. Results saved to: {output_path}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Analyzing transcripts: 100%|██████████| 624/624 [52:51:20<00:00, 304.94s/it]       


Analysis complete. Results saved to: stratifiedSentimentData/stratified_radio_sample_TextBlob_HYBRID.csv
