# Valence Aware Dictionary and sEntiment Reasoner

In [1]:
# Author: bbaasan
# File: vader.ipynb
# Created: 2023-09-13
# Email: bbaasan@gmu.edu
# Purpose: adding sentiment score to the data

In [3]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
# Load the pickled object stored at ../data/Aggregated.pkl; later cells treat it
# as a DataFrame (len(), .iloc slicing, a 'transcript' column).
# NOTE(review): unpickling can execute arbitrary code, so only load this file if
# it comes from a trusted source.
aggregated = pd.read_pickle('../data/Aggregated.pkl')

In [22]:
# One module-level VADER analyzer; the scoring functions in this notebook read
# this global instead of building their own instance.
vader_analyzer = SentimentIntensityAnalyzer()

def apply_vader_sentiment(dataframe: pd.DataFrame=None):
    """Score each row's transcript with VADER and collect the polarity shares.

    For every row, the items of its ``transcript`` entry are stripped of
    surrounding whitespace, joined with single spaces, and the resulting string
    is passed to ``vader_analyzer.polarity_scores``. A progress line is printed
    for every fifth row.

    Args:
        dataframe: Frame with a ``transcript`` column.
            NOTE(review): presumably each transcript is a list of text lines;
            confirm, because a plain string would be iterated per character.

    Returns:
        A DataFrame with ``neg``, ``neu`` and ``pos`` columns holding one row per
        input row, in input order.
    """
    negatives, neutrals, positives = [], [], []
    for position, record in enumerate(dataframe.to_dict('records')):
        if position % 5 == 0:
            print(f'working on the: {position}')
        # Strip every piece (this drops trailing newline characters) and rebuild
        # the transcript as one space-separated string.
        joined_text = ' '.join(piece.strip() for piece in record['transcript'])
        scores = vader_analyzer.polarity_scores(joined_text)
        negatives.append(scores['neg'])
        neutrals.append(scores['neu'])
        positives.append(scores['pos'])

    return pd.DataFrame({'neg': negatives, 'neu': neutrals, 'pos': positives})

In [25]:
import pandas as pd
import threading

# Assume you have vader_analyzer initialized already
# You can also import vader_analyzer from a module

# Define your apply_vader_sentiment function
def apply_vader_sentiment(dataframe: pd.DataFrame, thread_id, num_threads):
    """Score every row of one thread's slice of the data with VADER.

    The driver code in this cell hands each thread its own ``iloc`` slice of the
    full frame, so the slice is already this thread's share of the work. An
    earlier version additionally skipped rows where
    ``idx % num_threads != thread_id``; combined with the slicing, that left
    most rows of every slice unscored. Every row of ``dataframe`` is now scored.

    Args:
        dataframe: The rows this thread should score; needs a ``transcript``
            column.
            NOTE(review): presumably each transcript is a list of text lines;
            confirm, because a plain string would be iterated per character.
        thread_id: Label used in the progress messages.
        num_threads: Unused; kept so existing calls with three arguments keep
            working.

    Returns:
        A DataFrame with ``neg``, ``neu`` and ``pos`` columns holding one row per
        row of ``dataframe``, in the same order.
    """
    output_dict = {'neg': [], 'neu': [], 'pos': []}
    for idx, row in enumerate(dataframe.to_dict('records')):
        if idx % 5 == 0:
            print(f'Thread {thread_id}: working on row {idx}')
        # Strip each piece (drops trailing newlines) and rebuild one string.
        single_sentence = ' '.join(line.strip() for line in row['transcript'])
        sentiment_scores = vader_analyzer.polarity_scores(single_sentence)
        for key in output_dict:
            output_dict[key].append(sentiment_scores[key])
    return pd.DataFrame(output_dict)

# Define the number of threads to use
num_threads = 4

# Calculate the chunk size for each thread (the last thread also takes the remainder)
chunk_size = len(aggregated) // num_threads

# Create a list to store the results from each thread
results = [None] * num_threads


def _score_chunk(slot, chunk):
    """Score one chunk and store the outcome in that thread's slot of ``results``."""
    results[slot] = apply_vader_sentiment(chunk, slot, num_threads)


# Create and start the threads
threads = []
for i in range(num_threads):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size if i < num_threads - 1 else None
    sub_df = aggregated.iloc[start_idx:end_idx]
    # Hand the slice over through args so it is bound when the thread is created.
    # The previous lambda looked sub_df up only when it ran, so a thread that
    # started late could score the slice assigned by a later loop iteration.
    thread = threading.Thread(target=_score_chunk, args=(i, sub_df))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()

# An exception inside a thread leaves its slot as None, and pd.concat silently
# drops None entries, so fail loudly instead of returning a partial result.
failed_slots = [slot for slot, result in enumerate(results) if result is None]
if failed_slots:
    raise RuntimeError(f'sentiment scoring failed in thread(s): {failed_slots}')

# Combine the per-thread results in chunk order
combined_result = pd.concat(results, ignore_index=True)

# Now, combined_result contains the sentiment analysis results for the entire DataFrame


Thread 3: working on row 0
Thread 2: working on row 0
Thread 1: working on row 0
Thread 0: working on row 0
Thread 1: working on row 5
Thread 3: working on row 5
Thread 1: working on row 10
Thread 2: working on row 5
Thread 0: working on row 5
Thread 3: working on row 10
Thread 2: working on row 10
Thread 3: working on row 15
Thread 1: working on row 15
Thread 0: working on row 10
Thread 0: working on row 15
Thread 2: working on row 15
Thread 2: working on row 20
Thread 1: working on row 20
Thread 3: working on row 20
Thread 0: working on row 20
Thread 2: working on row 25
Thread 1: working on row 25
Thread 3: working on row 25
Thread 0: working on row 25
Thread 0: working on row 30
Thread 2: working on row 30
Thread 1: working on row 30
Thread 3: working on row 30
Thread 3: working on row 35
Thread 0: working on row 35
Thread 1: working on row 35
Thread 2: working on row 35
Thread 0: working on row 40
Thread 2: working on row 40
Thread 2: working on row 45
Thread 3: working on row 40


In [42]:
def process_vader_sentiment(dataframe: pd.DataFrame=None, output_path: str='../data/Eleven/vader_sentiment.pkl')->pd.DataFrame:
    """Score each row's transcript with VADER, pickle the scores, and return them.

    For every row, the items of its ``transcript`` entry are stripped, joined
    with single spaces, and scored with ``vader_analyzer.polarity_scores``. A
    progress line is printed every 50 rows.

    The signature always promised a DataFrame, but the function used to fall off
    the end and return ``None``; it now returns the frame it writes.

    Args:
        dataframe: Frame with a ``transcript`` column.
            NOTE(review): presumably each transcript is a list of text lines;
            confirm, because a plain string would be iterated per character.
        output_path: Where the scores are pickled. Defaults to the location that
            was previously hard-coded.

    Returns:
        A DataFrame with ``neg``, ``neu`` and ``pos`` columns holding one row per
        input row, in input order (the same frame that is written to disk).
    """
    output_dict = {'neg':[],'neu':[],'pos':[]}
    for idx, row in enumerate(dataframe.to_dict('records')):
        if idx % 50 == 0:
            print(f'print {idx}')
        # Strip each piece (drops trailing newlines) and rebuild one string.
        single_sentence = ' '.join(line.strip() for line in row['transcript'])
        sentiment_scores = vader_analyzer.polarity_scores(single_sentence)
        for key in output_dict:
            output_dict[key].append(sentiment_scores[key])
    scores = pd.DataFrame(output_dict)
    scores.to_pickle(output_path)
    return scores

In [None]:
# Tokenize with NLTK (you may need to install it first)
from nltk.tokenize import word_tokenize


def _load_word_list(path):
    """Read one entry per line from ``path`` and return them as a lower-cased set.

    Entries are lower-cased because the tokens they are compared with are
    lower-cased too; without this, upper-case list entries could never match.
    Blank lines are ignored.
    NOTE(review): assumes each line holds just the word; if the CSV files carry
    extra columns the lookups below will not match. Confirm the file layout.
    """
    # The with-block closes the file; a bare open(...).read() leaves the handle open.
    with open(path, 'r') as handle:
        return {line.strip().lower() for line in handle if line.strip()}


# Load the Loughran-McDonald sentiment word lists
positive_words = _load_word_list('LoughranMcDonald_Positive.csv')
negative_words = _load_word_list('LoughranMcDonald_Negative.csv')

# Sample text for sentiment analysis
text = "The company reported record-breaking profits and strong revenue growth."

# Convert to lowercase for case-insensitive matching, then split into tokens
tokens = word_tokenize(text.lower())

# Initialize sentiment scores
positive_score = 0
negative_score = 0

# Count tokens found in each list; a token present in both counts as positive only
for token in tokens:
    if token in positive_words:
        positive_score += 1
    elif token in negative_words:
        negative_score += 1