In [3]:
## Project Setup

import pandas as pd
import re

# Load dataset
df = pd.read_csv('test(in).csv')

# Display first few rows of df
print("DataFrame head:")
print(df.head())

# Display info about df
print("\nDataFrame info:")
print(df.info())

DataFrame head:
                                        Subject  \
0                          EnronOptions Update!   
1                                  (No Subject)   
2  Phone Screen  Interview - Shannon L. Burnham   
3                         RE: My new work email   
4                                           Bet   

                                                body       date  \
0  EnronOptions Announcement\n\n\nWe have updated...  5/10/2010   
1  Marc,\n\nUnfortunately, today is not going to ...  7/29/2010   
2  When: Wednesday, June 06, 2001 10:00 AM-11:00 ...  7/25/2011   
3  we were thinking papasitos (we can meet somewh...  3/25/2010   
4  Since you never gave me the $20 for the last t...  5/21/2011   

                      from  
0     sally.beck@enron.com  
1      eric.bass@enron.com  
2     sally.beck@enron.com  
3  johnny.palmer@enron.com  
4  lydia.delgado@enron.com  

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data colu

In [5]:
## Task 1: Sentiment Labeling
# Automatically label each message as positive, negative, or neutral

# Define function get sentiment of message using LLM by using negation and intensity considerations
def get_sentiment(message):
    message = str(message).lower()

    # Define sentiment lexicons
    positive_words = ['great', 'good', 'excellent', 'happy', 'positive', 'successful', 'appreciate', 'thank', 'thanks',
                      'wonderful', 'pleasure', 'agree', 'awesome', 'best', 'effective', 'improve', 'support', 'resolve',
                      'progress', 'opportunity', 'strong', 'confident', 'efficient', 'benefit', 'forward']
    negative_words = ['bad', 'poor', 'negative', 'unsuccessful', 'concern', 'issue', 'problem', 'difficult', 'unfortunately',
                      'unable', 'deny', 'cancelled', 'delay', 'crisis', 'error', 'failed', 'failure', 'stress', 'terrible',
                      'trouble', 'wrong', 'reject', 'decline', 'risk', 'disappoint', 'complain', 'frustrate', 'worry']
    negation_words = ['not', 'no', 'never', 'none', 'nor', 'hardly', 'barely', 'scarcely', 'don\'t', 'doesn\'t', 'didn\'t',
                      'isn\'t', 'aren\'t', 'wasn\'t', 'weren\'t', 'haven\'t', 'hasn\'t', 'hadn\'t', 'won\'t', 'wouldn\'t',
                      'can\'t', 'couldn\'t', 'shouldn\'t', 'mightn\'t', 'mustn\'t']
    intensity_words = {'very': 2.0, 'extremely': 2.5, 'really': 1.8, 'quite': 0.8, 'slightly': 0.5}

    # Tokenize words
    # re.findall(pattern, string) finds all non-overlapping matches of a pattern in a string, returns as list of strings
    # r'' - raw string
    # \b - word boundary
    # \w+ - one or more word characters
    words = re.findall(r'\b\w+\b', message)
    
    score = 0
    i = 0
    while i < len(words):
        word = words[i]

        # Initialize modifiers for the current word
        current_multiplier = 1.0
        # Flag to negate 'next' sentiment word
        negate_next = False
        
        # check for negation words
        if word in negation_words:
            negate_next = True
            # Consume negation word, move to next word to apply negation
            i += 1
            # Check if at end of message
            if i >= len(words):
                break
            # Get next word that might be intensified
            word = words[i]
        if word in intensity_words:
            current_multiplier = intensity_words[word]
            # Consume intensity word, move to next word to apply
            i += 1
            if i>= len(words):
                break
            word = words[i]
        # Process sentiment of word after considering modifiers
        if word in positive_words:
            sentiment_value = 1
        elif word in negative_words:
            sentiment_value = -1
        else:
            sentiment_value = 0
        # Apply negation
        if negate_next:
            sentiment_value *= -1
        # Apply intensity
        sentiment_value *= current_multiplier
        score += sentiment_value
        i += 1
    if score > 0:
        return 'Positive'
    elif score < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [9]:
## Main for Task 1
# 1. Load dataset
try:
    df = pd.read_csv('test(in).csv')
except FileNotFoundError:
    print("Error: 'test(in).csv' not found")
    exit()
# 2. Augment dataset with additional sentiment column
df['Sentiment'] = df['body'].apply(get_sentiment)
# 3. Display first few rows
print(df.head())
# 4. Display distribution of sentiment labels
print(df['Sentiment'].value_counts())
# 5. Save augmented dataframe to new csv file
df.to_csv('test_augmented.csv', index=False)

                                        Subject  \
0                          EnronOptions Update!   
1                                  (No Subject)   
2  Phone Screen  Interview - Shannon L. Burnham   
3                         RE: My new work email   
4                                           Bet   

                                                body       date  \
0  EnronOptions Announcement\n\n\nWe have updated...  5/10/2010   
1  Marc,\n\nUnfortunately, today is not going to ...  7/29/2010   
2  When: Wednesday, June 06, 2001 10:00 AM-11:00 ...  7/25/2011   
3  we were thinking papasitos (we can meet somewh...  3/25/2010   
4  Since you never gave me the $20 for the last t...  5/21/2011   

                      from Sentiment  
0     sally.beck@enron.com   Neutral  
1      eric.bass@enron.com   Neutral  
2     sally.beck@enron.com   Neutral  
3  johnny.palmer@enron.com   Neutral  
4  lydia.delgado@enron.com   Neutral  
Sentiment
Neutral     1326
Positive     797
Negative    