In [270]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import math

In [271]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryangoel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [272]:
# Load the Excel file
file_path = '/Users/aryangoel/Desktop/UCR/Fall 2024/CS173/CS173-published-sheet.xlsx'
df = pd.read_excel(file_path)

In [273]:
# Remove lexicon columns
relevant_columns = df.columns[1::2]
df_filtered = df[relevant_columns]

In [274]:
def preprocess_sentence(sentence):
    """
    Tokenizes a sentence, removes English stop words, and joins the tokens back into a single string.

    Parameters:
    - sentence (str): The sentence to preprocess, represented as a string.

    Returns:
    - str: The preprocessed sentence, with stop words removed and tokens joined back into a single string.
    """
    
    tokens = re.findall(r'\b\w+\b|[.!?]', sentence)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)  # Join tokens back into a single string for DataFrame storage

# Apply preprocessing to each column in the DataFrame
processed_df = filtered_df.copy()
for col in df_filtered.columns:
    processed_df[col] = df_filtered[col].fillna('').apply(preprocess_sentence)  # Fill NaNs with empty strings

pd.set_option('display.max_colwidth', -1)

In [275]:
def split_dataframe(df, train_size, validation_size, test_size):
    """
    Splits the given DataFrame into training, validation, and testing sets.

    Parameters:
    - df (pd.DataFrame): The DataFrame to split.
    - train_size (int): Number of rows for the training set.
    - validation_size (int): Number of rows for the validation set.
    - test_size (int): Number of rows for the testing set.

    Returns:
    - train (pd.DataFrame): The training set.
    - validation (pd.DataFrame): The validation set.
    - test (pd.DataFrame): The testing set.
    """

    if train_size + validation_size + test_size > len(df):
        raise ValueError("Sum of sizes exceeds DataFrame length.")
    
    train_df = df.iloc[0:train_size]
    validation_df = df.iloc[train_size:train_size + validation_size]
    test_df = df.iloc[train_size + validation_size:train_size + validation_size + test_size]
    
    return train_df, validation_df, test_df

train_df, validation_df, test_df = split_dataframe(processed_df, 29, 10, 10)

In [276]:
def create_emotion_lists(df, emotions):
    """
    Creates separate lists for each emotion by concatenating relevant columns.

    Parameters:
    - df (pd.DataFrame): The dataframe with the emotional sentences.
    - emotions (list): A list of emotion names to create separate lists for.

    Returns:
    - dict: A dictionary where keys are emotion names and values are the corresponding lists of sentences.
    """
    emotion_lists = {emotion: [] for emotion in emotions}

    for emotion in emotions:
        for column in df.columns:
            if emotion in column:
                emotion_lists[emotion].extend(df[column].tolist())

    return emotion_lists

emotions = ['Fear', 'Anger', 'Surprise', 'Disgust', 'Sadness', 'Joy']
train_emotion_lists = create_emotion_lists(train_df, emotions)
test_emotion_lists = create_emotion_lists(test_df, emotions)

In [277]:
# Calculate Prior Probabilities
emotion_counts = {emotion: len([s for s in train_emotion_lists[emotion] if s]) for emotion in emotions}
total_sentence_count = sum(emotion_counts.values())
prior_probabilities = {emotion: emotion_counts[emotion]/total_sentence_count for emotion in emotions}
print(prior_probabilities)

{'Fear': 0.20140515222482436, 'Anger': 0.1358313817330211, 'Surprise': 0.1288056206088993, 'Disgust': 0.13114754098360656, 'Sadness': 0.20140515222482436, 'Joy': 0.20140515222482436}


In [278]:
def word_likelihoods(test_sentence: str, document: list):
    """
    Calculates likelihood probabilities for every word in the test_sentence (uses add-one smoothing)

    Parameters:
    - test_sentence (str): The test sentence with words seperated by spaces.
    - document (list): A list of sentences of a particular emotion.

    Returns:
    - dict: A dictionary where keys are the words from the test sentence and values are the corresponding likelihood probabilities.
    """
    
    words = test_sentence.split()    
    word_count = {}
    unique_words = set()
    total_words = 0
    
    # Count occurrences of each word in the document
    for sentence in document:
        for word in sentence.split():
            total_words += 1
            unique_words.add(word)
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
                
    vocab = len(unique_words)
    
    # Calculate probabilities for each word in the test sentence
    probabilities = {}
    for word in words:
        if word in word_count:
            probabilities[word] = (word_count[word] + 1) / (total_words + vocab)
        else:
            probabilities[word] = 1 / (total_words + vocab)
    
    return probabilities

def calc_likelihood_prob(sentence):
    return {emotion: word_likelihoods(sentence, train_emotion_lists[emotion]) for emotion in emotions}

# Example usage
sentence = "As she hugged her daughter goodbye on the first day of college, she felt both sad to see her go and joyful knowing that she was embarking on a new and exciting chapter in her life."
likelihood_probabilities = calc_likelihood_prob(sentence)
print(likelihood_probabilities)

{'Fear': {'As': 0.000462962962962963, 'she': 0.000462962962962963, 'hugged': 0.000462962962962963, 'her': 0.000462962962962963, 'daughter': 0.000925925925925926, 'goodbye': 0.000462962962962963, 'on': 0.000462962962962963, 'the': 0.000462962962962963, 'first': 0.000925925925925926, 'day': 0.000462962962962963, 'of': 0.000462962962962963, 'college,': 0.000462962962962963, 'felt': 0.006944444444444444, 'both': 0.000462962962962963, 'sad': 0.000462962962962963, 'to': 0.000462962962962963, 'see': 0.0023148148148148147, 'go': 0.000925925925925926, 'and': 0.000462962962962963, 'joyful': 0.000462962962962963, 'knowing': 0.000925925925925926, 'that': 0.000462962962962963, 'was': 0.000462962962962963, 'embarking': 0.000462962962962963, 'a': 0.000462962962962963, 'new': 0.001388888888888889, 'exciting': 0.000462962962962963, 'chapter': 0.000462962962962963, 'in': 0.000462962962962963, 'life.': 0.000462962962962963}, 'Anger': {'As': 0.000779423226812159, 'she': 0.000779423226812159, 'hugged': 0.0

In [279]:
def naive_bayes_probability(emotions, prior_probabilities, likelihood_probabilities):
    """
    Calculates the naive bayes probability for every emotion using the prior and liklihood probabilities

    Parameters:
    - emotions (list): List of all 6 emotions
    - prior_probabilities (dict): A dictionary where keys are the emotions and values are the prior probabilities
    - likelihood_probabilities (dict{dict}): A dictionary of dictionaries where keys are the emotions and values are dictionaries of the likelihood probabilities of each word in the test sentence

    Returns:
    - emotion (str): The emotion with the highest naive bayes probability
    """
    
    naive_bayes_probabilities = {}

    for emotion in emotions:
        prior = prior_probabilities[emotion]

        likelihood_product = 0
        
        for word, likelihood in likelihood_probabilities[emotion].items():
            likelihood_product += math.log(likelihood)

        joint_probability = math.log(prior) + likelihood_product
        
        naive_bayes_probabilities[emotion] = joint_probability
        
    print(naive_bayes_probabilities)
    print(naive_bayes_probabilities[highest_emotion])
    
    return max(naive_bayes_probabilities, key=naive_bayes_probabilities.get)

highest_emotion = naive_bayes_probability(emotions, prior_probabilities, likelihood_probabilities)
print(highest_emotion)

{'Fear': -223.74965261287713, 'Anger': -212.02290071402712, 'Surprise': -209.86551571711357, 'Disgust': -210.02665242437368, 'Sadness': -219.27512630278184, 'Joy': -221.7037991294918}
-209.86551571711357
Surprise


In [254]:
# Create confusion matrix
predictions_list = []
matrix_col = []
confusion_matrix_data = {'X': ['Fear', 'Anger', 'Surprise', 'Disgust', 'Sadness', 'Joy'], 'Fear': [0,0,0,0,0,0], 'Anger': [0,0,0,0,0,0], 'Surprise': [0,0,0,0,0,0], 'Disgust': [0,0,0,0,0,0], 'Sadness': [0,0,0,0,0,0], 'Joy': [0,0,0,0,0,0]}
confusion_matrix = pd.DataFrame(confusion_matrix_data)
confusion_matrix
for emotion in emotions:
    for test_sentence in test_emotion_lists[emotion]:
        predictions_list.append(naive_bayes_probability(emotions, prior_probabilities, calc_likelihood_prob(test_sentence)))
    matrix_col.append(predictions_list.count('Fear'))
    matrix_col.append(predictions_list.count('Anger'))
    matrix_col.append(predictions_list.count('Surprise'))
    matrix_col.append(predictions_list.count('Disgust'))
    matrix_col.append(predictions_list.count('Sadness'))
    matrix_col.append(predictions_list.count('Joy'))
    confusion_matrix[emotion] = matrix_col
    predictions_list = []
    matrix_col = []

print(confusion_matrix)
    

          X  Fear  Anger  Surprise  Disgust  Sadness  Joy
0  Fear      1     0      1         1        1        1  
1  Anger     8     8      1         2        8        6  
2  Surprise  11    7      15        6        6        12 
3  Disgust   8     5      3         11       8        8  
4  Sadness   1     0      0         0        6        2  
5  Joy       1     0      0         0        1        1  


In [286]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report  # Import the classification report function

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
print(classification_report(y_test, y_pred))

Number of mislabeled points out of a total 75 points : 4
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       0.88      1.00      0.94        30
           2       1.00      0.83      0.91        24

    accuracy                           0.95        75
   macro avg       0.96      0.94      0.95        75
weighted avg       0.95      0.95      0.95        75

