In [8]:
import spacy
spacy_nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS
import utility_functions as utils
import importlib
import pandas as pd
from transformers import pipeline, DistilBertTokenizer

importlib.reload(utils)

# Start from the imported STOP_WORDS collection and append extra tokens that
# do not make sense to keep.
custom_stop_words = [
    *STOP_WORDS,
    "ll", "ve", "'em", "em", "ho", "fo", "ah", "de",
]

# Topic Model 0 - Street
# Topic Model 1 - Violence

In [9]:
# Load the working DataFrame from a pickle on local disk; later cells read its
# 'Lyrics', 'Coast' and 'Topic Model Index' columns.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# cannot run elsewhere without editing it. Only unpickle files you trust.
# NOTE(review): the file name suggests the frame may already contain emotion
# columns — confirm, because later cells compute and append them again.
df = pd.read_pickle('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Excel/df_w_emotion.pkl')

In [10]:
# df['Lyrics'] = df['Lyrics'].apply(utils.cleanup)

In [11]:
# Model id shared by the tokenizer and the classifier; kept in one place so
# the two can never point at different checkpoints.
EMOTION_MODEL_NAME = 'bhadresh-savani/distilbert-base-uncased-emotion'

# Tokenizer used by get_sentiment to split long texts into token chunks.
tokenizer = DistilBertTokenizer.from_pretrained(EMOTION_MODEL_NAME)
# top_k=None asks the pipeline for the score of every label instead of only
# the best one; get_sentiment relies on receiving all label/score pairs.
classifier = pipeline("text-classification", model=EMOTION_MODEL_NAME, top_k=None)

def get_sentiment(text):
    """Return the mean emotion scores of ``text`` over overlapping token windows.

    The text is tokenized with the module-level ``tokenizer`` and split into
    windows that overlap by ``chunk_overlap`` tokens. Every window is turned
    back into a string, scored by the module-level ``classifier``, and the
    per-label scores are averaged (unweighted) over all scored windows.

    Args:
        text: The text to score (one song's lyrics in this notebook).

    Returns:
        dict mapping each emotion label to its mean score, or an empty dict
        when ``text`` produces no tokens.
    """
    max_length = 512
    chunk_overlap = 50

    # The classifier re-tokenizes each chunk and adds special tokens, so a
    # window has to leave room for them; otherwise `truncation=True` would
    # silently cut the tail of every full-size chunk.
    window = max_length - tokenizer.num_special_tokens_to_add(pair=False)
    stride = window - chunk_overlap

    tokens = tokenizer.tokenize(text)

    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokens[start:start + window])
        # Stop as soon as a window reaches the end of the text. Stepping
        # further would only yield a short chunk made entirely of tokens the
        # previous window already covered, and that redundant chunk would
        # count as much as a full window in the average.
        if start + window >= len(tokens):
            break

    totals = {}
    num_entries = 0

    for chunk in chunks:
        # Convert tokens back to text, because the pipeline expects raw strings
        chunk_text = tokenizer.convert_tokens_to_string(chunk)

        # truncation stays on as a safety net in case re-tokenizing the chunk
        # text yields more tokens than the window held
        results = classifier(chunk_text, truncation=True, max_length=max_length, padding='max_length')

        # Each result is a list of {'label', 'score'} entries for one input
        for result in results:
            num_entries += 1
            scores = {emotion['label']: emotion['score'] for emotion in result}
            for label, score in scores.items():
                totals[label] = totals.get(label, 0.0) + score

    # Average the summed scores; `totals` is empty when nothing was scored,
    # so no division happens in that case
    return {label: total / num_entries for label, total in totals.items()}


In [12]:
from tqdm.auto import tqdm

# Register `progress_apply` on pandas objects and label the progress bar after
# the work this cell actually performs.
tqdm.pandas(desc="Scoring emotions")
# Run get_sentiment on every row's lyrics; each cell of the new column holds
# the dict that get_sentiment returns.
df['Emotion Scores'] = df['Lyrics'].progress_apply(get_sentiment)

In [13]:
# Expand the per-row score dicts into one column per emotion label. Building
# the frame from a list of dicts in one go avoids the slow row-by-row Series
# construction of `.apply(pd.Series)`; the original index is kept so the
# concat below lines up row for row.
emotion_df = pd.DataFrame(df['Emotion Scores'].tolist(), index=df.index)
emotion_columns = emotion_df.columns.tolist()

# Replace the raw dict column with the expanded columns. Emotion columns that
# already exist in `df` are dropped first so the concat can never produce
# duplicate column labels.
# NOTE(review): this matters if the loaded pickle already carries emotion
# columns from an earlier run — confirm.
df = pd.concat(
    [df.drop(columns=['Emotion Scores', *emotion_columns], errors='ignore'), emotion_df],
    axis=1,
)

In [14]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [15]:
# df.to_pickle('df_w_emotion.pkl')

In [16]:
# Mean score of each emotion per coast, with 'Coast' returned as a regular column.
average_emotions = (
    df.groupby('Coast')[['joy', 'anger', 'sadness', 'fear', 'love', 'surprise']]
    .mean()
    .reset_index()
)

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's white-grid style before the figure is created.
sns.set_style("whitegrid")

# One explicit figure/axes pair for the grouped bar chart.
fig, ax = plt.subplots(figsize=(12, 6))

# Melt to long form (one row per coast/emotion pair) and draw the bars,
# grouped by emotion and coloured by coast.
sns.barplot(
    data=average_emotions.melt(id_vars=['Coast']),
    x='variable',
    y='value',
    hue='Coast',
    palette={"east_coast": "red", "west_coast": "blue"},
    ax=ax,
)
ax.set_title('Comparison of Average Emotion Scores between East Coast and West Coast')
ax.set_xlabel('Emotions')
ax.set_ylabel('Average Score')
ax.legend(title='Coast')

# Write the chart to disk before displaying it.
fig.savefig('avg_emotions_per_coast.png')
plt.show()

In [19]:
# Split the rows by their 'Topic Model Index' value (0 and 1).
df_w_topics_0 = df.loc[df['Topic Model Index'].eq(0)]
df_w_topics_1 = df.loc[df['Topic Model Index'].eq(1)]

In [20]:
# Mean score of each emotion per coast, computed separately for each topic subset.
average_emotions_0 = (
    df_w_topics_0.groupby('Coast')[['joy', 'anger', 'sadness', 'fear', 'love', 'surprise']]
    .mean()
    .reset_index()
)
average_emotions_1 = (
    df_w_topics_1.groupby('Coast')[['joy', 'anger', 'sadness', 'fear', 'love', 'surprise']]
    .mean()
    .reset_index()
)

In [21]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's white-grid style before the figure is created.
sns.set_style("whitegrid")

# One explicit figure/axes pair for the grouped bar chart.
fig, ax = plt.subplots(figsize=(12, 6))

# Long-form averages of the topic-0 subset, grouped by emotion and coloured by coast.
sns.barplot(
    data=average_emotions_0.melt(id_vars=['Coast']),
    x='variable',
    y='value',
    hue='Coast',
    palette={"east_coast": "red", "west_coast": "blue"},
    ax=ax,
)
ax.set_title('Comparison of Average Emotion Scores between East Coast and West Coast for the 0 label Topic Model')
ax.set_xlabel('Emotions')
ax.set_ylabel('Average Score')
ax.legend(title='Coast')

plt.show()

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's white-grid style before the figure is created.
sns.set_style("whitegrid")

# One explicit figure/axes pair for the grouped bar chart.
fig, ax = plt.subplots(figsize=(12, 6))

# Long-form averages of the topic-1 subset, grouped by emotion and coloured by coast.
sns.barplot(
    data=average_emotions_1.melt(id_vars=['Coast']),
    x='variable',
    y='value',
    hue='Coast',
    palette={"east_coast": "red", "west_coast": "blue"},
    ax=ax,
)
ax.set_title('Comparison of Average Emotion Scores between East Coast and West Coast for the 1 label Topic Model')
ax.set_xlabel('Emotions')
ax.set_ylabel('Average Score')
ax.legend(title='Coast')

plt.show()

In [165]:
# Coherent multimodal analysis

In [191]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

In [192]:
# Emotion score columns of every row; this is the input to the t-SNE projection.
emotion_data = df.loc[:, emotion_columns]

In [198]:
# Project the emotion scores to two dimensions with t-SNE (fixed seed so the
# layout is repeatable).
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(emotion_data.values)

# Create a DataFrame with t-SNE results
tsne_df = pd.DataFrame(tsne_result, columns=['t-SNE1', 't-SNE2'])
# Attach the coast by position. The t-SNE rows follow the row order of `df`,
# but `tsne_df` has a fresh 0..n-1 index; passing `df['Coast']` as a Series
# would be aligned on index labels and could mislabel points if `df` does not
# have that same default index.
tsne_df['Coast'] = df['Coast'].to_numpy()

# Record the highest-scoring emotion of every row on `df`. It is not used by
# the plot below, which is coloured by coast.
df['predominant_emotion'] = emotion_df.idxmax(axis=1)

# Plotting
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=tsne_df,
    x='t-SNE1',
    y='t-SNE2',
    hue='Coast',
    palette={'east_coast': 'red', 'west_coast': 'blue'},
    ax=ax,
)
ax.set_title('2D Visualization of Songs Based on Emotion Analysis')
ax.set_xlabel('TSNE-1')
ax.set_ylabel('TSNE-2')
# The points are coloured by coast, so the legend is titled accordingly.
ax.legend(title='Coast')
plt.show()

In [185]:
# Rows whose 'Topic Model Index' is 0, renumbered with a fresh 0..n-1 index.
df_0 = df.loc[df['Topic Model Index'].eq(0)].reset_index(drop=True)

In [186]:
# Emotion score columns of the topic-0 subset; input to the next t-SNE projection.
emotion_data = df_0.loc[:, emotion_columns]

In [199]:
# Project the emotion scores of the topic-0 subset to two dimensions with
# t-SNE (fixed seed so the layout is repeatable).
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(emotion_data.values)

# Create a DataFrame with t-SNE results
tsne_df = pd.DataFrame(tsne_result, columns=['t-SNE1', 't-SNE2'])
# Attach the coast by position so the colouring does not depend on the index
# labels of `df_0` matching the fresh 0..n-1 index of `tsne_df`.
tsne_df['Coast'] = df_0['Coast'].to_numpy()

# Record the highest-scoring emotion of every row on the full `df`. It is not
# used by the plot below, which is coloured by coast.
df['predominant_emotion'] = emotion_df.idxmax(axis=1)

# Plotting
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=tsne_df,
    x='t-SNE1',
    y='t-SNE2',
    hue='Coast',
    palette={'east_coast': 'red', 'west_coast': 'blue'},
    ax=ax,
)
ax.set_title('2D Visualization of Songs Based on Emotion Analysis')
ax.set_xlabel('TSNE-1')
ax.set_ylabel('TSNE-2')
# The points are coloured by coast, so the legend is titled accordingly.
ax.legend(title='Coast')
plt.show()

In [200]:
# Rows whose 'Topic Model Index' is 1, renumbered with a fresh 0..n-1 index.
df_1 = df.loc[df['Topic Model Index'].eq(1)].reset_index(drop=True)

In [201]:
# Emotion score columns of the topic-1 subset; input to the next t-SNE projection.
emotion_data = df_1.loc[:, emotion_columns]

In [202]:
# Project the emotion scores of the topic-1 subset to two dimensions with
# t-SNE (fixed seed so the layout is repeatable).
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(emotion_data.values)

# Create a DataFrame with t-SNE results
tsne_df = pd.DataFrame(tsne_result, columns=['t-SNE1', 't-SNE2'])
# Attach the coast by position so the colouring does not depend on the index
# labels of `df_1` matching the fresh 0..n-1 index of `tsne_df`.
tsne_df['Coast'] = df_1['Coast'].to_numpy()

# Record the highest-scoring emotion of every row on the full `df`. It is not
# used by the plot below, which is coloured by coast.
df['predominant_emotion'] = emotion_df.idxmax(axis=1)

# Plotting
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=tsne_df,
    x='t-SNE1',
    y='t-SNE2',
    hue='Coast',
    palette={'east_coast': 'red', 'west_coast': 'blue'},
    ax=ax,
)
ax.set_title('2D Visualization of Songs Based on Emotion Analysis')
ax.set_xlabel('TSNE-1')
ax.set_ylabel('TSNE-2')
# The points are coloured by coast, so the legend is titled accordingly.
ax.legend(title='Coast')
plt.show()