In [2]:
import spacy
spacy_nlp = spacy.load("en_core_web_sm")

import utility_functions as utils
import importlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from spacy.lang.en.stop_words import STOP_WORDS

# Re-import the helper module so edits made to it since the first import take effect.
importlib.reload(utils)

data = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/test.json'

# spaCy's stop words followed by fragment tokens that carry no meaning on their own.
custom_stop_words = [
    *STOP_WORDS,
    "ll", "ve", "'em", "em", "ho", "fo", "ah", "de",
]

In [3]:
# Read the baseline song table from the Excel workbook using the openpyxl engine.
df = pd.read_excel(
    '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Excel/baseline_data.xlsx',
    engine='openpyxl',
)

In [4]:
# Run the shared cleanup helper over every lyric up front.
# (Octis preprocesses the text again later on regardless.)
df['Lyrics'] = df['Lyrics'].apply(func=utils.cleanup)

In [23]:
# Number of songs per (release year, coast) pair
grouped = (
    df.groupby(['Release Year', 'Coast'])
    .size()
    .reset_index(name='Counts')
)

plt.style.use('ggplot')

# One row per year, one column per coast; colours are assigned to the columns in order
color_scheme = ['red', 'blue']
pivot_table = grouped.pivot(index='Release Year', columns='Coast', values='Counts')
ax = pivot_table.plot(kind='bar', figsize=(10, 6), color=color_scheme)

# Title and axis labels
ax.set_title('Counts of Songs by Coast and Year')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count of Songs')
plt.xticks(rotation=0)  # Keep the year labels horizontal

# Legend naming the coasts
ax.legend(title='Coast')

# Write the chart to a PNG file, then render it in the notebook
plt.savefig('song_counts_by_coast_and_year.png')
plt.show()

In [24]:
# Number of songs per coast, as a two-column frame (Coast, Counts)
grouped = (
    df.groupby(['Coast'])
    .size()
    .reset_index(name='Counts')
)

In [25]:
# Display the current `grouped` frame (a bare last expression is rendered as the cell output).
grouped

In [29]:
# Song count for every (coast, artist) pair; one row per distinct artist within a coast
grouped = (
    df.groupby(['Coast', 'Artist'])
    .size()
    .reset_index(name='Counts')
)

In [33]:
# Rows of `grouped` that belong to the east coast
grouped.loc[grouped['Coast'] == 'east_coast']

In [34]:
# Rows of `grouped` that belong to the west coast
grouped.loc[grouped['Coast'] == 'west_coast']

In [43]:
# Song counts per (artist, coast), reshaped to one row per artist and one column per coast
grouped = df.groupby(['Artist', 'Coast']).size().reset_index(name='Counts')
pivot_table = grouped.pivot(index='Artist', columns='Coast', values='Counts')

# Order artists by their overall song count; 'Total' is only a sort key and is
# dropped from the sorted copy before plotting.
pivot_table['Total'] = pivot_table.sum(axis=1)
pivot_table_sorted = (
    pivot_table
    .sort_values(by='Total', ascending=False)
    .drop(columns=['Total'])
)

ax = pivot_table_sorted.plot(kind='bar', figsize=(10, 6), color=['red', 'blue'])

# Title and axis labels
ax.set_title('Counts of Songs by Artist and Coast (Sorted)')
ax.set_xlabel('Artist')
ax.set_ylabel('Count of Songs')
plt.xticks(rotation=90)  # Vertical artist names so they do not overlap

# Legend and layout
ax.legend(title='Coast', loc='upper right')
plt.tight_layout()

# Write the chart to disk with a tight bounding box, then render it
plt.savefig('sorted_song_counts_by_artist_and_coast.png', bbox_inches='tight')
plt.show()

In [5]:
# Character count of each lyric. The vectorized .str.len() accessor returns the
# same counts as apply(len) for strings, but yields NaN for a missing lyric
# instead of raising TypeError.
df['Lyrics_Length'] = df['Lyrics'].str.len()

In [6]:
# Display the DataFrame (a bare last expression is rendered as the cell output).
df

In [8]:
# Overlaid step histograms (with KDE curves) of lyric length, one layer per coast
plt.figure(figsize=(12, 6))
sns.histplot(
    data=df,
    x='Lyrics_Length',
    hue='Coast',
    palette={'east_coast': 'red', 'west_coast': 'blue'},
    multiple="layer",
    element='step',
    kde=True,
    alpha=0.5,  # Semi-transparent so the overlapping layers stay visible
)
plt.xlabel('Lyrics Length')
plt.ylabel('Frequency')
plt.title('Distribution of Lyrics Length by Coast')

# Write the figure to disk, then render it
plt.savefig('lyrics_length_distribution.png')
plt.show()

In [25]:
# Mean lyrics length over the east coast songs
np.mean(df.loc[df['Coast'] == 'east_coast', 'Lyrics_Length'])

In [26]:
# Mean lyrics length over the west coast songs
np.mean(df.loc[df['Coast'] == 'west_coast', 'Lyrics_Length'])

In [40]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources requested here: the 'punkt' tokenizer data and the stop-word corpus
nltk.download('punkt')
nltk.download('stopwords')

# Start from NLTK's English stop words
stop_words = set(stopwords.words('english'))

# Extra filler words to drop from the lyrics. They are kept under their own name
# so that the spaCy-based `custom_stop_words` list built in the setup cell is not
# silently overwritten when this cell runs.
lyric_filler_words = ['yeah', 'oh', 'like', 'just', 'know', 'got', 'gonna', 'na', 'ain', 'cause', 'let', 'get']
stop_words.update(lyric_filler_words)

def tokenize_lyrics(lyrics):
    """Split a lyric string into lowercase word tokens without stop words.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``; only
    purely alphabetic tokens that are not in the module-level ``stop_words``
    set are kept, in their original order.
    """
    kept = []
    for token in word_tokenize(lyrics.lower()):
        # Skip punctuation, numbers and mixed tokens
        if not token.isalpha():
            continue
        # Skip stop words
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every lyric and store the resulting token list in a new column
df['Tokens'] = [tokenize_lyrics(text) for text in df['Lyrics']]

def count_words(df, coast, top_n=10):
    """Return the most frequent tokens among the songs of one coast.

    Args:
        df: DataFrame with a 'Coast' column and a 'Tokens' column holding one
            list of tokens per row.
        coast: Value of the 'Coast' column to select rows by.
        top_n: Number of (token, count) pairs to return; defaults to 10.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first. The list is
        empty when no row matches ``coast``.
    """
    word_counts = Counter()
    # Feed each row's token list straight into the counter. Concatenating the
    # lists with sum(..., []) would copy the growing list for every row.
    for tokens in df.loc[df['Coast'] == coast, 'Tokens']:
        word_counts.update(tokens)
    return word_counts.most_common(top_n)

# Most frequent tokens for each coast, using count_words' default cut-off
east_words, west_words = (
    count_words(df, coast_label) for coast_label in ('east_coast', 'west_coast')
)

print("East Coast Top Words:", east_words)
print("West Coast Top Words:", west_words)

In [41]:
def plot_word_frequencies(word_frequencies, title, color, filename):
    """Draw, save and show a bar chart of word frequencies.

    Args:
        word_frequencies: Iterable of ``(word, frequency)`` pairs, e.g. the
            output of ``Counter.most_common``.
        title: Title of the chart.
        color: Bar colour passed to ``plt.bar``.
        filename: Path the chart is written to as a PNG image.

    Returns:
        None. When ``word_frequencies`` is empty a message is printed and
        nothing is drawn or saved.
    """
    word_frequencies = list(word_frequencies)
    if not word_frequencies:
        # zip(*[]) yields nothing, so the two-name unpacking below would fail
        # with an opaque "not enough values to unpack" ValueError.
        print(f"No word frequencies to plot for '{title}'; skipping {filename}.")
        return

    # Unpack words and their frequencies into two parallel tuples
    words, frequencies = zip(*word_frequencies)

    # Create the figure and draw the bars
    plt.figure(figsize=(10, 5))
    plt.bar(words, frequencies, color=color)
    plt.title(title)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)  # Rotate labels to avoid overlap

    # Save the plot to a file
    plt.savefig(filename, format='png', bbox_inches='tight')

    # Display the plot
    plt.show()

# One chart per coast, each saved to its own PNG file
plot_word_frequencies(
    word_frequencies=east_words,
    title='Top Words on the East Coast',
    color='red',
    filename='east_coast_words_freq.png',
)
plot_word_frequencies(
    word_frequencies=west_words,
    title='Top Words on the West Coast',
    color='blue',
    filename='west_coast_words_freq.png',
)