In [9]:
# In case you need to download the stopwords package.
# import nltk
# nltk.download('stopwords')

import spacy
# Load the spaCy pipeline named "en_core_web_sm" once for the whole notebook.
# NOTE(review): spacy_nlp is not referenced in the cells visible here — confirm it is still needed.
spacy_nlp = spacy.load("en_core_web_sm")
import utility_functions as utils
import importlib
import pandas as pd
# Re-import the helper module so that edits made to utility_functions.py are
# picked up without restarting the kernel.
importlib.reload(utils)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abelboros/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<module 'utility_functions' from '/Users/abelboros/Documents/Personal/Thesis/PopMusicInformationRetrieval/utility_functions.py'>

In [11]:
# Load the baseline dataset; its 'Lyrics' column still holds the raw, unprocessed text.
df = pd.read_excel('./Data/excel_data/baseline_data.xlsx', engine='openpyxl')


def _cleanup_as_series(raw_lyrics):
    """Run utils.cleanup on one lyric and wrap its result in a pandas Series."""
    return pd.Series(utils.cleanup(raw_lyrics))


# utils.cleanup does the whole lyrics preprocessing; its result for each row is
# assigned to the 'Tokens' and 'Processed_Lyrics' columns.
df[['Tokens', 'Processed_Lyrics']] = df['Lyrics'].apply(_cleanup_as_series)

In [13]:
# Show the raw and the processed lyrics side by side for the first rows.
df.head()[['Lyrics', 'Processed_Lyrics']]

Unnamed: 0,Lyrics,Processed_Lyrics
0,"\n\nYo\nMy crew is in the house\nTerra, Herb M...",crew house terra herb mcgruff buddah bless big...
1,\n\n[Intro:]\nIt's the number one crew in the ...,number one crew area big lightin nigga incense...
2,\n\n[Verse 1]\nThere are too many MC's who are...,many mcs overrated ask even supposed make rap ...
3,\n\n[Verse 1:]\nLet me get to the point real q...,let get point real quick pocket thick mad chic...
4,\n\n[Big L]\n A yo spark up the phillies and p...,spark phillies pass stout make quick money gri...


In [None]:
# Persist the whole dataframe (including the preprocessed lyrics columns) as a
# pickle file. The path is relative, so preprocessed_df.pkl is written to the
# current working directory.
df.to_pickle('preprocessed_df.pkl')

<b>From here on, we only do some experimental analysis.</b>

In [14]:
# Filter the 'Tokens' column by document frequency.
# NOTE(review): presumably the bounds are fractions of documents (3% .. 85%) —
# confirm against utils.filter_tokens_by_document_frequency.
# NOTE(review): filtered_df is not used by any cell visible here.
filtered_df = utils.filter_tokens_by_document_frequency(
    df,
    column_name='Tokens',
    min_doc_frequency=0.03,
    max_doc_frequency=0.85,
)

In [15]:
def save_unique_tokens(tokens_list, output_file_path):
    """Write every distinct token found in ``tokens_list`` to a text file.

    Args:
        tokens_list: Iterable of token collections (one collection per document).
        output_file_path: Destination file; it is overwritten and written as
            UTF-8 with one token per line, in sorted order.

    Returns:
        None.
    """
    # Collect the vocabulary across all documents; the set drops duplicates.
    vocabulary = set()
    for document_tokens in tokens_list:
        vocabulary.update(document_tokens)

    # Sorted output is easier to review by eye.
    ordered_vocabulary = sorted(vocabulary)

    with open(output_file_path, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{entry}\n" for entry in ordered_vocabulary)

# Despite its name, list_of_unique_tokens holds one token list per row of df
# (duplicates included); the de-duplication happens inside save_unique_tokens.
list_of_unique_tokens = df['Tokens'].tolist()
save_unique_tokens(list_of_unique_tokens, "unique_tokens.txt")

In [16]:
# Number of per-row token lists (i.e. rows of df['Tokens']), not the number of
# distinct tokens: list_of_unique_tokens is built as list(df['Tokens']).
len(list_of_unique_tokens)

1368

<b>Statistics</b>

In [18]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plots the most frequent tokens across all lyrics and saves the figure to
# ./Plots/word-frequency.png.
# Expects df['Tokens'] to hold one list of string tokens per row.

# Step 1: Flatten all the tokens from the 'Tokens' column into a single list
all_tokens = [token for tokens_list in df['Tokens'] for token in tokens_list]

# Step 2: Count the frequency of each token using Counter
token_counts = Counter(all_tokens)

# Step 3: Convert the counts to a DataFrame
token_counts_df = pd.DataFrame(token_counts.items(), columns=['Word', 'Count'])

# Step 4: Sort the words by frequency in descending order and keep the top N.
# kind='stable' makes the order of words with equal counts deterministic, so
# the cut at top_n does not depend on the sort implementation.
top_n = 20  # You can change this to however many words you want to see
token_counts_df = token_counts_df.sort_values(
    by='Count', ascending=False, kind='stable'
).head(top_n)

# Step 5: Plot the most used words and their counts using Seaborn
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; consider hue='Word', legend=False once the minimum seaborn
# version of the project is known.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Count', y='Word', data=token_counts_df, palette='crest', ax=ax)

# Step 6: Add frequency numbers next to each bar
for index, value in enumerate(token_counts_df['Count']):
    # `+1` (in data units) leaves a slight gap between the bar and its label
    ax.text(value + 1, index, str(value), color='black', va='center', fontsize=10)

# Set labels and title
ax.set_xlabel('Frequency')
ax.set_ylabel('Words')
ax.set_title(f'Top {top_n} Most Used Words in Lyrics')
fig.tight_layout()

# Create the output directory if needed so that saving does not fail on a
# fresh checkout, then save the fully decorated figure.
os.makedirs('./Plots', exist_ok=True)
fig.savefig('./Plots/word-frequency.png')
plt.show()

In [17]:
import os

from sklearn.feature_extraction.text import CountVectorizer

# Plots the 20 most frequent bigrams/trigrams across all lyrics and saves the
# figure to ./Plots/n-grams-frequency.png.
# Relies on df, pd, plt and sns being defined by earlier cells.

# Re-join each row's tokens into one whitespace-separated string, because
# CountVectorizer is fed text documents here.
all_text = [' '.join(tokens) for tokens in df['Tokens']]

# Use CountVectorizer to get bigrams and trigrams
# NOTE(review): with its default settings CountVectorizer re-tokenizes the
# joined text (lowercasing, and a token pattern that only keeps tokens of two
# or more word characters), so n-grams may differ from n-grams over the
# original tokens — confirm this is acceptable.
vectorizer = CountVectorizer(ngram_range=(2, 3))
X = vectorizer.fit_transform(all_text)

# Get the top n-grams
# Summing over axis 0 gives the total count per n-gram across all documents;
# .A1 flattens the resulting matrix to a 1-D array.
ngram_counts = X.sum(axis=0).A1
ngram_names = vectorizer.get_feature_names_out()
ngram_df = pd.DataFrame({'ngram': ngram_names, 'count': ngram_counts})
# kind='stable' keeps the order of equally frequent n-grams deterministic.
ngram_df = ngram_df.sort_values(by='count', ascending=False, kind='stable').head(20)

# Plot the top n-grams
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='count', y='ngram', data=ngram_df, palette='viridis', ax=ax)
ax.set_xlabel('Frequency')
ax.set_ylabel('N-Grams')
ax.set_title('Top 20 Most Frequent Bigrams/Trigrams in Lyrics')
fig.tight_layout()

# Save only after the title and layout are applied; saving earlier wrote a PNG
# without the title. Create the output directory first so saving cannot fail
# on a fresh checkout.
os.makedirs('./Plots', exist_ok=True)
fig.savefig('./Plots/n-grams-frequency.png')
plt.show()