<a href="https://colab.research.google.com/github/ZachryRamkissoon/COMP-3610-Project/blob/main/COMP_3610_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [1]:
import pandas as pd
import re
import unicodedata

#Data Collection
Load the first CSV file containing years 2000-2024

In [2]:
try:
    df_spotify = pd.read_csv('billboard_24years_lyrics_spotify.csv', encoding='utf-8')
except UnicodeDecodeError:
    df_spotify = pd.read_csv('billboard_24years_lyrics_spotify.csv', encoding='latin1')

Load the second CSV file containing years 1964-2015

In [3]:
try:
    df_1964_2015 = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='utf-8')
except UnicodeDecodeError:
    df_1964_2015 = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='latin1')

#Data Preprocessing

Filter for years 2016 to 2024

In [4]:
df_spotify_filtered = df_spotify[df_spotify['year'].between(2016, 2024)].copy()

Remove duplicate records based on all columns

In [5]:
df_spotify_filtered = df_spotify_filtered.drop_duplicates()

Function to clean lyrics

In [6]:
def clean_lyrics_ascii(text):
    if pd.isna(text):  # Handle missing values
        return ''
    text = text.lower() # Convert to lowercase

    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove non-ASCII characters

    text = re.sub(r'\s*\'\s*', '', text) # Remove apostrophes and merge contractions

    text = re.sub(r'[^a-z0-9\s]', ' ', text) # Remove all punctuation except spaces, keep letters and numbers

    text = re.sub(r'\d+embed$', '', text) # Remove number followed by "embed" at the end

    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with a single space and remove leading/trailing whitespace

    return text

df_spotify_filtered['lyrics_cleaned'] = df_spotify_filtered['lyrics'].apply(clean_lyrics_ascii)

Select and rename columns for consistency

In [7]:
df_spotify_cleaned = df_spotify_filtered[['song', 'band_singer', 'year', 'ranking', 'lyrics_cleaned']]

df_spotify_cleaned = df_spotify_cleaned.rename(columns={
    'band_singer': 'artist'
})

Rename columns to match the first dataset

In [8]:
df_1964_2015 = df_1964_2015.rename(columns={
    'Song': 'song',
    'Artist': 'artist',
    'Year': 'year',
    'Rank': 'ranking',
    'Lyrics': 'lyrics_cleaned'
})

Select only the required columns

In [9]:
df_1964_2015_cleaned = df_1964_2015[['song', 'artist', 'year', 'ranking', 'lyrics_cleaned']]

Combine the datasets

In [10]:
df_combined = pd.concat([df_1964_2015_cleaned, df_spotify_cleaned], ignore_index=True)

Remove duplicates across the combined dataset (based on all columns)

In [11]:
df_combined = df_combined.drop_duplicates()

Inspect the first few rows of the combined DataFrame

In [12]:
print("Combined Cleaned DataFrame (First 5 rows):")
print(df_combined.head())
print("\nOriginal vs Cleaned Lyrics (First 5 rows from 2016-2024 data):")
for i in range(min(5, len(df_spotify_filtered))):
    print(f"Song: {df_spotify_filtered['song'].iloc[i]} (Year: {df_spotify_filtered['year'].iloc[i]}, Rank: {df_spotify_filtered['ranking'].iloc[i]})")
    print(f"Original: {df_spotify_filtered['lyrics'].iloc[i]}")
    print(f"Cleaned: {df_spotify_filtered['lyrics_cleaned'].iloc[i]}")
    print("-" * 50)

Combined Cleaned DataFrame (First 5 rows):
                                       song                         artist  \
0                               wooly bully  sam the sham and the pharaohs   
1  i cant help myself sugar pie honey bunch                      four tops   
2                i cant get no satisfaction             the rolling stones   
3                       you were on my mind                        we five   
4              youve lost that lovin feelin         the righteous brothers   

   year  ranking                                     lyrics_cleaned  
0  1965        1  sam the sham miscellaneous wooly bully wooly b...  
1  1965        2   sugar pie honey bunch you know that i love yo...  
2  1965        3                                                     
3  1965        4   when i woke up this morning you were on my mi...  
4  1965        5   you never close your eyes anymore when i kiss...  

Original vs Cleaned Lyrics (First 5 rows from 2016-2024 data):
Song

Save the combined cleaned data to a new CSV file

In [13]:
df_combined.to_csv('billboard_1964_2024_lyrics_cleaned.csv', index=False)
print("Combined cleaned data saved to 'billboard_1964_2024_lyrics_cleaned.csv'")
print(f"Number of unique records: {len(df_combined)}")

Combined cleaned data saved to 'billboard_1964_2024_lyrics_cleaned.csv'
Number of unique records: 6280


A. Basic Statistical Features
Length of lyrics (word count, character count)

Unique word count (lexical richness)

Average word length

Stopword ratio (percentage of common words like "the", "and")

In [14]:
#Length of lyrics (word count, character count) for each song
df_combined['word_count'] = df_combined['lyrics_cleaned'].apply(lambda x: len(str(x).split()))
df_combined['char_count'] = df_combined['lyrics_cleaned'].apply(lambda x: len(str(x)))
print(df_combined.head())



                                       song                         artist  \
0                               wooly bully  sam the sham and the pharaohs   
1  i cant help myself sugar pie honey bunch                      four tops   
2                i cant get no satisfaction             the rolling stones   
3                       you were on my mind                        we five   
4              youve lost that lovin feelin         the righteous brothers   

   year  ranking                                     lyrics_cleaned  \
0  1965        1  sam the sham miscellaneous wooly bully wooly b...   
1  1965        2   sugar pie honey bunch you know that i love yo...   
2  1965        3                                                      
3  1965        4   when i woke up this morning you were on my mi...   
4  1965        5   you never close your eyes anymore when i kiss...   

   word_count  char_count  
0         125         662  
1         204         995  
2           0       

In [15]:
#Unique word count (lexical richness)
from collections import Counter

In [16]:
#df_combined['unique_words'] = df_combined['lyrics_cleaned'].apply(lambda x: Counter(str(x).split()))
df_combined['unique_words'] = df_combined['lyrics_cleaned'].apply(
    lambda x: [word for word, count in Counter(str(x).split()).items() if count == 1]
)
print(df_combined['unique_words'].head())

0    [miscellaneous, pharaohs, domingo, samudio, un...
1    [elsein, out, life, go, leaving, just, picture...
2                                                   []
3    [so, went, the, corner, came, home, again, mii...
4    [never, close, anymore, kiss, lips, tenderness...
Name: unique_words, dtype: object


In [17]:
#Average word length
import numpy as np
#df_combined['avg_word_length'] = np.mean(df_combined['strings'].apply(lambda x: len([words for words in x.split(" ") if isinstance(x, str)])))
df_combined['avg_word_length'] = df_combined['lyrics_cleaned'].apply(
    lambda x: np.mean([len(word) for word in str(x).split()]) if x else 0
)
print(df_combined['avg_word_length'].head(10))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0    4.280000
1    3.872549
2         NaN
3    3.546053
4    4.051724
5    4.573222
6    3.780702
7    3.888372
8    3.979730
9    3.450980
Name: avg_word_length, dtype: float64


In [18]:
#Stopword ratio (percentage of common words like "the", "and")
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df_combined['stopword_ratio'] = df_combined['lyrics_cleaned'].apply(
    lambda x: np.round(
        sum(1 for word in str(x).split() if word.lower() in stop_words) / max(len(str(x).split()), 1),
        2  # Round to 2 decimal places
    )
)

print(df_combined['stopword_ratio'].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    0.25
1    0.48
2    0.00
3    0.53
4    0.40
Name: stopword_ratio, dtype: float64


B. Linguistic & Sentiment Features
Sentiment Analysis (positive/negative sentiment score)

Emotion detection (anger, joy, sadness, etc.)

Lexical diversity (ratio of unique words to total words)


In [19]:
#Sentiment Analysis (positive/negative sentiment score)
from nltk.sentiment import SentimentIntensityAnalyzer
#Sentiment Analysis (positive/negative sentiment score)
nltk.download('vader_lexicon')

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment ratios
def sentiment_ratios(text):
    scores = sia.polarity_scores(str(text))  # Get sentiment scores
    total = scores['pos'] + scores['neg'] + scores['neu']  # Total sentiment score sum
    
    if total == 0:  # Avoid division by zero
        return (0, 0)
    
    pos_ratio = scores['pos'] / total  # Positive sentiment ratio
    neg_ratio = scores['neg'] / total  # Negative sentiment ratio
    
    return pos_ratio, neg_ratio

# Apply function to each row
df_combined[['pos_ratio', 'neg_ratio']] = df_combined['lyrics_cleaned'].apply(
    lambda x: pd.Series(sentiment_ratios(x))
)

print(df_combined[['lyrics_cleaned', 'pos_ratio', 'neg_ratio']].head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                      lyrics_cleaned  pos_ratio  neg_ratio
0  sam the sham miscellaneous wooly bully wooly b...   0.020000   0.352000
1   sugar pie honey bunch you know that i love yo...   0.137000   0.159000
2                                                      0.000000   0.000000
3   when i woke up this morning you were on my mi...   0.060939   0.185814
4   you never close your eyes anymore when i kiss...   0.266000   0.085000


In [20]:
print(df_combined.columns)


Index(['song', 'artist', 'year', 'ranking', 'lyrics_cleaned', 'word_count',
       'char_count', 'unique_words', 'avg_word_length', 'stopword_ratio',
       'pos_ratio', 'neg_ratio'],
      dtype='object')


In [None]:
import pandas as pd
import re

df = pd.read_csv('song_mycs_scores_with_lyrics.csv')

df['lyrics'] = df['lyrics'].fillna('')

def clean_lyrics(text):
    if not isinstance(text, str):
        return ""
    
    text = re.sub(r'^.*?(?=\[Intro|\[Verse|\[Chorus|\[Bridge|\[Pre-Chorus|\[Post-Chorus|\[Outro])', '', text, flags=re.DOTALL)

    text = re.sub(r'(\[[^\]]+\])', r'\n\1\n', text)

    text = re.sub(r'(?<=[.!?])\s+(?=\w)', r'\n', text)

    text = re.sub(r'\n{2,}', '\n\n', text)  # prevent excessive blank lines
    text = re.sub(r'[ \t]+', ' ', text)     # collapse extra spaces
    return text.strip()

df['clean_lyrics'] = df['lyrics'].apply(clean_lyrics)

for idx, row in df.head(3).iterrows():  # you can increase the number if you want
    print(f"\n--- Cleaned Lyrics for Song #{idx + 1} ---")
    print(row['clean_lyrics'])
    print("\n" + "-" * 50)
