Import Libraries

In [1]:
!pip install pandas



In [2]:
import pandas as pd
import re
import unicodedata

#Data Collection
Load the first CSV file containing years 2000-2024

In [3]:
try:
    df_spotify = pd.read_csv('billboard_24years_lyrics_spotify.csv', encoding='utf-8')
except UnicodeDecodeError:
    df_spotify = pd.read_csv('billboard_24years_lyrics_spotify.csv', encoding='latin1')

Load the second CSV file containing years 1964-2015

In [4]:
try:
    df_1964_2015 = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='utf-8')
except UnicodeDecodeError:
    df_1964_2015 = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='latin1')

#Data Preprocessing

Filter for years 2016 to 2024

In [5]:
df_spotify_filtered = df_spotify[df_spotify['year'].between(2016, 2024)].copy()

Remove duplicate records based on all columns

In [6]:
df_spotify_filtered = df_spotify_filtered.drop_duplicates()

Function to clean lyrics

In [7]:
def clean_lyrics_ascii(text):
    if pd.isna(text):  # Handle missing values
        return ''
    text = text.lower() # Convert to lowercase

    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove non-ASCII characters

    text = re.sub(r'\s*\'\s*', '', text) # Remove apostrophes and merge contractions

    text = re.sub(r'[^a-z0-9\s]', ' ', text) # Remove all punctuation except spaces, keep letters and numbers

    text = re.sub(r'\d+embed$', '', text) # Remove number followed by "embed" at the end

    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with a single space and remove leading/trailing whitespace

    return text

df_spotify_filtered['lyrics_cleaned'] = df_spotify_filtered['lyrics'].apply(clean_lyrics_ascii)

Select and rename columns for consistency

In [8]:
df_spotify_cleaned = df_spotify_filtered[['song', 'band_singer', 'year', 'ranking', 'lyrics_cleaned']]

df_spotify_cleaned = df_spotify_cleaned.rename(columns={
    'band_singer': 'artist'
})

Rename columns to match the first dataset

In [9]:
df_1964_2015 = df_1964_2015.rename(columns={
    'Song': 'song',
    'Artist': 'artist',
    'Year': 'year',
    'Rank': 'ranking',
    'Lyrics': 'lyrics_cleaned'
})

Select only the required columns

In [10]:
df_1964_2015_cleaned = df_1964_2015[['song', 'artist', 'year', 'ranking', 'lyrics_cleaned']]

Combine the datasets

In [11]:
df_combined = pd.concat([df_1964_2015_cleaned, df_spotify_cleaned], ignore_index=True)

Remove duplicates across the combined dataset (based on all columns)

In [12]:
df_combined = df_combined.drop_duplicates()

Inspect the first few rows of the combined DataFrame

In [13]:
print("Combined Cleaned DataFrame (First 5 rows):")
print(df_combined.head())
print("\nOriginal vs Cleaned Lyrics (First 5 rows from 2016-2024 data):")
for i in range(min(5, len(df_spotify_filtered))):
    print(f"Song: {df_spotify_filtered['song'].iloc[i]} (Year: {df_spotify_filtered['year'].iloc[i]}, Rank: {df_spotify_filtered['ranking'].iloc[i]})")
    print(f"Original: {df_spotify_filtered['lyrics'].iloc[i]}")
    print(f"Cleaned: {df_spotify_filtered['lyrics_cleaned'].iloc[i]}")
    print("-" * 50)

Combined Cleaned DataFrame (First 5 rows):
                                       song                         artist  \
0                               wooly bully  sam the sham and the pharaohs   
1  i cant help myself sugar pie honey bunch                      four tops   
2                i cant get no satisfaction             the rolling stones   
3                       you were on my mind                        we five   
4              youve lost that lovin feelin         the righteous brothers   

   year  ranking                                     lyrics_cleaned  
0  1965        1  sam the sham miscellaneous wooly bully wooly b...  
1  1965        2   sugar pie honey bunch you know that i love yo...  
2  1965        3                                                     
3  1965        4   when i woke up this morning you were on my mi...  
4  1965        5   you never close your eyes anymore when i kiss...  

Original vs Cleaned Lyrics (First 5 rows from 2016-2024 data):
Song

Save the combined cleaned data to a new CSV file

In [14]:
df_combined.to_csv('billboard_1964_2024_lyrics_cleaned.csv', index=False)
print("Combined cleaned data saved to 'billboard_1964_2024_lyrics_cleaned.csv'")
print(f"Number of unique records: {len(df_combined)}")

Combined cleaned data saved to 'billboard_1964_2024_lyrics_cleaned.csv'
Number of unique records: 6280


Generating a basic score for judging a song's performance

In [15]:
import pandas as pd


df = pd.read_csv('billboard_1964_2024_lyrics_cleaned.csv')

# Group by song, artist, and year to ensure uniqueness
grouped = df.groupby(['song', 'artist']).agg({
    'ranking': list,   # List of rankings across years
    'year': list,      # List of years charted
    'lyrics_cleaned': 'first'  # Keep first instance of lyrics
}).reset_index()


def calculate_mycs(rankings, alpha=0.2):
    """
    Compute Multi-Year Chart Score for a song.

    Args:
        rankings (list): List of rankings (1-100) across years
        alpha (float): Longevity weight (default: 0.2)

    Returns:
        float: MYCS score
    """
    # Yearly rank scores: (101 - rank)/100
    yearly_scores = [(101 - rank)/100 for rank in rankings]
    total_rank_score = sum(yearly_scores)

    # Longevity multiplier
    years = len(rankings)
    longevity_multiplier = 1 + alpha * (years - 1)

    return total_rank_score * longevity_multiplier

grouped['MYCS'] = grouped['ranking'].apply(
    lambda x: calculate_mycs(x, alpha=0.2)
)

grouped = grouped.sort_values('MYCS', ascending=False)


output = grouped[['song', 'artist', 'year', 'ranking', 'MYCS', 'lyrics_cleaned']]

# Save to CSV
output.to_csv('song_mycs_scores.csv', index=False)
print("Saved results to song_mycs_scores.csv")


print("\nTop 5 Songs by MYCS:")
print(output.head(5)[['song', 'artist', 'year', 'ranking', 'MYCS']])


Saved results to song_mycs_scores.csv

Top 5 Songs by MYCS:
                                 song         artist  \
156                   Blinding Lights     The Weeknd   
2788                    how do i live    leann rimes   
861                              Stay  Justin Bieber   
862                              Stay  The Kid Laroi   
71    All I Want for Christmas Is You   Mariah Carey   

                          year           ranking   MYCS  
156               [2020, 2021]            [1, 3]  2.376  
2788              [1997, 1998]            [9, 5]  2.256  
861               [2021, 2022]           [12, 3]  2.244  
862               [2021, 2022]           [12, 3]  2.244  
71    [2020, 2021, 2022, 2023]  [67, 78, 65, 55]  2.224  


A. Basic Statistical Features
Length of lyrics (word count, character count)

Unique word count (lexical richness)

Average word length

Stopword ratio (percentage of common words like "the", "and")

In [16]:

# Load the MYCS file
df_mycs = pd.read_csv('song_mycs_scores.csv')

df_mycs['lyrics_cleaned'] = df_mycs['lyrics_cleaned'].astype(str)

df_mycs['word_count'] = df_mycs['lyrics_cleaned'].apply(lambda x: len(x.split()))
df_mycs['char_count'] = df_mycs['lyrics_cleaned'].apply(len)

print(df_mycs[['song', 'word_count', 'char_count', 'MYCS']].head())


                              song  word_count  char_count   MYCS
0                  Blinding Lights         261        1174  2.376
1                    how do i live         279        1212  2.256
2                             Stay         423        1790  2.244
3                             Stay         423        1790  2.244
4  All I Want for Christmas Is You         391        1843  2.224


In [17]:
#Unique word count (lexical richness)
from collections import Counter

In [18]:
from collections import Counter


# Unique word count: words that appear only once
df_mycs['unique_word_count'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: sum(1 for count in Counter(x.split()).values() if count == 1)
)

# Optional: Store the actual unique words
df_mycs['unique_words'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: [word for word, count in Counter(x.split()).items() if count == 1]
)

# Distinct word count: total number of different words used (vocabulary size)
df_mycs['distinct_word_count'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: len(set(x.split()))
)

# Preview the updated DataFrame
print(df_mycs[['song', 'unique_word_count', 'distinct_word_count', 'word_count', 'MYCS']].head())


                              song  unique_word_count  distinct_word_count  \
0                  Blinding Lights                 43                   96   
1                    how do i live                 27                   66   
2                             Stay                 42                   88   
3                             Stay                 42                   88   
4  All I Want for Christmas Is You                 61                  113   

   word_count   MYCS  
0         261  2.376  
1         279  2.256  
2         423  2.244  
3         423  2.244  
4         391  2.224  


In [19]:
import numpy as np


# Calculate average word length
df_mycs['avg_word_length'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if x else 0
)

# Preview the result
print(df_mycs[['song', 'avg_word_length']].head(10))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


                              song  avg_word_length
0                  Blinding Lights         3.501916
1                    how do i live         3.340502
2                             Stay         3.234043
3                             Stay         3.234043
4  All I Want for Christmas Is You         3.716113
5                           Closer         3.985994
6                       Heat Waves         4.190955
7                        As It Was         3.529167
8                           Closer         3.985994
9                           smooth         3.788856


In [20]:
!pip install nltk



In [21]:
import nltk
from nltk.corpus import stopwords
import numpy as np

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Calculate stopword ratio for df_mycs
df_mycs['stopword_ratio'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: np.round(
        sum(1 for word in str(x).split() if word.lower() in stop_words) / max(len(str(x).split()), 1),
        2  # Round to 2 decimal places
    )
)

# Preview result
print(df_mycs[['song', 'stopword_ratio']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zachr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                              song  stopword_ratio
0                  Blinding Lights            0.43
1                    how do i live            0.56
2                             Stay            0.47
3                             Stay            0.47
4  All I Want for Christmas Is You            0.48


B. Linguistic & Sentiment Features
Sentiment Analysis (positive/negative sentiment score)

Emotion detection (anger, joy, sadness, etc.)

Lexical diversity (ratio of unique words to total words)


In [22]:
#Sentiment Analysis (positive/negative sentiment score)
from nltk.sentiment import SentimentIntensityAnalyzer
#Sentiment Analysis (positive/negative sentiment score)
nltk.download('vader_lexicon')

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment ratios
def sentiment_ratios(text):
    scores = sia.polarity_scores(str(text))  # Get sentiment scores
    total = scores['pos'] + scores['neg'] + scores['neu']  # Total sentiment score sum

    if total == 0:  # Avoid division by zero
        return (0, 0)

    pos_ratio = scores['pos'] / total  # Positive sentiment ratio
    neg_ratio = scores['neg'] / total  # Negative sentiment ratio
    neu_ratio = scores ['neu'] / total
    return pos_ratio, neg_ratio ,neu_ratio

# Apply function to each row
df_mycs[['pos_ratio', 'neg_ratio','neu_ratio']] = df_mycs['lyrics_cleaned'].apply(
    lambda x: pd.Series(sentiment_ratios(x))
)

print(df_mycs[['lyrics_cleaned', 'pos_ratio', 'neg_ratio', 'neu_ratio']].head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\zachr\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                      lyrics_cleaned  pos_ratio  neg_ratio  \
0  yeah ive been tryna call ive been on my own fo...      0.077      0.094   
1   how do i get through one night without you if...      0.039      0.071   
2  i do the same thing i told you that i never wo...      0.071      0.095   
3  i do the same thing i told you that i never wo...      0.071      0.095   
4  i dont want a lot for christmas there is just ...      0.103      0.086   

   neu_ratio  
0      0.829  
1      0.890  
2      0.834  
3      0.834  
4      0.811  


In [23]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

df_mycs['compound'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: sia.polarity_scores(str(x))['compound']
)
print(df_mycs[['lyrics_cleaned', 'compound']].head())

                                      lyrics_cleaned  compound
0  yeah ive been tryna call ive been on my own fo...   -0.1068
1   how do i get through one night without you if...   -0.1195
2  i do the same thing i told you that i never wo...   -0.8523
3  i do the same thing i told you that i never wo...   -0.8523
4  i dont want a lot for christmas there is just ...    0.9672


Lexical Diversity

In [24]:
# Lexical diversity: ratio of unique words (used only once) to total words
df_mycs['lexical_diversity'] = df_mycs.apply(
    lambda row: row['unique_word_count'] / row['word_count'] if row['word_count'] > 0 else 0,
    axis=1
)

# Preview result
print(df_mycs[['song', 'unique_word_count', 'word_count', 'lexical_diversity']].head())

                              song  unique_word_count  word_count  \
0                  Blinding Lights                 43         261   
1                    how do i live                 27         279   
2                             Stay                 42         423   
3                             Stay                 42         423   
4  All I Want for Christmas Is You                 61         391   

   lexical_diversity  
0           0.164751  
1           0.096774  
2           0.099291  
3           0.099291  
4           0.156010  


Syllable count

In [25]:
!pip install phonemizer



In [26]:
!pip install syllapy



In [27]:
!pip install espeak-ng

ERROR: Could not find a version that satisfies the requirement espeak-ng (from versions: none)
ERROR: No matching distribution found for espeak-ng


In [28]:
import syllapy

# Function to count syllables in a text
def count_syllables(text):
    # Split text into words, count syllables for each word, and sum them
    return sum(syllapy.count(word) for word in text.split())

# Ensure 'lyrics_cleaned' is treated as a string
df_mycs['lyrics_cleaned'] = df_mycs['lyrics_cleaned'].astype(str)

# Calculate syllable count
df_mycs['syllable_count'] = df_mycs['lyrics_cleaned'].apply(count_syllables)

# Preview result
print(df_mycs[['song', 'syllable_count']].head())


                              song  syllable_count
0                  Blinding Lights             299
1                    how do i live             335
2                             Stay             467
3                             Stay             467
4  All I Want for Christmas Is You             473


In [29]:
!pip install pronouncing



In [None]:
import pronouncing


def rhyme_count(lyrics):
    words = str(lyrics).lower().split()
    if not words or lyrics is None or pd.isna(lyrics):
        return 0

    # Get unique words to avoid overcounting repeats
    unique_words = set(words)
    rhyme_pairs = 0

    # Check each word against others for rhymes
    for i, word1 in enumerate(sorted(unique_words)):
        rhymes = pronouncing.rhymes(word1)
        for word2 in sorted(unique_words)[i+1:]:

            if word2 in rhymes:
                rhyme_pairs += 1

    return rhyme_pairs

df_mycs['rhyme_pairs'] = df_mycs['lyrics_cleaned'].apply(rhyme_count)

# Preview the results
print("First 5 rows with rhyme_pairs:")
print(df_mycs[['lyrics_cleaned', 'rhyme_pairs']].head())


In [None]:
df_mycs['rhyme_density'] = df_mycs['rhyme_pairs'] / df_mycs['distinct_word_count']

# Handle cases where distinct_word_count is 0 to avoid division-by-zero
df_mycs['rhyme_density'] = df_mycs['rhyme_density'].fillna(0).replace([float('inf')], 0)

# Preview the results
print("First 5 rows with rhyme_density:")
print(df_mycs[['lyrics_cleaned', 'rhyme_pairs', 'distinct_word_count', 'rhyme_density']].head())

First 5 rows with rhyme_density:
                                      lyrics_cleaned  rhyme_pairs  \
0  yeah ive been tryna call ive been on my own fo...           50   
1   how do i get through one night without you if...           23   
2  i do the same thing i told you that i never wo...           42   
3  i do the same thing i told you that i never wo...           42   
4  i dont want a lot for christmas there is just ...           42   

   distinct_word_count  rhyme_density  
0                   96       0.520833  
1                   66       0.348485  
2                   88       0.477273  
3                   88       0.477273  
4                  113       0.371681  


Rhyme pattern

TOPICS?

what words correlate most with mycs?

In [None]:
print(df_mycs.columns)


Index(['song', 'artist', 'year', 'ranking', 'MYCS', 'lyrics_cleaned',
       'word_count', 'char_count', 'unique_word_count', 'unique_words',
       'distinct_word_count', 'avg_word_length', 'stopword_ratio', 'pos_ratio',
       'neg_ratio', 'neu_ratio', 'compound', 'lexical_diversity',
       'syllable_count', 'rhyme_pairs', 'rhyme_density'],
      dtype='object')


Stopword Repetition Ratio   

In [None]:
# Count total stopwords
df_mycs['stopword_count'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: sum(1 for word in x.split() if word.lower() in stop_words)
)

# Count distinct stopwords
df_mycs['distinct_stopword_count'] = df_mycs['lyrics_cleaned'].apply(
    lambda x: len(set(word.lower() for word in x.split() if word.lower() in stop_words))
)

# Calculate stopword repetition ratio
df_mycs['stopword_repetition_ratio'] = df_mycs.apply(
    lambda row: round(
        1 - (row['distinct_stopword_count'] / row['stopword_count']),
        2
    ) if row['stopword_count'] > 0 else 0,
    axis=1
)

# Preview results
print(df_mycs[['song', 'word_count', 'stopword_count', 'distinct_stopword_count', 'stopword_ratio', 'stopword_repetition_ratio']].head())

                              song  word_count  stopword_count  \
0                  Blinding Lights         261             111   
1                    how do i live         279             156   
2                             Stay         423             199   
3                             Stay         423             199   
4  All I Want for Christmas Is You         391             186   

   distinct_stopword_count  stopword_ratio  stopword_repetition_ratio  
0                       32            0.43                       0.71  
1                       21            0.56                       0.87  
2                       30            0.47                       0.85  
3                       30            0.47                       0.85  
4                       29            0.48                       0.84  


True Repitition Ratio

In [None]:
from collections import Counter

def true_repetition_ratio(text):
    words = text.split()
    total = len(words)
    if total == 0:
        return 0
    counts = Counter(words)
    repeated_word_count = sum(count for word, count in counts.items() if count > 1)
    return round(repeated_word_count / total, 2)

df_mycs['true_repetition_ratio'] = df_mycs['lyrics_cleaned'].apply(true_repetition_ratio)

# Preview results
print(df_mycs[['song', 'word_count', 'true_repetition_ratio']].head())

                              song  word_count  true_repetition_ratio
0                  Blinding Lights         261                   0.84
1                    how do i live         279                   0.90
2                             Stay         423                   0.90
3                             Stay         423                   0.90
4  All I Want for Christmas Is You         391                   0.84


Vocab Redundancy Ratio 

In [None]:
df_mycs['vocab_redundancy_ratio'] = df_mycs.apply(
    lambda row: round(
        1 - (row['unique_word_count'] / row['word_count']),
        2
    ) if row['word_count'] > 0 else 0,
    axis=1
)

# Preview results
print(df_mycs[['song', 'word_count', 'unique_word_count', 'vocab_redundancy_ratio']].head())

                              song  word_count  unique_word_count  \
0                  Blinding Lights         261                 43   
1                    how do i live         279                 27   
2                             Stay         423                 42   
3                             Stay         423                 42   
4  All I Want for Christmas Is You         391                 61   

   vocab_redundancy_ratio  
0                    0.84  
1                    0.90  
2                    0.90  
3                    0.90  
4                    0.84  


Line Breaks

In [None]:
!pip install lyricsgenius 



In [None]:
import lyricsgenius as genius

api=genius.Genius('i5BgTB2QsuVYKN9SBeuNOQ5YkyzYtEEi_prbiJJwQPw689g9MDnJ3UcKJj5Ep2Mi')
# artist = api.search_artist("Mariah Carey", max_songs=3, sort="title")
# print(artist.songs)
song = api.search_song("Blinding Lights", "The Weeknd")
print(song.lyrics)

Searching for "Blinding Lights" by The Weeknd...
Done.
247 ContributorsTranslationsTürkçeSvenskaEspañolSlovenčinaРусскийPortuguêsPolskiNorsk (bokmål / riksmål)日本語ItalianoHebrewDeutschFrançaisفارسیNederlandsDanskShqipالعربيةСрпскиBlinding Lights Lyrics[Intro]
Yeah

[Verse 1]
I've been tryna call
I've been on my own for long enough
Maybe you can show me how to love, maybe
I'm goin' through withdrawals
You don't even have to do too much
You can turn me on with just a touch, baby

[Pre-Chorus]
I look around and
Sin City's cold and empty (Oh)
No one's around to judge me (Oh)
I can't see clearly when you're gone

[Chorus]
I said, ooh, I'm blinded by the lights
No, I can't sleep until I feel your touch
I said, ooh, I'm drowning in the night
Oh, when I'm like this, you're the one I trust
Hey, hey, hey

[Verse 2]
I'm running out of time
'Cause I can see the sun light up the sky
So I hit the road in overdrive, baby, oh


[Pre-Chorus]
The city's cold and empty (Oh)
No one's around to judge me (Oh

In [None]:
!pip install scikit-learn




In [None]:

# --- Topic Modeling using LDA ---
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize lyrics
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df_mycs['lyrics_cleaned'])

# Fit LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Display top words per topic
def display_topics(model, feature_names, no_top_words):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print()

display_topics(lda, vectorizer.get_feature_names_out(), 10)


Topic 1:
im know just love youre dont time ill like got

Topic 2:
love oh baby dont im know want just like wanna

Topic 3:
yeah like im got aint know ooh dont bitch just

Topic 4:
la da que ah ha te doo yo lo wild

Topic 5:
na dont like got come im shake rock uh let



In [None]:
import re
import pandas as pd

# Updated pronoun lists with contractions
first_person = [
    "i", "me", "my", "mine", "we", "us", "our", "ours",
    "i'm", "i've", "i'd", "i'll", "we're", "we've", "we'd", "we'll"
]
second_person = [
    "you", "your", "yours", "u",
    "you're", "you've", "you'd", "you'll"
]
third_person = [
    "he", "him", "his", "she", "her", "hers", "they", "them", "their", "theirs",
    "he's", "he'd", "he'll", "she's", "she'd", "she'll",
    "they're", "they've", "they'd", "they'll"
]
male_pronouns = ["he", "him", "his", "he's", "he'd", "he'll"]
female_pronouns = ["she", "her", "hers", "she's", "she'd", "she'll"]
gender_neutral_pronouns = ["they", "they're", "they've", "they'd", "they'll", "them", "their", "theirs", "themself", "themselves"]


def count_pronouns(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\s']", "", text)  # keep apostrophes for contractions
    words = text.split()

    fp = sum(word in first_person for word in words)
    sp = sum(word in second_person for word in words)
    tp = sum(word in third_person for word in words)
    male = sum(word in male_pronouns for word in words)
    female = sum(word in female_pronouns for word in words)
    total = fp + sp + tp

    return pd.Series({
        "first_person": fp,
        "second_person": sp,
        "third_person": tp,
        "male_pronouns": male,
        "female_pronouns": female,
        "total_pronouns": total
    })

# Apply to your lyrics
pronoun_features = df_mycs["lyrics_cleaned"].apply(count_pronouns)
df_mycs = pd.concat([df_mycs, pronoun_features], axis=1)
df_mycs = df_mycs.loc[:, ~df_mycs.columns.duplicated()]  # just in case

# Compute ratios safely
df_mycs["pronoun_word_ratio"] = df_mycs["total_pronouns"] / df_mycs["word_count"].replace(0, 1)
df_mycs["first_person_ratio"] = df_mycs["first_person"] / df_mycs["total_pronouns"].replace(0, 1)
df_mycs["second_person_ratio"] = df_mycs["second_person"] / df_mycs["total_pronouns"].replace(0, 1)
df_mycs["third_person_ratio"] = df_mycs["third_person"] / df_mycs["total_pronouns"].replace(0, 1)
df_mycs["male_pronoun_ratio"] = df_mycs["male_pronouns"] / df_mycs["total_pronouns"].replace(0, 1)
df_mycs["female_pronoun_ratio"] = df_mycs["female_pronouns"] / df_mycs["total_pronouns"].replace(0, 1)

# Preview the results
print(df_mycs[[
    "song", "pronoun_word_ratio", "first_person_ratio", "second_person_ratio",
    "third_person_ratio", "male_pronoun_ratio", "female_pronoun_ratio"
]].head())


                               song  pronoun_word_ratio  first_person_ratio  \
0                   Blinding Lights            0.145594            0.710526   
1                     how do i live            0.261649            0.657534   
2                              Stay            0.236407            0.610000   
3                              Stay            0.236407            0.610000   
4   All I Want for Christmas Is You            0.184143            0.652778   
5                            Closer            0.204482            0.643836   
6                        Heat Waves            0.133166            0.396226   
7                         As It Was            0.108333            0.346154   
8                            Closer            0.204482            0.643836   
9                            smooth            0.140762            0.479167   
10                       Bad Habits            0.139108            0.773585   
11                you make me wanna            0.185