In [1]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import re
from numpy.random import randint 
import urllib.parse
from gensim.parsing.preprocessing import remove_stopwords
import csv





  from .autonotebook import tqdm as notebook_tqdm


## Data loading and initial cleaning

In [None]:
df = pd.read_csv("data/song_lyrics.csv")

df = df[df['language']=="en"]


In [None]:
df = df[df['language']=="en"]


In [None]:
df.shape


In [None]:
df.columns

In [None]:
df.dtypes

In [None]:

df = pd.read_csv("data/en_song_lyrics.csv")

In [None]:
# checking null values 

df.isna().sum()

In [None]:
# Dropping null values

df= df.dropna(how="any")

In [None]:
# Checking for duplicates

df.duplicated().sum()

In [None]:
df.shape

In [None]:
years = df['year'].unique()
years

In [None]:
df.to_csv("data/clean_en_song_lyrics.csv")

# FURTHER DATA CLEANING AND FILTERING

### Creating "song_id" from title and artist name

In [2]:
df=pd.read_csv("data/clean_en_song_lyrics.csv")



In [5]:
df

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en
...,...,...,...,...,...,...,...,...,...,...,...
3374066,Everything Is Alright Now,pop,Chuck Bernard,2013,2,{},"Everything is alright now\nOh yes, baby\nEvery...",7882838,en,en,en
3374067,White Lies,pop,ElementD,2019,1,"{""Harley Bird""}",[Verse 1]\nHalf truth and half you\nDidn't we ...,7882840,en,en,en
3374068,Ocean,pop,Effemar,2022,3,{},[Verse 1]\nDance for me now\nKeeping yourself ...,7882842,en,en,en
3374069,Raise Our Hands,pop,"Culture Code, Pag & Mylo",2016,3,"{Elex,""Culture Code / Pag & Mylo""}",[Verse 1]\nHere our purpose feels alive\nWe ar...,7882845,en,en,en


In [4]:
df=df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

In [6]:
# Standardize strings in 'artist' and 'title' columns
df['artist_clean'] = df['artist'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")
df['title_clean'] = df['title'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")

# Combine 'artist' and 'title' to create 'song_id' column
df['song_id'] = (df['artist_clean'] + '_' + df['title_clean']).str.replace(' ', '_')


In [7]:
#Importing scraped weekly top 100 billboard songs

df_top = pd.read_csv("data/billboard_weekly_1958-2024.csv")

In [8]:
df_top.shape

(30045, 4)

In [9]:
df.shape

(3374071, 14)

In [10]:
# Standardize strings in 'artist' and 'title' columns
df_top['artist_clean'] = df_top['artist'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")
df_top['title_clean'] = df_top['title'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")

# Combine 'artist' and 'title' to create 'song_id' column
df_top['song_id'] = (df_top['artist_clean'] + '_' + df_top['title_clean']).str.replace(' ', '_')

In [11]:
df_filtered = df[df['song_id'].isin(df_top['song_id'])].reset_index(drop=True)
print(f"Percentage of songs matched:", round((len(df_filtered)/len(df_top)*100),2))

Percentage of songs matched: 60.23


In [12]:
# Find songs in df_top that are not present in df
missing_songs_top = df_top[~df_top['song_id'].isin(df['song_id'])]

print("Songs in df_top that are not present in df:")
print(missing_songs_top[['artist_clean', 'title_clean']])


Songs in df_top that are not present in df:
                                            artist_clean  \
0                                          groove holmes   
1                                          groove holmes   
3                                          pookie hudson   
13                                     weird al yankovic   
16                                           til tuesday   
...                                                  ...   
30040                            william and nicki minaj   
30041                    william featuring justin bieber   
30042   william featuring mick jagger and jennifer lopez   
30043                      william featuring miley cyrus   
30044  william featuring miley cyrus french montana w...   

                     title_clean  
0                          misty  
1               what now my love  
3                  i know i know  
13               white and nerdy  
16     (believed you were) lucky  
...                      

In [None]:
# Investigating missing songs: 

search_word = "rolling"
artist = "eminem"


matching_titles = df_top[(df_top['artist'].str.contains(search_word, case=False))]
# df_top[(df_top['artist'].fillna('').str.lower() == artist.lower()) & 
                     
matching_titles

In [13]:
# Need to deal with 'featuring' and "and" - extract main artist

def extract_main_artist(artist):
    if 'and' in artist:
        return artist.split(' and ')[0].strip()
    elif 'featuring' in artist:
        return artist.split(' featuring ')[0].strip()
    elif 'feat' in artist:
        return artist.split(' feat ')[0].strip()
    elif 'feat.' in artist:
        return artist.split(' feat.')[0].strip()
    elif 'with' in artist:
        return artist.split(' with ')[0].strip()
    elif ',' in artist:
        return artist.split(',')[0].strip()
    else:
        return artist
    



In [14]:
# Apply the function to create 'main_artist' column in df_top DataFrame
df_top['main_artist'] = df_top['artist_clean'].apply(extract_main_artist)

# Apply the function to create 'main_artist' column in df DataFrame
df['main_artist'] = df['artist_clean'].apply(extract_main_artist)

In [15]:
# make new ids

df_top['song_id2'] = (df_top['main_artist'] + '_' + df_top['title_clean']).str.replace(' ', '_')
df['song_id2'] = (df['artist_clean'] + '_' + df['title_clean']).str.replace(' ', '_')

In [16]:
df_filtered2 = df[df['song_id2'].isin(df_top['song_id2'])].reset_index(drop=True)
print(f"Percentage of songs matched:", round((len(df_filtered2)/len(df_top)*100),2))

Percentage of songs matched: 63.12


In [17]:
df_filtered2.shape

(18964, 16)

In [18]:
#Investigate again:

# Find songs in df_top that are not present in df
missing_songs_top = df_top[~df_top['song_id2'].isin(df['song_id2'])]

print("Songs in df_top that are not present in df:")
print(missing_songs_top[['main_artist', 'title']])

Songs in df_top that are not present in df:
                                             main_artist  \
0                                          groove holmes   
1                                          groove holmes   
3                                          pookie hudson   
13                                     weird al yankovic   
15                                                  $not   
...                                                  ...   
30039                                            william   
30040                                            william   
30041                                            william   
30042                      william featuring mick jagger   
30044  william featuring miley cyrus french montana w...   

                          title  
0                         Misty  
1              What Now My Love  
3                 I Know I Know  
13                White & Nerdy  
15                         Doja  
...                         ...

In [19]:
# Apply the function to create 'main_artist' column in df_top DataFrame
df_top['main_artist'] = df_top['main_artist'].apply(extract_main_artist)

# Apply the function to create 'main_artist' column in df DataFrame
df['main_artist'] = df['main_artist'].apply(extract_main_artist)

In [20]:
df_top['song_id2'] = (df_top['main_artist'] + '_' + df_top['title_clean']).str.replace(' ', '_')
df['song_id2'] = (df['main_artist'] + '_' + df['title_clean']).str.replace(' ', '_')

In [21]:
df_filtered3 = df[df['song_id2'].isin(df_top['song_id2'])].reset_index(drop=True)
print(f"Percentage of songs matched:", round((len(df_filtered3)/len(df_top)*100),2))

Percentage of songs matched: 69.23


Improvement 68% -> 69.23%

In [22]:
df_filtered3.shape

(20801, 16)

In [24]:
df_filtered3.to_csv("data/songs_filtered.csv")

### Text preprocessing

In [None]:
def clean_text(text):

    text = text.replace('\n', ' ')
    text = re.sub(r'[,\.!?]', '', text)
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\w*\d\w*',' ', text)
    text = re.sub(r'[()]', ' ', text)
    text = text.lower()
    text = re.sub(r'\b(chorus|verse|intro)\b', '', text)
    return text

In [None]:
df_filtered3['lyrics_clean'] = df_filtered3['lyrics'].astype(str).apply(lambda x: clean_text(x))
df_filtered3

In [None]:
# Removing stopwords

df_filtered3['lyrics_clean'] = df_filtered3['lyrics_clean'].astype(str).apply(lambda x: remove_stopwords(x))

## Running Roberta Base Sentiment Model on lyrics

In [None]:
# Setting up model

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]


model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

In [None]:
df_test = df_filtered3.sample(n=100, random_state=42)

In [None]:
positive_scores = []
neutral_scores = []
negative_scores = []

for index, row in df_test.iterrows():
    lyrics = row['lyrics_clean']
    
    encoded_input = tokenizer(lyrics, return_tensors='pt', max_length=512, truncation=True, padding=True)
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    
    # Initialize scores for each label
    positive_score = 0.0
    neutral_score = 0.0
    negative_score = 0.0
    
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        if l == 'positive':
            positive_score = np.round(float(s), 4)
        elif l == 'neutral':
            neutral_score = np.round(float(s), 4)
        elif l == 'negative':
            negative_score = np.round(float(s), 4)
    
    # Append scores to respective lists
    positive_scores.append(positive_score)
    neutral_scores.append(neutral_score)
    negative_scores.append(negative_score)

# Add the score columns to the DataFrame
df_test['positive_score'] = positive_scores
df_test['neutral_score'] = neutral_scores
df_test['negative_score'] = negative_scores


In [49]:
def calculate_compound_score(positive_scores, neutral_scores, negative_scores):
    # Convert scores to numpy arrays
    positive_scores = np.array(positive_scores)
    neutral_scores = np.array(neutral_scores)
    negative_scores = np.array(negative_scores)
    
    # Define weights
    weights = np.array([1.0, 0.0, -1.0])  # POSITIVE: 1.0, NEUTRAL: 0.0, NEGATIVE: -1.0
    
    # Transpose the scores array to align dimensions for dot product
    scores_array = np.array([positive_scores, neutral_scores, negative_scores]).T
    
    # Calculate compound score
    compound_score = np.dot(scores_array, weights)
    
    return compound_score



In [52]:
df2 = df.copy()

df2['compound_score'] = calculate_compound_score(df_test['positive_score'], df_test['neutral_score'], df_test['negative_score'])


NameError: name 'df_test' is not defined

In [None]:
df_test

## Checking matching 

In [28]:
df_top.shape

(30045, 9)

In [26]:
df_top3 = df_top.drop_duplicates(
    subset=['song_id2'],
    keep='first'
).reset_index(drop=True)

df_top3.shape


(30034, 9)

In [30]:
len(df_top)

30045

In [31]:
df_top2 = df_top[df_top['song_id2'].isin(df_filtered3['song_id2'])].reset_index(drop=True)


In [32]:
df_filtered3.shape

(20801, 16)

In [33]:
unique_top2 = df_top2['song_id2'].unique()
unique_df_f3 = df_filtered3['song_id2'].unique()

In [34]:
print(unique_top2.size)
print(unique_df_f3.size)

20549
20549


In [35]:
duplicate_count1 = df_top2['song_id2'].duplicated().sum()
duplicate_count2 = df_filtered3['song_id2'].duplicated().sum()

print(duplicate_count1)

print(duplicate_count2)

9
252


In [36]:
df_top2 = df_top2.drop_duplicates(
    subset=['song_id2'],
    keep='first'
).reset_index(drop=True)

In [37]:
df_filtered3 = df_filtered3.drop_duplicates(
    subset=['song_id2'],
    keep='first'
).reset_index(drop=True)



In [38]:
duplicate_count1 = df_top2['song_id2'].duplicated().sum()
duplicate_count2 = df_filtered3['song_id2'].duplicated().sum()

print(duplicate_count1)

print(duplicate_count2)

0
0


In [44]:
selected_columns = ['song_id2','artist','title','tag' ,'year', 'lyrics']

df_filtered3 = df_filtered3[selected_columns]

In [45]:
df_filtered3 = df_filtered3.rename(columns={"song_id2": "song_id"})


In [47]:
df_filtered3.to_csv("data/songs_lyrics_filtered.csv")

In [41]:
selected_columns2 = ['song_id','artist','title','date','year']
df_top2 = df_top2[selected_columns2]
df_top2 = df_top2.rename(columns={"song_id2": "song_id"})

In [42]:
df_top2.to_csv("data/popular_songs.csv")

In [43]:
df_top2

Unnamed: 0,song_id,artist,title,date,year
0,little_jimmy_dickens_may_the_bird_of_paradise_...,"""Little"" Jimmy Dickens",May The Bird Of Paradise Fly Up Your Nose,1965-12-14,1965
1,weird_al_yankovic_amish_paradise,"""Weird Al"" Yankovic",Amish Paradise,1996-07-07,1996
2,weird_al_yankovic_canadian_idiot,"""Weird Al"" Yankovic",Canadian Idiot,2006-11-04,2006
3,weird_al_yankovic_eat_it,"""Weird Al"" Yankovic",Eat It,1984-05-26,1984
4,weird_al_yankovic_fat,"""Weird Al"" Yankovic",Fat,1988-05-28,1988
...,...,...,...,...,...
20544,twenty_one_pilots_stressed_out,twenty one pilots,Stressed Out,2016-10-01,2016
20545,twenty_one_pilots_tear_in_my_heart,twenty one pilots,Tear In My Heart,2015-08-08,2015
20546,william_i_got_it_from_my_mama,will.i.am,I Got It From My Mama,2007-10-20,2007
20547,william_its_a_new_day,will.i.am,It's A New Day,2009-02-07,2009
