In [1]:
import pandas as pd
import re
import numpy as np
import re
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
from gensim.parsing.preprocessing import remove_stopwords




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Setting up model

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping: a tab-separated text file. The second field of
# every row that has more than one field is kept as a label name, in file
# order, so that labels[i] can be looked up by class index later on.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]


model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# save_pretrained(MODEL) writes into a local directory whose path equals the
# hub id. On the next run from_pretrained(MODEL) resolves to that directory
# instead of the hub, so the tokenizer files must be stored there as well;
# otherwise re-running this cell fails when loading the tokenizer.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [4]:
# Load the song/lyrics table from a CSV path relative to the working directory.
df= pd.read_csv("data/songs_lyrics_filtered1959.csv")

In [5]:
def clean_text(text):
    """Normalize raw lyrics into lowercase text with annotations stripped.

    Steps, in order: newlines become spaces; commas, periods, exclamation
    marks and question marks are deleted; bracketed spans such as [Intro],
    any token containing a digit, and parentheses are each replaced by a
    single space; the text is lowercased; finally the standalone words
    "chorus", "verse" and "intro" are deleted. Runs of spaces left behind
    by these steps are not collapsed.

    Args:
        text: The raw lyrics string.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs applied one after another, before lowercasing.
    substitutions = (
        (r'[,\.!?]', ''),
        (r'\[.*?\]', ' '),
        (r'\w*\d\w*', ' '),
        (r'[()]', ' '),
    )

    cleaned = ' '.join(text.split('\n'))
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Section words are removed after lowercasing so every casing is caught.
    return re.sub(r'\b(chorus|verse|intro)\b', '', cleaned.lower())

In [6]:
# Cast every lyrics entry to str, run clean_text on it, and store the result
# in a new 'lyrics_clean' column; the trailing expression displays the frame.
df['lyrics_clean'] = (
    df['lyrics']
    .astype(str)
    .apply(clean_text)
)
df

Unnamed: 0.1,Unnamed: 0,song_id,artist,title,tag,year,lyrics,lyrics_clean
0,0,lil_wayne_mr_carter,Lil Wayne,Mr. Carter,rap,2008,[Produced by Infamous and Drew Correa]\n\n[Int...,yo yo drew and inf this-this this right h...
1,1,birdman_pop_bottles,Birdman,Pop Bottles,rap,2007,"[Hook: Jadakiss, Lil Wayne & Birdman]\nStart w...",start with straight shots and then pop bottl...
2,2,lil_wayne_fireman,Lil Wayne,Fireman,rap,2005,"[Intro]\n(Weezy Baby)\nShh, the fireman comin'...",weezy baby shh the fireman comin' yeah yea...
3,3,ol_dirty_bastard_brooklyn_zoo,Ol' Dirty Bastard,Brooklyn Zoo,rap,1995,[Produced by True Master & Ol' Dirty Bastard]\...,shit word i'll bust that nigga ass right ...
4,4,lil_wayne_a_milli,Lil Wayne,A Milli,rap,2008,[Intro]\nBangladesh\nYoung Money!\nYou dig?\nM...,bangladesh young money you dig mack i'm goin...
...,...,...,...,...,...,...,...,...
20514,20544,jack_harlow_first_class,Jack Harlow,First Class,rap,2022,[Chorus: Jack Harlow & Fergie]\nMm\nI been a G...,mm i been a g throw up the l sex in the am u...
20515,20545,mary_macgregor_for_a_while,Mary Macgregor,For a While,pop,1976,[Verse 1]\nI think I'll stay around here for a...,i think i'll stay around here for a while i ...
20516,20546,mary_macgregor_dancin_like_lovers,Mary Macgregor,Dancin Like Lovers,pop,1980,[Verse 1]\nThe music's playing softly in the s...,the music's playing softly in the summer nig...
20517,20547,42_dugg_thump_shit,42 Dugg & EST Gee,Thump Shit,rap,2022,"[Intro: 42 Dugg]\nFree them boys, we them (Me ...",free them boys we them me and spiff ayy fr...


In [7]:
# Removing stopwords: pass each cleaned lyric through gensim's remove_stopwords
# and overwrite the 'lyrics_clean' column with the result.
df['lyrics_clean'] = (
    df['lyrics_clean']
    .astype(str)
    .apply(remove_stopwords)
)

In [8]:
# Draw a fixed-seed random sample of 100 rows from df.
# NOTE: the sample is a separate object taken at this point, so columns added
# to df afterwards do not appear in df_test automatically.
df_test = df.sample(n=100, random_state=42)

In [None]:
# Per-song class probabilities, collected in DataFrame row order.
positive_scores, neutral_scores, negative_scores = [], [], []

for lyrics in df['lyrics_clean']:
    # Tokenize a single lyric; input beyond 512 tokens is truncated.
    encoded_input = tokenizer(lyrics, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # First element of the model output, first (only) item of the batch,
    # turned into probabilities with softmax.
    logits = model(**encoded_input)[0][0].detach().numpy()
    probabilities = softmax(logits)

    # Any label the mapping does not provide keeps its 0.0 default.
    score_by_label = {'positive': 0.0, 'neutral': 0.0, 'negative': 0.0}

    # Walk the classes from highest to lowest probability and record the
    # rounded probability under the matching label name.
    for class_index in np.argsort(probabilities)[::-1]:
        label_name = labels[class_index]
        if label_name in score_by_label:
            score_by_label[label_name] = np.round(float(probabilities[class_index]), 4)

    positive_scores.append(score_by_label['positive'])
    neutral_scores.append(score_by_label['neutral'])
    negative_scores.append(score_by_label['negative'])

# Attach the collected scores to df as three new columns.
df['positive_score'] = positive_scores
df['neutral_score'] = neutral_scores
df['negative_score'] = negative_scores


In [21]:
def calculate_compound_score(positive_scores, neutral_scores, negative_scores):
    """Collapse three per-class scores into a single signed value per item.

    Each argument is converted with ``np.array``; the three arrays are
    stacked and transposed so the class axis comes last, then combined by a
    dot product with the weights +1.0 (positive), 0.0 (neutral) and
    -1.0 (negative).

    Args:
        positive_scores: Positive-class scores.
        neutral_scores: Neutral-class scores.
        negative_scores: Negative-class scores.

    Returns:
        The result of the dot product: one value per item for 1-D inputs.
    """
    # POSITIVE: 1.0, NEUTRAL: 0.0, NEGATIVE: -1.0
    class_weights = np.array([1.0, 0.0, -1.0])

    per_class = [
        np.array(column)
        for column in (positive_scores, neutral_scores, negative_scores)
    ]

    # Transpose so each row holds (positive, neutral, negative) for one item.
    return np.array(per_class).T.dot(class_weights)

In [22]:
# df_test was sampled from df before the score columns were added to df, and
# DataFrame.sample returns a new object, so those columns can be absent here.
# Bring any missing ones across from df (aligned on the index) before they
# are combined; this is skipped when df_test already carries them.
score_columns = ['positive_score', 'neutral_score', 'negative_score']
missing_columns = [col for col in score_columns if col not in df_test.columns]
if missing_columns:
    df_test = df_test.join(df[missing_columns])

df_test['compound_score'] = calculate_compound_score(df_test['positive_score'], df_test['neutral_score'], df_test['negative_score'])


In [23]:
# Display the sampled rows together with their score columns.
df_test

Unnamed: 0.1,Unnamed: 0,song_id2,artist,title,tag,year,lyrics,lyrics_clean,positive_score,neutral_score,negative_score,compound_score
5864,5864,willie_nelson_blue_eyes_crying_in_the_rain,Willie Nelson,Blue Eyes Crying in the Rain,country,1975,[Verse]\nIn the twilight glow I see them\nBlue...,twilight glow blue eyes crying rain kissed goo...,0.3147,0.5535,0.1318,0.1829
3721,3721,pnk_glitter_in_the_air,P!nk,Glitter in the Air,pop,2010,[Verse 1]\nHave you ever fed a lover with just...,fed lover hands closed eyes trusted trusted th...,0.1705,0.5498,0.2797,-0.1092
3528,3528,tim_mcgraw_its_your_love,Tim McGraw,Its Your Love,country,1997,"[Verse 1: Tim McGraw]\nDancin' in the dark, mi...",dancin' dark middle night takin' heart holdin'...,0.6325,0.3415,0.0260,0.6065
11920,11920,player_baby_come_back,Player,Baby Come Back,rock,1977,"[Verse 1]\nSpending all my nights, all my mone...",spending nights money going town mind morning ...,0.0662,0.3798,0.5541,-0.4879
474,474,al_green_lets_stay_together,Al Green,Lets Stay Together,rb,1971,"[Intro]\nLet's stay together\n\n[Verse 1]\nI, ...",let's stay i'm love want alright 'cause feel b...,0.4889,0.4215,0.0896,0.3993
...,...,...,...,...,...,...,...,...,...,...,...,...
13814,13814,eddie_kendricks_happy,Eddie Kendricks,Happy,pop,1975,Happy make you happy baby\nI just wanna make y...,happy happy baby wanna happy happy baby ain't ...,0.8414,0.1360,0.0226,0.8188
18771,18771,taylor_swift_lover,Taylor Swift,Lover,country,2019,[Verse 1]\nWe could leave the Christmas lights...,leave christmas lights 'til january place rule...,0.1898,0.5757,0.2345,-0.0447
6808,6808,james_blunt_stay_the_night,James Blunt,Stay the Night,pop,2010,"[Intro]\nOh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh-oh...",oh-oh-oh-oh oh-oh-oh-oh oh-oh-oh-oh hey it’s d...,0.3363,0.5939,0.0698,0.2665
20529,20529,jerry_butler_walk_easy_my_son,Jerry Butler,Walk Easy My Son,rb,1971,When I was a very small boy (Just a small boy)...,small boy small boy looked cornfields wonder o...,0.0406,0.3326,0.6268,-0.5862
