In [8]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

In [2]:
# Checkpoint identifier shared by the tokenizer, model and config; naming it
# once guarantees all three are loaded from the same source.
MODEL_NAME = "juliensimon/autonlp-song-lyrics-18753417"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# The config provides the id2label mapping used to turn class indices into names.
config = AutoConfig.from_pretrained(MODEL_NAME)

In [3]:
# Input file and the number of leading rows to keep for genre tagging.
DATA_PATH = "data/processed_spotify_millsongdata.csv"
MAX_ROWS = 25000

# nrows stops parsing after MAX_ROWS rows instead of loading the entire file
# and then discarding everything past the first MAX_ROWS.
df = pd.read_csv(DATA_PATH, nrows=MAX_ROWS)

In [4]:
def get_genres(text, threshold=0.08):
    """Predict genre labels for a piece of song lyrics.

    The text is tokenized with truncation enabled, scored by the module-level
    ``model``, and the scores are converted to probabilities with a softmax.
    Every label whose probability is at least ``threshold`` is kept.

    Parameters
    ----------
    text : str
        Lyrics to classify.
    threshold : float, default 0.08
        Minimum softmax probability a label needs in order to be included.

    Returns
    -------
    str
        Comma-separated label names, ordered from most to least probable.
        Empty string when no label reaches ``threshold`` or when ``text`` is
        not a string (e.g. NaN/None from a missing cell).
    """
    # Missing values in a DataFrame column arrive as NaN/None; the tokenizer
    # only accepts strings, so report "no genres" instead of raising.
    if not isinstance(text, str):
        return ''

    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: disable autograd so no gradient graph is built per call.
    with torch.no_grad():
        outputs = model(**inputs)

    # First output element, first row of the batch (a single text was passed).
    probs = softmax(outputs[0][0].detach().numpy())
    # argsort is ascending; reverse it to walk labels from most to least probable.
    ranked_ids = np.argsort(probs)[::-1]

    return ','.join(
        config.id2label[int(idx)] for idx in ranked_ids if probs[idx] >= threshold
    )

In [5]:
# Tag every song with its predicted genres. get_genres is passed directly;
# wrapping it in a lambda only added an extra call per row.
df["genres"] = df["cleaned_text"].apply(get_genres)

In [6]:
# Preview the first five rows to spot-check the new "genres" column.
df.head(n=5)

Unnamed: 0,artist,song,cleaned_text,genres
0,ABBA,Ahe's My Kind Of Girl,"look at her face , it 's a wonderful face and ...","Rock,Pop"
1,ABBA,"Andante, Andante","take it easy with me , please touch me gently ...","Pop,Dance,Rock"
2,ABBA,As Good As New,i 'll never know why i had to go why i had to ...,"Pop,Rock"
3,ABBA,Bang,making somebody happy is a question of give an...,"Pop,Rock,Dance"
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...,"Pop,Rock"


In [7]:
# Write the genre-annotated frame to disk; index=False leaves the row index out.
output_csv = 'data/spotify_millsongdata_w_genres.csv'
df.to_csv(output_csv, index=False)