In [1]:
# from google.colab import files
# uploaded = files.upload()

Saving tracks.csv to tracks.csv


## Import Libraries

In [2]:
!python3 -m pip install "dask[complete]"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!python3 -m pip install dask-ml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dask-ml
  Downloading dask_ml-2023.3.24-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.7/148.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dask-glm>=0.2.0 (from dask-ml)
  Downloading dask_glm-0.2.0-py2.py3-none-any.whl (12 kB)
Installing collected packages: dask-glm, dask-ml
Successfully installed dask-glm-0.2.0 dask-ml-2023.3.24


In [4]:
# import relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import f1_score

In [5]:
np.__version__

'1.22.4'

## Data Cleaning for Spotify Dataset

In [6]:
# import relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import f1_score

# import the dataset
spotify_tracks = pd.read_csv('/content/tracks.csv')

# drop duplicates with the same name and artist
spotify_tracks = spotify_tracks.drop_duplicates(
  subset = ['name', 'artists'],
  keep = 'last').reset_index(drop = True)

# convert release_time to appropriate time date format
spotify_tracks['release_date']= pd.to_datetime(spotify_tracks['release_date'], format='%Y-%m-%d')

# remove songs older than 1990
spotify_tracks = spotify_tracks[spotify_tracks['release_date'].dt.year >= 1990]

# change duration from ms to minutes
spotify_tracks['duration_ms'] = spotify_tracks['duration_ms']/60000

# rearrange columns
spotify_tracks = spotify_tracks[['id',
        'name',
        'artists',
 'id_artists',
 'release_date',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'popularity',
]]

# reset index
spotify_tracks = spotify_tracks.reset_index(drop=True)

# identify IQR for duration and remove outliers
Q1 = np.percentile(spotify_tracks['duration_ms'], 25,
                   method = 'midpoint')
Q3 = np.percentile(spotify_tracks['duration_ms'], 75,
                   method = 'midpoint')
IQR = Q3 - Q1
upper = Q3 + 1.5*IQR
lower = Q1 - 1.5*IQR
upper_array=np.where(spotify_tracks['duration_ms']>=upper)
lower_array=np.where(spotify_tracks['duration_ms']<=lower)

spotify_tracks.drop(upper_array[0],inplace=True)
spotify_tracks.drop(lower_array[0],inplace=True)

# remove songs with time signature = 0, 1
spotify_tracks = spotify_tracks[(spotify_tracks['time_signature'] != 0) & 
                                (spotify_tracks['time_signature'] !=1)]

# remove songs with high speechiness like talk shows, audio books, poetry
spotify_tracks = spotify_tracks[spotify_tracks['speechiness']<0.8]

# remove songs with live audiences
spotify_tracks = spotify_tracks[spotify_tracks['liveness']<0.9]

# drop the artist_id, since we have the artist name
spotify_tracks.drop(columns = ['id', 'id_artists'], inplace=True)

# drop all null values
spotify_tracks = spotify_tracks.dropna()

# separate releasedate to month and year and drop releasedate
spotify_tracks['month'] = pd.DatetimeIndex(spotify_tracks['release_date']).month
spotify_tracks['year'] = pd.DatetimeIndex(spotify_tracks['release_date']).year
spotify_tracks.drop(columns = ['release_date'], axis = 1, inplace=True)

# it seems like energy/loudness, as well as loudness/acousticness are correlated, and energy/acousticness; decide to remove acousticness and loudness
spotify_tracks.drop(columns = ['loudness', 'acousticness'], inplace=True)

# ensure that song name and artist name is a string
spotify_tracks['name'] = spotify_tracks['name'].astype(str)
spotify_tracks['artists'] = spotify_tracks['artists'].astype(str)

# remove all non alphanumeric characters in song name and artists
spotify_tracks['name'] = spotify_tracks['name'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)
spotify_tracks['artists'] = spotify_tracks['artists'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)

# remove extra spaces in song name and artists
spotify_tracks['name'] = spotify_tracks['name'].replace(r'\s\s+', ' ', regex=True)
spotify_tracks['artists'] = spotify_tracks['artists'].replace(r'\s\s+', ' ', regex=True)

# remove all special characters, including punctuation
spotify_tracks['name'] = spotify_tracks['name'].replace(r'[^\w\s]|_', '', regex=True)
spotify_tracks['artists'] = spotify_tracks['artists'].replace(r'[^\w\s]|_', '', regex=True)

# make all characters in song name and artist lowercase
spotify_tracks['name'] = spotify_tracks.name.apply(lambda x: x.lower())
spotify_tracks['artists'] = spotify_tracks.artists.apply(lambda x: x.lower())

# length of spotify_tracks
len(spotify_tracks)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_tracks['month'] = pd.DatetimeIndex(spotify_tracks['release_date']).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_tracks['year'] = pd.DatetimeIndex(spotify_tracks['release_date']).year
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_tracks.drop(columns = ['release_date'], axis = 1, inplace=True)
A value is trying to be set on

265669

## Data Cleaning for Billboard Dataset

In [7]:
# import the dataset
billboard_tracks = pd.read_csv('/content/charts.csv')

# convert release_time to appropriate time date format
billboard_tracks['release_date']= pd.to_datetime(billboard_tracks['date'], format='%Y-%m-%d')

# remove songs older than 1990
billboard_tracks = billboard_tracks[billboard_tracks['release_date'].dt.year >= 1990]

# remove all fields other than song, rank, and artist
billboard_tracks.drop(columns = ['date', 'last-week', 'peak-rank', 'weeks-on-board', 'release_date'], inplace=True)

# ensure that song name and artist is a string
billboard_tracks['song'] = billboard_tracks['song'].astype(str)
billboard_tracks['artist'] = billboard_tracks['artist'].astype(str)

# remove all non alphanumeric characters in song name and artist
billboard_tracks['song'] = billboard_tracks['song'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)
billboard_tracks['artist'] = billboard_tracks['artist'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)

# remove extra spaces in song name and artist
billboard_tracks['song'] = billboard_tracks['song'].replace(r'\s\s+', ' ', regex=True)
billboard_tracks['artist'] = billboard_tracks['artist'].replace(r'\s\s+', ' ', regex=True)

# remove all special characters, including punctuation
billboard_tracks['song'] = billboard_tracks['song'].replace(r'[^\w\s]|_', '', regex=True)
billboard_tracks['artist'] = billboard_tracks['artist'].replace(r'[^\w\s]|_', '', regex=True)

# make all characters in song name lowercase
billboard_tracks['song'] = billboard_tracks.song.apply(lambda x: x.lower())
billboard_tracks['artist'] = billboard_tracks.artist.apply(lambda x: x.lower())

# length of billboard_tracks
len(billboard_tracks)

166200

## Joining the two datasets

In [8]:
# ensure that columns we join on are the same
spotify_tracks.rename(columns={'artists': 'artist', 'name': 'song'}, inplace=True)
print("Number of songs in Spotify data:", len(spotify_tracks))
print("Number of songs in Billboard data:", len(billboard_tracks))

# perform left join
combined_tracks = spotify_tracks.merge(billboard_tracks, how = 'left', on = ['song', 'artist'])

# replace nan values with zero, if there is no matches from the merge
combined_tracks['rank'] = combined_tracks['rank'].replace(np.nan, 0)

# convert the rank into binary variable (1 if popular, 0 otherwise)
combined_tracks['billboard_popularity'] = np.where(combined_tracks['rank'] > 0, 1, 0)

# drop the billboard rank, since we don't want it infuencing our prediction
combined_tracks.drop(columns = ['rank'], inplace=True)

# # drop the artist column, since it was only used for joining
# combined_tracks.drop(columns=['artist'], inplace=True)

print("Number of songs in combined data:", len(combined_tracks))

Number of songs in Spotify data: 265669
Number of songs in Billboard data: 166200
Number of songs in combined data: 347124


## Checking and Removing Duplicate Song Entries

In [9]:
# songs and the number of times they appear in the dataframe
song_counts = combined_tracks['song'].value_counts()

# get songs that appear more than once in the dataframe
song_counts_gt_1 = song_counts[song_counts > 1]

# sum of the number of times duplicate songs appear in the dataframe
total_count = song_counts_gt_1.sum()

# number of duplicates in the dataframe
duplicates = (song_counts > 1).sum()

print("Number of songs that have duplicate entries:", duplicates)
print(f"Number of duplicate entries: {total_count - duplicates} out of {len(combined_tracks)}")

combined_tracks = combined_tracks.drop_duplicates(subset=['song'])
print("The total number of combined tracks, after removing duplicates, is", len(combined_tracks)) 

Number of songs that have duplicate entries: 22336
Number of duplicate entries: 150639 out of 347124
The total number of combined tracks, after removing duplicates, is 196485


## Create separate dataset with song names vectorized

In [10]:
import dask.dataframe as dd

# perform count vectorizer (goal is to see if song name has impact on popularity)

count_vect = TfidfVectorizer(binary=False, min_df=150)
#print("init TF idf vectorizer")
name_vectorized = count_vect.fit_transform(combined_tracks['song'])

combined_vectsongs = combined_tracks.copy()

count_vect_df = pd.DataFrame(name_vectorized.todense(), columns = count_vect.get_feature_names_out())

# alternative way of dropping index column
combined_vectsongs.reset_index(drop=True, inplace=True)
print("shape of combined_vectsongs: ")
print(combined_vectsongs.shape)

#count_vect_df = count_vect_df.reset_index().drop('index', axis = 1)
count_vect_df.reset_index(drop=True, inplace=True)
print("shape of count_vect_df: ")
print(count_vect_df.shape)

combined_vectsongs = pd.concat([combined_vectsongs, count_vect_df], axis = 1)

shape of combined_vectsongs: 
(196485, 18)
shape of count_vect_df: 
(196485, 428)


## Creating two datasets, two where Spotify popularity is used as target, and the other two where Billboard popularity is used as target

In [11]:
# create dataset where Spotify popularity is target
combined_spotify = combined_tracks.drop(columns = ['billboard_popularity'])
combined_spotify_vectsongs = combined_vectsongs.drop(columns = ['billboard_popularity'])

print("Number of songs in dataset: ", len(combined_spotify))

# create dataset where Billboard popularity is target
combined_billboard = combined_tracks.drop(columns = ['popularity'])
combined_billboard_vectsongs = combined_vectsongs.drop(columns = ['popularity'])

Number of songs in dataset:  196485


In [12]:
# count # of songs in billboard dataset that are popular
df_billboard_popular = combined_billboard[combined_billboard['billboard_popularity'] == 1]
print("Number of songs in billboard dataset that are popular: ", len(df_billboard_popular))
df_billboard_unpopular = combined_billboard[combined_billboard['billboard_popularity'] == 0]
print("Number of songs in billboard dataset that are unpopular: ", len(df_billboard_unpopular))

# count # of songs in combined_billboard_vectsongs dataset that are popular
df_billboard_popular_vectsongs = combined_billboard_vectsongs[combined_billboard_vectsongs['billboard_popularity'] == 1]
print("Number of songs in combined_billboard_vectsongs dataset that are popular: ", len(df_billboard_popular_vectsongs))
df_billboard_unpopular_vectsongs = combined_billboard_vectsongs[combined_billboard_vectsongs['billboard_popularity'] == 0]
print("Number of songs in combined_billboard_vectsongs dataset that are unpopular: ", len(df_billboard_unpopular_vectsongs))

Number of songs in billboard dataset that are popular:  3824
Number of songs in billboard dataset that are unpopular:  192661
Number of songs in combined_billboard_vectsongs dataset that are popular:  3824
Number of songs in combined_billboard_vectsongs dataset that are unpopular:  192661


## Random Undersampling to Balance Billboard class data

In [13]:
# randomly sample 3824 songs from unpopular dataset
df_billboard_unpopular_sample = df_billboard_unpopular.sample(n=3824, random_state=1)

# combine back the unpopular sample and popular dataset, and shuffle the rows
combined_billboard = pd.concat([df_billboard_popular, df_billboard_unpopular_sample])

# randomly sample 3824 songs from df_billboard_unpopular_vectsongs dataset
df_billboard_unpopular_sample_vectsongs = df_billboard_unpopular_vectsongs.sample(n=3824, random_state=1)

# combine back the unpopular sample and popular dataset, and shuffle the rows
combined_billboard_vectsongs = pd.concat([df_billboard_popular_vectsongs, df_billboard_unpopular_sample_vectsongs])

### Dropping song and artist names

In [14]:
# For fetching lyrics later, we will need artist and song name
genius_tracks = combined_billboard.copy()
genius_tracks_vectorized = combined_billboard_vectsongs.copy()

combined_billboard.drop(columns = ['song'], inplace=True)
combined_billboard.drop(columns=['artist'], inplace=True)

combined_billboard_vectsongs.drop(columns = ['song'], inplace=True)
combined_billboard_vectsongs.drop(columns=['artist'], inplace=True)

combined_spotify.drop(columns = ['song'], inplace=True)
combined_spotify.drop(columns=['artist'], inplace=True)

combined_spotify_vectsongs.drop(columns = ['song'], inplace=True)
combined_spotify_vectsongs.drop(columns=['artist'], inplace=True)

## Baseline Model 1 - Linear Regression with Spotify Popularity without song name

In [15]:
from sklearn.preprocessing import StandardScaler
# standardize split the data into training and test sets

scaler = StandardScaler()

df_train, df_test = train_test_split(combined_spotify, test_size = 0.2)

# create the features and target dataframes
df_train_x = df_train.drop('popularity', axis = 1).to_numpy()
df_train_y = df_train['popularity'].values
df_train_x = scaler.fit_transform(df_train_x)

df_test_x = df_test.drop('popularity', axis = 1).to_numpy()
df_test_y = df_test['popularity'].values
df_test_x = scaler.fit_transform(df_test_x)

# fit the linear regression model
LinReg = LinearRegression()
LinReg.fit(df_train_x, df_train_y)

# get score on test-set
test_score = LinReg.score(df_test_x, df_test_y)

# print the score
print(f"R2 score for test set is {test_score}")

R2 score for test set is 0.14207248645580428


## Baseline Model 2 - Linear Regression with Spotify Popularity with vectorized song name

In [16]:
# split the data into training and test sets
import dask_ml.model_selection as dcv
import dask_ml.linear_model as dlm
import dask_ml.preprocessing as dpp

scaler = StandardScaler()
df_train, df_test = train_test_split(combined_spotify_vectsongs, test_size = 0.2)

# create the features and target dataframes
df_train_x = df_train.drop('popularity', axis = 1).values
df_train_y = df_train['popularity'].values
df_train_x = scaler.fit_transform(df_train_x)

#df_test_x = df_test.drop('popularity', axis = 1).to_numpy()
df_test_x = df_test.drop('popularity', axis = 1).values
df_test_y = df_test['popularity'].values
df_test_x = scaler.fit_transform(df_test_x)

# fit the linear regression model
LinReg = LinearRegression(n_jobs=8)
LinReg.fit(df_train_x, df_train_y)

# get score on test-set
test_score = LinReg.score(df_test_x, df_test_y)

# print the score
print(f"R2 score for test set is {test_score}")

R2 score for test set is 0.25745288210173556


## Baseline Model 3 - Logistic Regression with Billboard Popularity without song name

In [17]:
# split the data into training and test sets

scaler = StandardScaler()

df_train, df_test = train_test_split(combined_billboard, test_size = 0.2)

# create the features and target dataframes
df_train_x = df_train.drop('billboard_popularity', axis = 1).to_numpy()
df_train_y = df_train['billboard_popularity'].values
df_train_x = scaler.fit_transform(df_train_x)

df_test_x = df_test.drop('billboard_popularity', axis = 1).to_numpy()
df_test_y = df_test['billboard_popularity'].values
df_test_x = scaler.fit_transform(df_test_x)

# fit the logistic regression model with no regularization terms
LogReg = LogisticRegression(multi_class='ovr', penalty='none', max_iter = 10000)
LogReg.fit(df_train_x, df_train_y)

# calculate F1 score
f1_train = f1_score(df_train_y, LogReg.predict(df_train_x), average = 'micro')
f1_test = f1_score(df_test_y, LogReg.predict(df_test_x), average = 'micro')

# print F1 values out
print(f"Training set with no regularization terms F1-Score is {f1_train}")
print(f"Test set with no regularization terms F1-Score is {f1_test}")

Training set with no regularization terms F1-Score is 0.6305982347172279
Test set with no regularization terms F1-Score is 0.6496732026143791




## Baseline Model 4 - Logistic Regression with Billboard Popularity with vectorized song name

In [18]:
# split the data into training and test sets

scaler = StandardScaler()

df_train, df_test = train_test_split(combined_billboard_vectsongs, test_size = 0.2)

# create the features and target dataframes
df_train_x = df_train.drop('billboard_popularity', axis = 1).to_numpy()
df_train_y = df_train['billboard_popularity'].values
df_train_x = scaler.fit_transform(df_train_x)

df_test_x = df_test.drop('billboard_popularity', axis = 1).to_numpy()
df_test_y = df_test['billboard_popularity'].values
df_test_x = scaler.fit_transform(df_test_x)

# fit the logistic regression model with no regularization terms
LogReg = LogisticRegression(multi_class='ovr', penalty='none', max_iter = 10000)
LogReg.fit(df_train_x, df_train_y)

# calculate F1 score
f1_train = f1_score(df_train_y, LogReg.predict(df_train_x), average = 'micro')
f1_test = f1_score(df_test_y, LogReg.predict(df_test_x), average = 'micro')

# print F1 values out
print(f"Training set with no regularization terms F1-Score is {f1_train}")
print(f"Test set with no regularization terms F1-Score is {f1_test}")



Training set with no regularization terms F1-Score is 0.7911082052958482
Test set with no regularization terms F1-Score is 0.7490196078431373


# Fetch Lyrics for Billboard Data

In [19]:
!pip install lyricsgenius

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lyricsgenius
  Downloading lyricsgenius-3.0.1-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lyricsgenius
Successfully installed lyricsgenius-3.0.1


In [20]:
from lyricsgenius import Genius
import re
import nltk 
from nltk import pos_tag
#nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

client_id = 'BMU2d7PVnIvEnVFKQiMlwJFcSffuHq2epuHYfqTstC7CiqLqSC42KMDTEVTPnRsY'
client_secret ='6Sh1I8isQGva9bIDg8qDnCSEnHbfQy71SurcywPLj2s8R9S1W8MdpprpYdqSPR2AO8c79njZlhSl9RHnxB_5vw'
token = 'sL8fGLHNCUnND1eoeMd6t3-gLWvSet8sU-zoBfauaIPuOXlbKuZrdX2hJiXGFWjD'

def get_lyrics(artist, song_title):
  genius = Genius(token)
  genius.remove_section_headers = True
  genius.verbose = True

  try:
      lyrics = genius.search_song(song_title, artist).lyrics
  except:
      lyrics=''
  
  lyrics = clean_lyrics(lyrics)
  return lyrics

#Mapping POS tag to first character lemmatize() accepts
def get_wordnet_pos(word):
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}

  return tag_dict.get(tag, wordnet.NOUN)

#Function to Lemmatize every word and remove stopwords 
def lemma(text):
  # Setting stopwords
  stop = set(stopwords.words("english"))

  #Initializing Lemmatizer
  lemmatizer = WordNetLemmatizer()

  text = [lemmatizer.lemmatize(x, get_wordnet_pos(x)) for x in nltk.word_tokenize(text)]
  text = [x for x in text if x not in stop]
  return ' '.join(text)

def clean_lyrics(lyrics):
  lyrics = " ".join(re.findall("[a-zA-Z]+", lyrics))
  lyrics = lyrics.lower()

  #lyrics = lemma(lyrics)

  return lyrics

In [21]:
clean_df = genius_tracks.copy()
print('before', clean_df.head())

clean_df['name'] = clean_df['song'].str.encode('ascii', 'ignore').str.decode('ascii')

# remove empty rows
clean_df = clean_df[clean_df["name"] != " "]
clean_df = clean_df[clean_df["name"] != ""]

print('after', clean_df.head())

# clean_df['release_date']= pd.to_datetime(clean_df['release_date'])
# clean_df = clean_df[clean_df['release_date'].dt.year >= 2019]

clean_df = clean_df[['name','artist']]
print('final', clean_df.head())

before                                    song           artist  duration_ms  \
383                      wind of change        scorpions     5.206000   
411                    send me an angel        scorpions     4.555550   
430                             hold on  wilson phillips     4.447783   
455  all i wanna do is make love to you            heart     5.181550   
480                          moneytalks             acdc     3.765783   

     explicit  danceability  energy  key  mode  speechiness  instrumentalness  \
383         0         0.452   0.524    0     1       0.0351          0.000000   
411         0         0.153   0.417    1     1       0.0318          0.000622   
430         0         0.679   0.657    5     1       0.0255          0.000000   
455         0         0.626   0.746    7     1       0.0282          0.000000   
480         0         0.649   0.903    7     1       0.0318          0.000351   

     liveness  valence    tempo  time_signature  month  year  \
383

In [28]:
test = clean_df.tail(20)

lyrics_list = []
for index, row in test.iterrows():
    res = row['artist'].strip('][').strip('\'').split(', ')
    print(row['name'])
    art = res[0].replace('\'', '')
    try:
        lyrics_list.append(get_lyrics(art, row['name']))
    except:
        lyrics_list.append("")

test['lyrics']=lyrics_list

raise the dead bonus track
Searching for "raise the dead bonus track" by michael stanley...
listen to your heart edmes unplugged vocal
Searching for "listen to your heart edmes unplugged vocal" by dht edmee...
powder snow 
Searching for "powder snow " by j soul brothers iii...
kepala yang berdarah
Searching for "kepala yang berdarah" by victor hutabarat...
a tear fell
Searching for "a tear fell" by teresa brewer...
rochy
Searching for "rochy" by juan bautista...
visions of a sunset
Searching for "visions of a sunset" by shawn stockman...
la colorada
Searching for "la colorada" by pibes chorros...
lensin matalalla 2
Searching for "lensin matalalla 2" by eppu normaali...
jeune frre musulman
Searching for "jeune frre musulman" by le silence des mosques...
io ho te
Searching for "io ho te" by audio 2...
mary jane remix
Searching for "mary jane remix" by burry soprano...
kinder brauchen trume
Searching for "kinder brauchen trume" by simone sommerland karsten glck die kitafrsche...
zied ieva

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['lyrics']=lyrics_list


In [29]:
test.head(10)

Unnamed: 0,name,artist,lyrics
274159,raise the dead bonus track,michael stanley,
265203,listen to your heart edmes unplugged vocal,dht edmee,
327205,powder snow,j soul brothers iii,
256121,kepala yang berdarah,victor hutabarat,
55372,a tear fell,teresa brewer,
100365,rochy,juan bautista,
313133,visions of a sunset,shawn stockman,
129357,la colorada,pibes chorros,
84644,lensin matalalla 2,eppu normaali,
263988,jeune frre musulman,le silence des mosques,


## Above Does not Work, Other Attempts:

### Use only Genius' Developer API methods ([context](https://github.com/johnwmillr/LyricsGenius/issues/190)); no direct way to fetch song lyrics



In [23]:
new_token = 'nLXf9LTNP4JLe0arE5LcI2gVnD9gBB8xhswa-PHVLKvE7WCmHaIhBC6VgKpNCak3'
genius = Genius(new_token, verbose=True)

songs = genius.search_songs("Eminem Rap God")['hits']

In [24]:
for song in songs:
    if song['result']['title'] == "Rap God":
        song_id = song['result']['id']
song = genius.song(song_id)

In [25]:
song['song']

{'annotation_count': 111,
 'api_path': '/songs/235729',
 'apple_music_id': '1440863086',
 'apple_music_player_url': 'https://genius.com/songs/235729/apple_music_player',
 'artist_names': 'Eminem',
 'description': {'plain': '“Rap God” is Eminem’s braggadocious ode to himself and his career. Over its six-minute run-time, he references comic books, throws back to his old songs, and raps crazily fast.\n\nIn a Q&A with Rolling Stone, Em was asked whether he felt like a Rap God or an underdog, to which he responded:\n\nI think everything switches back and forth from hour to hour, day by day with me. That whole ‘Rap God’ record pretty much from top to bottom is tongue in cheek. So I mean, do I want to feel like that? Maybe sometimes. Again, it goes back to everybody who competitively raps and does this for just purely for the sport of it wants to be the best. Again, that’s why Kendrick’s verse worked so well because he only said what every rapper’s already thinking, If you don’t want to be th

### Use AZLyrics API (not official). [Github Repo](https://github.com/elmoiv/azapi). Does not work either.

In [26]:
!pip install git+https://github.com/elmoiv/azapi.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/elmoiv/azapi.git
  Cloning https://github.com/elmoiv/azapi.git to /tmp/pip-req-build-23qnzrq5
  Running command git clone --filter=blob:none --quiet https://github.com/elmoiv/azapi.git /tmp/pip-req-build-23qnzrq5
  Resolved https://github.com/elmoiv/azapi.git to commit ef28e5a06d23690cef6f3c72f043f1ea55bf7858
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bs4 (from azapi==3.0.6)
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: azapi, bs4
  Building wheel for azapi (setup.py) ... [?25l[?25hdone
  Created wheel for azapi: filename=azapi-3.0.6-py3-none-any.whl size=20568 sha256=0750df89eddf6e4489f552a4d61cde6cb9a4b6b4fbce288c92d263caa85f83f6
  Stored in directory: /tmp/pip-ephem-wheel-cache-48x3g5qa/wheels/2c/41/c9/3382e80754ba048eadfb29ac71e59aad2d6cf07675

In [27]:
import azapi

API = azapi.AZlyrics('google', accuracy=0.5)

API.artist = 'Tylor Swft'
API.title = 'Bad Blods'

API.getLyrics(save=True, ext='lrc')

print(API.lyrics)

# Correct Artist and Title are updated from webpage
print(API.title, API.artist)

Google found nothing!

Bad Blods Tylor Swft
