In [None]:
import numpy as np
import pandas as pd
from pandas import json_normalize
import seaborn as sns
import matplotlib.pyplot as plt

import requests
import time
import ast
import random

# Spotipy
from dotenv import load_dotenv
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import warnings
warnings.filterwarnings("ignore")

from my_functions import *

In [196]:
# Load every dataset used in this notebook.
# keep_default_na=False on the two ratings files stops pandas from parsing any string as NaN.
df_uk_masters = pd.read_csv('Datasets/df_uk_masters.csv')                         # all the albums from the UK
df_us_masters = pd.read_csv('Datasets/df_us_masters.csv')                         # albums from the US until 1996, 1998 and 2000
df_us_new_masters = pd.read_csv('Datasets/df_us_new_masters.csv')                 # albums from the US from 1997, 1999 and 2001
df_ratings_20 = pd.read_csv('Datasets/df_ratings_20.csv', keep_default_na=False)  # albums with >= 20 votes, mostly from rock, worldwide
df_ratings_10 = pd.read_csv('Datasets/df_ratings_10.csv', keep_default_na=False)  # albums with >= 10 votes, mostly from rock, worldwide
df_masters_blended = pd.read_csv('Datasets/df_masters_blended.csv')               # albums from the UK and US (until 2000) with >= 20 votes

# Report "<shape>: <name>" for each frame, one per line, in loading order.
print('\n'.join(
    f'{frame.shape}: {label}'
    for label, frame in {
        'df_uk_masters': df_uk_masters,
        'df_us_masters': df_us_masters,
        'df_us_new_masters': df_us_new_masters,
        'df_ratings_20': df_ratings_20,
        'df_ratings_10': df_ratings_10,
        'df_masters_blended': df_masters_blended,
    }.items()
))

(56660, 13): df_uk_masters
(48690, 13): df_us_masters
(37597, 13): df_us_new_masters
(51222, 5): df_ratings_20
(79625, 5): df_ratings_10
(11429, 13): df_masters_blended


# **``album_length``**

In [640]:
# Re-read the blended dataset (albums from the UK and US, until 2000, with >= 20 votes).
# `df` is a second name for the same DataFrame object, not a copy.
df = df_masters_blended = pd.read_csv('Datasets/df_masters_blended.csv')
df.shape

(9667, 13)

In [641]:
# A value of 0 in album_length is treated as "missing" in this notebook.
missing_mask = df['album_length'] == 0
percentage_album_length_missing = round(missing_mask.sum() / len(df) * 100, 2)

print(f"albums missing album_length: {missing_mask.sum()} ")
print(f'% missing album_length: {percentage_album_length_missing}%')

albums missing album_length: 2234 
% missing album_length: 23.11%


In [657]:
# Rows whose album_length is 0, ordered by artist, then year, then title.
df_length_0 = df.loc[df['album_length'].eq(0)].sort_values(by=['artist', 'year', 'title'])
df_length_0.shape

(2234, 13)

## **``Spotipy`` (Spotify API)**

In [643]:
# Load the variables defined in a local .env file into the process environment,
# so that the credentials cell below can read them with os.getenv.
load_dotenv()

True

In [933]:
# Spotify application credentials, read from the environment.
user = os.getenv('client_id')
password = os.getenv('client_secret')

# Build the Spotipy client with the Client Credentials flow; requests_timeout=10 is passed through to Spotipy.
credentials_manager = SpotifyClientCredentials(client_id=user, client_secret=password)
sp = spotipy.Spotify(auth_manager=credentials_manager, requests_timeout=10)

### Testing

In [836]:
# Preview the first ten (artist, title) pairs that still need a length, numbered from 1.
count = 0
for count, (artist, title) in enumerate(df_length_0[['artist', 'title']].head(10).to_numpy(), start=1):
    print(count, artist, title)

1 ...And You Will Know Us by the Trail of Dead Madonna
2 108 Songs of Separation
3 10cc 10cc
4 10cc Sheet Music
5 10cc Deceptive Bends
6 23 Skidoo Seven Songs
7 2:54 2:54
8 7 Seconds The Crew
9 7 Seconds New Wind
10 A Certain Ratio The Graveyard and The Ballroom


In [None]:
artist = 'Agent Orange'

results = sp.search(q = artist
                    , type = 'artist'
                    , limit = 5)

candidates = results['artists']['items']

# The wanted artist is usually, but not necessarily, the first hit: scan every returned
# candidate for a case-insensitive name match. Using next(..., None) also copes with an
# empty result list, where indexing [0] would raise IndexError.
match = next((item for item in candidates if item['name'].lower() == artist.lower()), None)

if match is not None:
    artist_name = match['name']
    artist_id = match['id']  # get the artist_id
    print(f'artist found: {artist_name}')
else:
    # Do not keep the id of a different artist around when nothing matched.
    artist_name = None
    artist_id = None
    print(f"Cannot find '{artist}', try some other one")

artist found: Agent Orange


In [1025]:
def get_all_albums(artist_id):
    """Collect every release Spotify lists for ``artist_id``.

    Requests the first page with ``sp.artist_albums(artist_id, limit=50)`` and keeps
    calling ``sp.next`` while the current page has a truthy ``'next'`` entry.

    Returns a list with the ``'items'`` of all pages, in the order received.
    Relies on the notebook-level Spotipy client ``sp``.
    """
    collected = []
    page = sp.artist_albums(artist_id, limit=50)

    while page:
        collected += page['items']
        if not page['next']:
            break  # last page reached
        page = sp.next(page)

    return collected

In [None]:
artist = 'Adam and the Ants'
title = 'Dirk Wears White Sox'
# other albums by the same artist:
# Kings of the Wild Frontier
# Prince Charming

results = sp.search(q = artist
                    , type = 'artist'
                    , limit = 5)

artists = results['artists']['items']
artists_ids = artists  # same list; the id is read from each item's 'id' key

# Names are compared lower-cased with 'and' written as '&'.
wanted_artist = artist.lower().replace('and', '&')
wanted_title = title.lower().replace('and', '&')
album_found = False

for candidate in artists:
    # look for my artist
    if candidate['name'].lower() != wanted_artist:
        continue

    # if it finds my artist:
    artist_id = candidate['id']
    print('artist found')
    print(artist)

    try:
        artist_albums = get_all_albums(artist_id)

        for album in artist_albums:

            print(album['name'])

            # look for my album (exact match once 'and' is normalised to '&')
            if album['name'].lower().replace('and', '&') == wanted_title:
                print('album found')
                album_id = album['id']
                results = sp.album(album_id)
                tracks = results['tracks']['items']
                song_durations = [song['duration_ms']/60000 for song in tracks]

                album_length = round(sum(song_durations), 2)
                album_found = True

                break   # once it finds the album, stop
    except Exception as exc:
        # Report what went wrong instead of a bare 'error'; catching Exception (not a bare
        # except) also lets a keyboard interrupt stop the cell.
        print(f'error: {exc!r}')

    if album_found:
        break   # no need to inspect the remaining artist candidates

artist found
Adam and the Ants
Kings of the Wild Frontier (Deluxe Edition)
Dirk Wears White Sox (Remastered)
Prince Charming (Remastered)
B-Side Babies
Young Parisians / Lady
Young Parisians
Lady
80s 100 Hits
The Best Of The 80's
80s 100 Hits - Volume 2
80s: 100 Remixes
On Your 80's Radio
Pure... Alternative 80s
The Definitive 80's (eighties)
Ultimate 80's Number 1's
Pure... 80's Dance Party
Essential - One For The Lads
The Very Best Of
The Supreme Record Company
Hits of the 80s
Heroic Hits
Rock Classics
Haynes Ultimate Guide to 80s
Party On!
The Essential 80s Rock
Dig'Hits All 80'S
The Best Year Of My Life: 1981G010004775674D
Go Boys Go
Ultimate No 2 Hits Of The 80's
The Best Year Of My Life: 1980
Haynes Ultimate Guide to Classic Anthems


Here we can see that the album is in the artist's discography, but not under exactly the same name: the exact-match comparison fails because Spotify adds extra text to the title, such as **'Deluxe Edition'** or **'Remastered'**.

In [1078]:
# Number of releases collected for the matched artist (all pages combined).
len(artist_albums)

31

In [1082]:
# Name of the first release in the collected list.
artist_albums[0]['name']

'Kings of the Wild Frontier (Deluxe Edition)'

In [952]:
# Spotify ID currently held in artist_id (set by whichever lookup cell ran last).
artist_id

'2jK54ZlZhTF1TxygsVeR05'

### **define the function**

In [973]:
def get_album_length(df, pause=2):
    """Look up running time and track count on Spotify for every album in ``df``.

    This is the strict, first version of the lookup: the Spotify artist name must equal
    ``artist`` exactly (case-sensitive), the album name must equal ``title`` ignoring
    case, and only the first page (``limit=50``) of each artist's releases is inspected.
    Relies on the notebook-level Spotipy client ``sp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``artist`` and ``title``.
    pause : float, default 2
        Seconds to sleep before each row's requests.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with columns ``artist``, ``title``,
        ``album_length`` (sum of track durations in minutes, rounded to 2 decimals) and
        ``tracks`` (number of track items returned by ``sp.album``). ``album_length`` and
        ``tracks`` are NaN when the album was not found or the lookup raised.
    """
    artists_list = []
    albums_list = []
    albums_lengths = []
    tracks_list = []
    scraped = 0

    for count, (artist, title) in enumerate(df[['artist', 'title']].values, start=1):
        time.sleep(pause)

        # Start from "not found"; the values are only replaced after a complete lookup,
        # so exactly one entry per row is appended to every list below.
        album_length = np.nan
        n_tracks = np.nan

        try:
            results = sp.search(q=artist, type='artist', limit=5)

            for candidate in results['artists']['items']:
                # look for my artist
                if candidate['name'] != artist:
                    continue

                # get the albums of the artist and look for my album
                releases = sp.artist_albums(candidate['id'], limit=50)['items']
                match = next((release for release in releases
                              if release['name'].lower() == title.lower()), None)
                if match is None:
                    continue  # same artist name but no such album: try the next candidate

                tracks = sp.album(match['id'])['tracks']['items']
                minutes = round(sum(song['duration_ms'] / 60000 for song in tracks), 2)

                album_length, n_tracks = minutes, len(tracks)
                scraped += 1
                break  # once it finds the album, stop
        except Exception as exc:
            # A failed request or an unexpected payload only costs this row:
            # it stays NaN and the rows scraped so far are kept.
            print(f'error: {exc!r}')

        artists_list.append(artist)
        albums_list.append(title)
        albums_lengths.append(album_length)
        tracks_list.append(n_tracks)

        print(f"{scraped}/{count}: {artist} - {title}")

    df_lengths_missing = pd.DataFrame({'artist': artists_list,
                                       'title': albums_list,
                                       'album_length': albums_lengths,
                                       'tracks': tracks_list})
    return df_lengths_missing

In [960]:
def get_all_albums(artist_id):
    """Return all releases of ``artist_id`` by walking Spotify's paginated listing.

    Note: this cell repeats the definition of the same helper found earlier in the
    notebook; whichever cell is executed last is the one in effect.

    The first page comes from ``sp.artist_albums(artist_id, limit=50)``; further pages
    are fetched with ``sp.next`` for as long as the current page has a truthy ``'next'``.
    Uses the notebook-level Spotipy client ``sp``.
    """
    albums = []
    results = sp.artist_albums(artist_id, limit=50)

    while results:
        albums.extend(results['items'])
        # No 'next' link means this was the last page: clearing `results` ends the loop.
        results = sp.next(results) if results['next'] else None

    return albums

In [1090]:
# Check that a substring test matches where an exact comparison does not:
# here `album` is the plain name and `title` is the decorated name printed in the album listing above.
album = 'Dirk Wears White Sox'

title = 'Dirk Wears White Sox (Remastered)'

album in title 

True

In [None]:
def _and_to_ampersand(text):
    """Lower-case ``text`` and replace the standalone word 'and' with '&'."""
    import re  # local import: `re` is not among the notebook's top-level imports

    return re.sub(r'\band\b', '&', text.lower())


def _name_variants(text):
    """Return the non-empty lower-cased spellings of ``text`` used for name comparison."""
    return {variant for variant in (text.lower(), _and_to_ampersand(text)) if variant}


def _pick_album(albums, title_variants):
    """Return the best-matching album dict: an exact name match if there is one,
    otherwise the first album whose name contains a variant; None if nothing matches."""
    named = [(album['name'].lower(), album) for album in albums]
    for name, album in named:
        if name in title_variants:
            return album
    for name, album in named:
        if any(variant in name for variant in title_variants):
            return album
    return None


def _album_minutes_and_tracks(album_id):
    """Return (total minutes rounded to 2 decimals, number of tracks) for ``album_id``,
    following the track listing's 'next' links so long albums are counted in full."""
    page = sp.album(album_id)['tracks']
    tracks = []
    while page:
        tracks.extend(page['items'])
        page = sp.next(page) if page.get('next') else None
    minutes = round(sum(song['duration_ms'] / 60000 for song in tracks), 2)
    return minutes, len(tracks)


def get_album_length(df, pause=1):
    """Look up running time and track count on Spotify for every album in ``df``.

    For each row the artist is searched (5 candidates). A candidate is accepted when its
    name equals the artist name ignoring case, with the word 'and' optionally written as
    '&'. Among that artist's releases (``get_all_albums``) an exact title match is
    preferred; otherwise the first release whose name contains the title is used, which
    covers names such as '<title> (Remastered)'.
    Relies on the notebook-level Spotipy client ``sp`` and on ``get_all_albums``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``artist`` and ``title``.
    pause : float, default 1
        Seconds to sleep before each row's requests.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with columns ``artist``, ``title``,
        ``album_length`` (minutes, rounded to 2 decimals) and ``tracks``. The last two
        are NaN when the album was not found or the lookup raised.
    """
    artists_list = []
    albums_list = []
    albums_lengths = []
    tracks_list = []
    scraped = 0

    for count, (artist, title) in enumerate(df[['artist', 'title']].values, start=1):
        time.sleep(pause)

        # Start from "not found"; both values are replaced together only after a complete
        # lookup, so every list receives exactly one entry per row.
        album_length = np.nan
        n_tracks = np.nan

        try:
            artist_variants = _name_variants(artist)
            title_variants = _name_variants(title)

            results = sp.search(q=artist, type='artist', limit=5)

            for candidate in results['artists']['items']:
                # look for my artist
                if candidate['name'].lower() not in artist_variants:
                    continue

                match = _pick_album(get_all_albums(candidate['id']), title_variants)
                if match is None:
                    continue  # right name but no such album: try the next candidate

                album_length, n_tracks = _album_minutes_and_tracks(match['id'])
                scraped += 1
                break  # once it finds the album, stop
        except Exception as exc:
            # A failed request or an unexpected payload only costs this row:
            # it stays NaN and the rows scraped so far are kept.
            print(f'error: {exc!r}')

        artists_list.append(artist)
        albums_list.append(title)
        albums_lengths.append(album_length)
        tracks_list.append(n_tracks)

        print(f"{scraped}/{count}: {artist} - {title}")

    df_lengths_missing = pd.DataFrame({'artist': artists_list,
                                       'title': albums_list,
                                       'album_length': albums_lengths,
                                       'tracks': tracks_list})
    return df_lengths_missing

### **Get the ``album_length`` from Spotipy**

In [None]:
# Scrape the rows in positions [start_index, end_index) of df_length_0, i.e. 100 albums per run.
start_index = 100
end_index = start_index + 100

batch = df_length_0.iloc[start_index:end_index]
df_lengths_missing = get_album_length(batch)
df_lengths_missing.shape

1/1: At the Skylines - The Secrets To Life
1/2: Atomic Rooster - Atomic Roooster
2/3: Atomic Rooster - Death Walks Behind You
3/4: Atomic Rooster - Nice 'N' Greasy
4/5: Atoms for Peace - Amok
5/6: Atreyu - In Our Wake
6/7: Atreyu - The Beautiful Dark of Life
7/8: Atrophy - Socialized Hate
8/9: Atvm - Famine, Putrid and Fucking Endless
9/10: Au Pairs - Playing With A Different Sex
9/11: Aus-Rotten - ...And Now Back To Our Programming
10/12: Autopsy - Severed Survival
11/13: Autopsy - Skull Grinder
12/14: Autopsy - Puncturing the Grotesque
13/15: Avail - Dixie
14/16: Avail - 4am Friday
15/17: Avail - Over the James
16/18: Azure Ray - Hold On Love
17/19: Bachelor - Doomin' Sun
18/20: Bad Brains - Bad Brains
19/21: Bad Brains - Quickness
20/22: Bad Brains - Black Dots
21/23: Bad Omens - Finding God Before God Finds Me
22/24: Bad Omens - The Death of Peace of Mind
23/25: Bad Religion - How Could Hell Be Any Worse?
24/26: Bad Religion - Recipe for Hate
25/27: Badfinger - No Dice
25/28: Badly



In [919]:
# Average seconds spent per looked-up album: total minutes converted to seconds, divided by attempts.
# NOTE(review): 12.25 minutes / 500 attempts look like figures measured by hand on an earlier run — confirm.
minutes = 12.25
attempts = 500

seconds_per_attempt = minutes*60/attempts
seconds_per_attempt 

1.47

In [2]:
# Display the scraped results. The scraping cell above must have run in the current kernel,
# otherwise this name is undefined.
df_lengths_missing

NameError: name 'df_lengths_missing' is not defined

In [1101]:
# Save the scraped batch to CSV without the index column.
# NOTE(review): the file name is fixed, so saving a different batch overwrites this file — confirm that is intended.
df_lengths_missing.to_csv('Datasets/df_lengths_missing_1.csv', index=False)

I don't want to drop these albums: they were actually released, so removing them would mean deleting information. However, they affect the average of ``album_length``, so I will convert their values to nulls instead.

In [None]:
# replace it with a nan
# NOTE(review): df_masters_concat is not created in the visible part of this notebook, and only
# the row labelled 28 is changed here — confirm this is the intended frame and row selection.
df_masters_concat.loc[28, 'album_length'] = np.nan

In [None]:
# Show the first rows of df_masters_concat whose album_length is still 0.
df_masters_concat[df_masters_concat['album_length']==0].head()

Unnamed: 0,artist,album,year,country,album_length,tracks,genres,styles,master_id
6,Hyper On Experience,Keep It In The Family E.P.,1993,UK,0.0,4,['Electronic'],"['Breakbeat', 'Hardcore']",119
8,Mixrace,The Future Is Before Your Eyes,1992,UK,0.0,6,['Electronic'],"['Hardcore', 'Breakbeat']",121
9,Mixrace,Organized Chaos E.P.,1993,UK,0.0,6,['Electronic'],"['Breakbeat', 'Hardcore', 'Jungle']",122
10,Mixrace,The Endless Skies / True Jungle,1994,UK,0.0,4,['Electronic'],"['Drum n Bass', 'Jungle']",123
13,Earth Leakage Trip,Neopolitan EP,1992,UK,0.0,3,['Electronic'],"['Breakbeat', 'Hardcore', 'Techno', 'Bleep']",126


### **Exporting to csv**

In [None]:
# Write df_masters_concat to CSV without the index column.
df_masters_concat.to_csv('Datasets/df_masters.csv', index=False)