In [3]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from IPython.display import clear_output
from difflib import SequenceMatcher

### Almendra, Pescado Rabioso, Invisible, Spinetta Jade, Spinetta y los Socios del Desierto, Luis Alberto Spinetta
https://es.wikipedia.org/wiki/Anexo:Discografía_de_Luis_Alberto_Spinetta

 ### Spotify!

First, connect to the API with your credentials.

In [2]:
from config import SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID,
                                                      client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

Let's get all his albums first.

In [3]:
artist_ids = ['3FjdJbt6Myq32uv7P4owM1'] # We start with the id for Invisible already inside... not so easy to guess the search
albums_ids = {}
artist_names = ['Almendra', 'Pescado Rabioso', 'Spinetta Jade', 'Spinetta y los Socios del Desierto', 'Luis Alberto Spinetta', 'Spinetta, García, Ferrón']
for an_artist_name in artist_names:
    an_artist_id = sp.search(q='artist:' + an_artist_name, type='artist')['artists']['items'][0]['id']
    artist_ids.append(an_artist_id)
for an_artist_id in artist_ids:
    some_albums_ids = [(x['id'], x['name']) for x in sp.artist_albums(an_artist_id, album_type='album', limit=50)['items']]
    albums_ids.update({x[1]: x[0] for x in some_albums_ids})
    

In [4]:
albums_to_disregard = ['30 Años de Almendra', 'Almendra', 'Almendra - RCA Victor 100 Años', 
                       'Antologia Almendra', 'Argentina Sorgo Films (Live At Obras / 2001)', 
                       'Elija Y Gane', 'El Album', 'Exactas (En Vivo)', 'Inolvidables RCA - 20 Grandes Exitos', 
                       'Lo Mejor de Pescado Rabioso', 'Obras Cumbres', 'San Cristóforo', 
                       'Spinetta y Las Bandas Eternas', 'Sí o Sí - Diario del Rock Argentino - Spinetta', 
                       'Tester de Violencia', 'Festival Encuentro 1977 (En Vivo)', 'Only Love Can Sustain']
for k in albums_to_disregard:
    if k in albums_ids:
        albums_ids.pop(k, None)

In [5]:
albums_ids.keys()

dict_keys(['El Jardin de los Presentes', 'Durazno Sangrando', 'Invisible', 'El 1er Álbum Más los Singles', 'Almendra 2', 'Artaud', 'Pescado Rabioso 2', 'Desatormentándonos', 'Bajo Belgrano', 'Los Niños Que Escriben En El Cielo', 'Madre En Años Luz', 'Alma de Diamante', 'Los Ojos', 'Spinetta y los Socios del Desierto', 'Un Mañana', 'La La La', 'Pan', 'Para Los Arboles', 'Silver Sorgo', 'Fuego Gris', 'Peluson Of Milk', 'Don Lucero', 'Téster de Violencia', 'Prive', 'Mondo Di Cromo', 'Kamikaze', "A 18' Del Sol", 'Spinettalandia y Sus Amigos', 'Spinetta Los Amigo'])

__Missing "El valle interior" (1980) of Almendra__

In [6]:
spotify_raw_df = pd.DataFrame()
for an_album_title, an_album_id in albums_ids.items():
    a_dataframe = pd.DataFrame(sp.album_tracks(an_album_id)['items'])
    a_dataframe['album'] = an_album_title
    if not spotify_raw_df.shape[0]:
        spotify_raw_df = pd.DataFrame(a_dataframe)
    else:
        spotify_raw_df = pd.concat([spotify_raw_df, a_dataframe])

Deleting useless features and reseting index

In [7]:
del spotify_raw_df['available_markets']
del spotify_raw_df['explicit']
del spotify_raw_df['external_urls']
del spotify_raw_df['href']
del spotify_raw_df['is_local']
del spotify_raw_df['preview_url']
del spotify_raw_df['type']
del spotify_raw_df['id']
del spotify_raw_df['uri']

spotify_raw_df = spotify_raw_df.reset_index(drop=True)

for index, row in spotify_raw_df.iterrows():
    spotify_raw_df.at[index, 'artist'] = row['artists'][0]['name'] 
    
del spotify_raw_df['artists']
spotify_raw_df['author'] = '' # Add author for future filling
spotify_raw_df.sample(10)

Unnamed: 0,disc_number,duration_ms,name,track_number,album,artist,author
200,1,143533,Cuando El Arte Ataque,14,La La La,Fito Paez,
253,1,279289,Dedos De Mimbre,11,Fuego Gris,Luis Alberto Spinetta,
136,1,395626,Perdido En Ti,9,Los Ojos,Spinetta Y Los Socios Del Desierto,
21,1,178146,Hoy Todo El Hielo En La Ciudad,3,El 1er Álbum Más los Singles,Almendra,
210,1,201773,Proserpina,4,Pan,Luis Alberto Spinetta,
193,1,83066,Solo La La La,7,La La La,Fito Paez,
104,1,269815,El Hombre Dirigente,3,Los Niños Que Escriben En El Cielo,Spinetta Jade,
149,1,175000,Diana,8,Spinetta y los Socios del Desierto,Spinetta Y Los Socios Del Desierto,
244,1,219960,Yo No Puedo Dar Sombra,2,Fuego Gris,Luis Alberto Spinetta,
113,1,313200,Entonces Es Como Dar Amor,2,Madre En Años Luz,Spinetta Jade,


Let's manually complete "El valle interior"

In [8]:
columns = ['disc_number', 'duration_ms', 'name', 'track_number', 'album', 'artist', 'author']
songs = [[1, (6*60+35)*1000, 'Las cosas para hacer', 1, "El valle interior", "Almendra", "Emilio del Guercio"],
        [1, (3*60+24)*1000, 'Amidama', 2, "El valle interior", "Almendra", "Luis Alberto Spinetta"],
        [1, (8*60+43)*1000, 'Miguelito, mi espíritu ha partido a tiempo', 3, "El valle interior", "Almendra", "Luis Alberto Spinetta"],
        [1, (2*60+3)*1000, 'Espejada', 4, "El valle interior", "Almendra", "Luis Alberto Spinetta"],
        [1, (4*60+40)*1000, 'Cielo fuerte (amor guaraní)', 5, "El valle interior", "Almendra", "Emilio del Guercio"],
        [1, (6*60+42)*1000, 'El fantasma de la buena suerte', 6, "El valle interior", "Almendra", "Luis Alberto Spinetta"],
        [1, (4*60+41)*1000, 'Buen día, día de sol', 7, "El valle interior", "Almendra", "Luis Alberto Spinetta"]]
df_to_appends = pd.DataFrame(songs, columns=columns)
raw_df = spotify_raw_df.append(df_to_appends, ignore_index=True)
raw_df.tail()

Unnamed: 0,disc_number,duration_ms,name,track_number,album,artist,author
358,1,523000,"Miguelito, mi espíritu ha partido a tiempo",3,El valle interior,Almendra,Luis Alberto Spinetta
359,1,123000,Espejada,4,El valle interior,Almendra,Luis Alberto Spinetta
360,1,280000,Cielo fuerte (amor guaraní),5,El valle interior,Almendra,Emilio del Guercio
361,1,402000,El fantasma de la buena suerte,6,El valle interior,Almendra,Luis Alberto Spinetta
362,1,281000,"Buen día, día de sol",7,El valle interior,Almendra,Luis Alberto Spinetta


Let's change the artist of _La, La, La_ and _Los Amigo_

In [9]:
raw_df.at[(raw_df['artist'] == 'Fito Paez') |  (raw_df['artist'] == 'Spinetta, García, Ferrón'), 'artist'] = 'Luis Alberto Spinetta'


### Genius!

In [11]:
def _get(path, params=None, headers=None):
    from config import GENIUS_BASE_URI, GENIUS_CLIENT_ACCESS_TOKEN
    url = '/'.join([GENIUS_BASE_URI, path])
    token = "Bearer {}".format(GENIUS_CLIENT_ACCESS_TOKEN)

    if headers:
        headers['Authorization'] = token
    else:
        headers = {"Authorization": token}
    response = requests.get(url=url, params=params, headers=headers)
    response.raise_for_status()

    return response.json()

In [12]:
# find artist ids from given data.
artist_names = ['Almendra',
 'Pescado Rabioso',
 'Spinetta Jade',
 'Spinetta y los Socios del Desierto',
 'Luis Alberto Spinetta',
 'Invisible']

for an_artist_name in artist_names:
    find_id = _get("search", {'q': an_artist_name})
    for hit in find_id["response"]["hits"]:
        if hit["result"]["primary_artist"]["name"].lower() == an_artist_name.lower():
            artist_id = hit["result"]["primary_artist"]["id"]
            break

    print("-> " + an_artist_name + "'s id is " + str(artist_id) + "\n")

-> Almendra's id is 344735

-> Pescado Rabioso's id is 357615

-> Spinetta Jade's id is 355287

-> Spinetta y los Socios del Desierto's id is 369509

-> Luis Alberto Spinetta's id is 344478

-> Invisible's id is 344478



Invisible is always problems.

In [13]:
_get("search", {'q': 'Durazno Sangrando'})["response"]["hits"][0]["result"]["primary_artist"]

{'api_path': '/artists/369283',
 'header_image_url': 'https://images.genius.com/2d07534421dc542d2a4e7a67a39ee5ac.409x409x1.jpg',
 'id': 369283,
 'image_url': 'https://images.genius.com/2d07534421dc542d2a4e7a67a39ee5ac.409x409x1.jpg',
 'is_meme_verified': False,
 'is_verified': False,
 'name': 'Invisible',
 'url': 'https://genius.com/artists/Invisible'}

Great, it's artist id is 369283

In [14]:
artist_ids = {
    'Almendra': 344735,
    'Pescado Rabioso': 357615,
    'Spinetta Jade': 355287,
    'Spinetta y los Socios del Desierto': 369509,
    'Luis Alberto Spinetta': 344478,
    'Invisible': 369283
}

Let's get those songs

In [15]:
def get_artist_songs(artist_id):

    current_page = 1
    next_page = True
    songs = []

    while next_page:
        path = "artists/{}/songs/".format(artist_id)
        params = {'page': current_page}
        data = _get(path=path, params=params)
        page_songs = data['response']['songs']
        if page_songs:
            songs += page_songs
            current_page += 1
        else:
            next_page = False

    return songs

In [16]:
songs = []
for an_artist_id in artist_ids.values():
    songs.extend([song for song in get_artist_songs(an_artist_id) if song['primary_artist']['id'] == an_artist_id])

songs

[{'annotation_count': 5,
  'api_path': '/songs/795987',
  'full_title': 'A Estos Hombres Tristes by\xa0Almendra',
  'header_image_thumbnail_url': 'https://images.genius.com/6311b812d44083d731baf2ecad949856.300x300x1.jpg',
  'header_image_url': 'https://images.genius.com/6311b812d44083d731baf2ecad949856.631x631x1.jpg',
  'id': 795987,
  'lyrics_owner_id': 1549345,
  'lyrics_state': 'complete',
  'path': '/Almendra-a-estos-hombres-tristes-lyrics',
  'primary_artist': {'api_path': '/artists/344735',
   'header_image_url': 'https://assets.genius.com/images/default_avatar_300.png?1541791909',
   'id': 344735,
   'image_url': 'https://assets.genius.com/images/default_avatar_300.png?1541791909',
   'is_meme_verified': False,
   'is_verified': False,
   'name': 'Almendra',
   'url': 'https://genius.com/artists/Almendra'},
  'pyongs_count': None,
  'song_art_image_thumbnail_url': 'https://images.genius.com/6311b812d44083d731baf2ecad949856.300x300x1.jpg',
  'stats': {'hot': False, 'unreviewed_an

In [17]:
songs = [{
    'title': song['title'],
    'url': song['url'],
    'artist': song['primary_artist']['name']
} 
    for song in songs if song['lyrics_state'] == 'complete']

In [18]:
genius_df = pd.DataFrame(songs)
genius_df.sample(10)

Unnamed: 0,artist,title,url
99,Spinetta Jade,Sombras en los álamos,https://genius.com/Spinetta-jade-sombras-en-lo...
178,Luis Alberto Spinetta,El mono tremendo,https://genius.com/Luis-alberto-spinetta-el-mo...
132,Luis Alberto Spinetta,Atado a tu Frontera,https://genius.com/Luis-alberto-spinetta-atado...
243,Luis Alberto Spinetta,Penumbra,https://genius.com/Luis-alberto-spinetta-penum...
210,Luis Alberto Spinetta,La montaña,https://genius.com/Luis-alberto-spinetta-la-mo...
217,Luis Alberto Spinetta,Lejísimo,https://genius.com/Luis-alberto-spinetta-lejis...
158,Luis Alberto Spinetta,Cuenta En El Sol,https://genius.com/Luis-alberto-spinetta-cuent...
187,Luis Alberto Spinetta,Flecha Zen,https://genius.com/Luis-alberto-spinetta-flech...
26,Almendra,Para Ir,https://genius.com/Almendra-para-ir-lyrics
183,Luis Alberto Spinetta,Espuma Mística,https://genius.com/Luis-alberto-spinetta-espum...


In [19]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [20]:
raw_df['lyrics_url'] = ''
for index_s, row_s in raw_df.iterrows():
    for index_g, row_g in genius_df.iterrows():
        if similar(row_g['title'].lower(), row_s['name'].lower()) > 0.7 and row_g['artist'].lower() == row_s['artist'].lower():
            raw_df.at[index_s, 'lyrics_url'] = row_g['url']

In [21]:
len(raw_df[raw_df['lyrics_url'] == ''])    

92

92 empty values, this is gonna hurt later

Let's scrape the ones we have

In [30]:
def scrape_lyrics(url):

    response = requests.get(url)
    html = response.text

    soup = BeautifulSoup(html, 'html.parser')
    lyrics = soup.find("div", {"class": "lyrics"})
    lyrics = cleanhtml(str(lyrics))
    return lyrics

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


for index, row in raw_df.iterrows():
    url = row['lyrics_url']
    lyrics = ''
    if url:
        lyrics = scrape_lyrics(url)
    raw_df.at[index, 'lyrics'] = lyrics

In [37]:
raw_df['lyrics'] = raw_df['lyrics'].str.replace("\n", " ")

In [39]:
# raw_df.to_csv('data/raw_songs.csv', sep=';',index=False)

__Some editing by hand later...__

In [43]:
complete_df = pd.read_csv('data/raw_songs.csv', sep=';')
complete_df.sample(10)

Unnamed: 0,disc_number,duration_ms,name,track_number,album,artist,author,lyrics_url,lyrics
200,1,143533,Cuando El Arte Ataque,14,La La La,Luis Alberto Spinetta,,,Yo se que hoy me desperté para contar\npara c...
55,1,94146,Verde Llano,17,Almendra 2,Almendra,Edelmiro Molinari,,-
81,1,579506,"Hola, Pequeño Ser!",13,Pescado Rabioso 2,Pescado Rabioso,,https://genius.com/Pescado-rabioso-hola-pequen...,Hola pequeño ser Lo que amaba en vos Ya no l...
143,1,320400,Paraíso,2,Spinetta y los Socios del Desierto,Spinetta Y Los Socios Del Desierto,,,"Estás, tan llena de palabras,\ny los gestos so..."
192,1,316573,Dejaste Ver Tu Corazon,6,La La La,Luis Alberto Spinetta,,,Agosto le dice que no\nla luna se cae\ny ella ...
346,1,131000,Alteración de Tiempo,9,Spinettalandia y Sus Amigos,Luis Alberto Spinetta,,,-
122,1,257573,Alma de Diamante,2,Alma de Diamante,Spinetta Jade,,https://genius.com/Spinetta-jade-alma-de-diama...,Ven a mí Con tu dulce luz Alma de diamante Y...
54,1,155986,Amor de Aire,16,Almendra 2,Almendra,Edelmiro Molinari,,A pesar de que vuelvo\nA mis tierras de inocen...
321,1,178546,Águila de Trueno (Parte I),3,Kamikaze,Luis Alberto Spinetta,,https://genius.com/Luis-alberto-spinetta-aguil...,Mientras dure la mañana Hombre que llora En ...
172,2,235866,El Sol y la Afeitadora Eléctrica,14,Spinetta y los Socios del Desierto,Spinetta Y Los Socios Del Desierto,,,"Tu concierto está en aire,\ny el adiós,\nun in..."


In [51]:
complete_df['author'] = complete_df['author'].fillna('Luis Alberto Spinetta')

In [53]:
# complete_df.to_csv('data/raw_songs_v2.csv', sep=';',index=False)

In [9]:
# complete_df = pd.read_csv('data/raw_songs_v2.csv', sep=';')

Cleaning!

In [10]:
# Deleting html tags trash
complete_df.lyrics = complete_df.lyrics.str.replace(re.compile("<[^<>]*>"), "")

# Deleting []
complete_df.lyrics = complete_df.lyrics.str.replace(re.compile("\[[^\[\]]*\]"), "")

# Instrumental
complete_df.lyrics = complete_df.lyrics.str.replace("Instrumental", "")

# New lines
# complete_df.lyrics = complete_df.lyrics.str.replace("\n", "")


Last manual cleaning

In [11]:
# complete_df.to_csv('data/raw_songs_3.csv', sep=';',index=False)

In [20]:
df = pd.read_csv('data/complete_spinetta.csv', sep=';')

In [21]:
df.sample(10)

Unnamed: 0,disc_number,duration_ms,name,track_number,album,artist,author,lyrics_url,lyrics
323,1,158866,Almendra,5,Kamikaze,Luis Alberto Spinetta,Luis Alberto Spinetta,https://genius.com/Luis-alberto-spinetta-almen...,(instrumental) Guitarra electrica (Luis Alb...
131,1,395066,Extiéndete Una Vez Más,4,Los Ojos,Spinetta Y Los Socios Del Desierto,Luis Alberto Spinetta,,"Ya no sé donde fue,\ncon su luz tu verdad,\nav..."
272,1,263293,Ella Bailo (Love Of My Life),13,Peluson Of Milk,Luis Alberto Spinetta,Luis Alberto Spinetta,https://genius.com/Luis-alberto-spinetta-ella-...,Ella bailó Hasta perder su piel bailó No sé ...
289,1,151106,El Mono Tremendo,6,Téster de Violencia,Luis Alberto Spinetta,Pechugo,https://genius.com/Luis-alberto-spinetta-el-mo...,Hace mucho tiempo Había un maquinista de loc...
158,1,264306,La Orilla Infinita,17,Spinetta y los Socios del Desierto,Spinetta Y Los Socios Del Desierto,Luis Alberto Spinetta,,"Una vez en la orilla infinita,\nel mar bramó,\..."
102,1,209250,Moviola,1,Los Niños Que Escriben En El Cielo,Spinetta Jade,Luis Alberto Spinetta,https://genius.com/Spinetta-jade-moviola-lyrics,Alguien vio a este anciano Sólo aquí en el d...
56,1,89866,Leves Instrucciones,18,Almendra 2,Almendra,Luis Alberto Spinetta,,"Deberás,\nave salón de turno\nvenir a mí...\n\..."
165,2,273266,Jazmín,7,Spinetta y los Socios del Desierto,Spinetta Y Los Socios Del Desierto,Luis Alberto Spinetta,https://genius.com/Spinetta-y-los-socios-del-d...,Caído de un amor Nunca encontrarás Luces don...
321,1,178546,Águila de Trueno (Parte I),3,Kamikaze,Luis Alberto Spinetta,Luis Alberto Spinetta,https://genius.com/Luis-alberto-spinetta-aguil...,Mientras dure la mañana Hombre que llora En ...
55,1,94146,Verde Llano,17,Almendra 2,Almendra,Edelmiro Molinari,,-


In [22]:
df.lyrics = df.lyrics.str.replace(re.compile(r"(\s)([A-Z])"), r"\1\n\2")

with open("data/spinetta.txt", "w") as f: 
    for a_lyrics in df.lyrics.values:
        f.write(a_lyrics + "\n\n\n") 