In [None]:
%pip install -r requirements.txt

import calendar
import datetime
from os import listdir, makedirs
from os.path import isfile, join

import numpy as np
import pandas as pd
from iso639 import languages
from langdetect import LangDetectException, detect
from lyrics_extractor import SongLyrics

<h2>Preparar dataset</h2>
<ol>
    <li>Concatenar Jsons de StreamingHistory</li>
    <li>Quitar el carácter especial <code>$</code> de los nombres de las canciones</li>
    <li>Separar <code>endTime</code> en dos columnas: <code>time</code> (fecha) y <code>hour</code> (hora)</li>
    <li>Imprimir primeras rows del dataset</li>
</ol>

In [None]:
# Upper bound on the number of rows kept; lower it to try the notebook on a sample.
test_num_reproducciones = 999999999

# Every regular file found in ./raw, whether it is a music history export or not.
raw_files = [f for f in listdir('./raw') if isfile(join('./raw', f))]

# Only the music history exports are analysed. Sorting makes the row order (and
# therefore the truncation below) independent of the order listdir() returns.
music_files = sorted(f for f in raw_files if f.startswith('StreamingHistory_music'))
if len(music_files) == 0:
    # Checked on the filtered list: a ./raw folder holding only unrelated files
    # would otherwise fail later with a KeyError on 'trackName'.
    raise Exception("No hay ficheros StreamingHistory_music para analizar. Asegurate de que se encuentran en el directorio /raw")

# Read every export and concatenate once instead of growing the frame file by file.
streaming_history = pd.concat(
    [pd.read_json(join('./raw', f)) for f in music_files],
    ignore_index=True,
)[:test_num_reproducciones]

# Strip literal '$' characters from track names.
# NOTE(review): presumably because matplotlib treats '$' as a math-text delimiter
# in tick labels -- confirm.
streaming_history['trackName'] = streaming_history['trackName'].str.replace(r'\$', '', regex=True)

# Split endTime at its first space: 'time' receives the part before it, 'hour' the rest.
streaming_history[['time', 'hour']] = streaming_history['endTime'].str.split(' ', n=1, expand=True)

print(f'Registros a analizar: {len(streaming_history)}')
# Caption reused by the summary image; endTime is still a string column here.
fechas_datos = 'Fechas de datos: ' + streaming_history['endTime'].min() + ' y ' + streaming_history['endTime'].max()

streaming_history.head()


<h2>Obtener idioma</h2>
<p>Actualmente se usa langdetect, pero es muy poco fiable.</p>

In [None]:
def returnLenguage(txt):
    """Return the name of the language detected in ``txt``.

    Parameters
    ----------
    txt : str
        Text to classify.

    Returns
    -------
    str
        The ``name`` of the ``iso639`` entry matching the code reported by
        ``langdetect``. ``'none'`` is returned when the text has 5 characters
        or fewer, when detection fails, or when the code has no matching entry.
    """
    if len(txt) <= 5:
        return 'none'

    try:
        code = detect(txt)
    except LangDetectException:
        # Raised when the text offers nothing to detect on (e.g. only digits
        # or punctuation); one such title must not abort the whole pass.
        return 'none'

    try:
        # langdetect can report regional variants such as 'zh-cn', while the
        # iso639 lookup is keyed by the bare two-letter code.
        return languages.get(part1=code.split('-')[0]).name
    except KeyError:
        return 'none'

# Build the "<track>, <artist>" text for every row at once. Missing values become
# empty strings so the concatenation never mixes str with NaN.
detection_text = (
    streaming_history['trackName'].fillna('') + ', ' + streaming_history['artistName'].fillna('')
)

# Detect each distinct text once and reuse the result for all of its plays,
# instead of calling the detector row by row.
language_by_text = {text: returnLenguage(text) for text in detection_text.unique()}
streaming_history['lenguage'] = detection_text.map(language_by_text)


<h2>Obtener día de la semana de la reproducción</h2> 

In [None]:
def returnDiaSemana(txt):
    """Return the weekday name for a date string.

    Parameters
    ----------
    txt : str
        Date formatted as ``YYYY-MM-DD``.

    Returns
    -------
    str
        The matching entry of ``calendar.day_name`` (index 0 is Monday).

    Raises
    ------
    ValueError
        If ``txt`` does not match the ``%Y-%m-%d`` format.
    """
    my_date = datetime.datetime.strptime(txt, "%Y-%m-%d").date()
    return calendar.day_name[my_date.weekday()]

# 'time' holds the date part of endTime; map it to its weekday name in one pass
# instead of iterating row by row.
streaming_history['diaSemana'] = streaming_history['time'].map(returnDiaSemana)

# This is the first write into ./processed, so create the folder if it is missing
# (e.g. on a fresh checkout) rather than failing in to_csv().
makedirs('./processed', exist_ok=True)
streaming_history.to_csv('./processed/processed_ds.csv', index=False)

<h2>Algunas estadísticas</h2>

In [None]:
def _play_counts(frame, column):
    """Count the rows of ``frame`` per distinct value of ``column``.

    Returns a two-column frame (``column``, ``'n_reproducciones'``) sorted by
    the count in descending order.
    """
    counts = frame[column].value_counts().reset_index()
    counts.columns = [column, 'n_reproducciones']
    return counts.sort_values(by='n_reproducciones', ascending=False)


# Plays per artist and per track title.
artist_counts = _play_counts(streaming_history, 'artistName')
song_counts = _play_counts(streaming_history, 'trackName')

# Number of distinct artists / track titles in the history.
unique_artists = len(streaming_history['artistName'].drop_duplicates())
unique_songs = len(streaming_history['trackName'].drop_duplicates())

<h2>Visualizaciones</h2>

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Bar chart of the first ten rows of artist_counts (sorted by plays, descending).
top_artists = artist_counts.head(10)

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(top_artists['artistName'], top_artists['n_reproducciones'], color='skyblue')
ax.set(ylabel='n reproducciones', title='Artistas por n de Reproducciones')
ax.grid(True)

# The saved PNG is reused later by the summary image.
fig.savefig('./processed/reproducciones_por_artista.png', bbox_inches='tight')
plt.show()

In [None]:
# Bar chart of the first ten rows of song_counts (sorted by plays, descending).
top_songs = song_counts.head(10)

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(top_songs['trackName'], top_songs['n_reproducciones'], color='lightgreen')
ax.set(ylabel='n reproducciones', title='Canciones por n de Reproducciones')
ax.grid(True)

# The saved PNG is reused later by the summary image.
fig.savefig('./processed/reproducciones_por_cancion.png', bbox_inches='tight')
plt.show()

In [None]:
# resample() needs a datetime index, so 'endTime' is parsed and moved to the index.
# The guard keeps this cell re-runnable: after the first run 'endTime' is already
# the index and no longer a column, so repeating the conversion would raise KeyError.
if 'endTime' in streaming_history.columns:
    streaming_history['endTime'] = pd.to_datetime(streaming_history['endTime'])
    streaming_history.set_index('endTime', inplace=True)

# Number of plays per month ('ME' is the month-end frequency). The variable keeps
# its original name even though the bins are monthly, not daily.
reproducciones_por_dia = streaming_history.resample('ME').size().reset_index()
reproducciones_por_dia.columns = ['Fecha', 'Reproducciones']

plt.figure(figsize=(15, 6))
plt.plot(reproducciones_por_dia['Fecha'], reproducciones_por_dia['Reproducciones'], marker='o', linestyle='-')

# One tick per plotted month.
plt.xticks(reproducciones_por_dia['Fecha'])

plt.title('Reproducciones por mes')

plt.grid(True)
plt.savefig('./processed/reproducciones_por_mes.png', bbox_inches='tight')
plt.show()

In [None]:
# Plays per detected language, most frequent first.
song_counts_lenguage = (
    streaming_history['lenguage']
    .value_counts()
    .reset_index()
    .set_axis(['lenguage', 'n_reproducciones'], axis=1)
)
scl = song_counts_lenguage.sort_values(by='n_reproducciones', ascending=False)

# Only the first ten languages are drawn.
top_languages = scl.head(10)

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(top_languages['lenguage'], top_languages['n_reproducciones'], color='mediumorchid')
ax.set(
    ylabel='n reproducciones',
    title='Idioma de canciones por n de reproducciones (idioma poco fiable)',
)
ax.grid(True)

fig.savefig('./processed/reproducciones_por_idioma.png', bbox_inches='tight')
plt.show()

In [None]:
# Weekday order used on the x axis.
# NOTE(review): assumes 'diaSemana' holds these English names, i.e. what
# calendar.day_name yields under the default locale -- confirm.
x_ticks_labels = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# Plays per weekday (the variable name is inherited from the language cell).
song_counts_lenguage = streaming_history['diaSemana'].value_counts().reset_index()
song_counts_lenguage.columns = ['diaSemana', 'n_reproducciones']

# Align the counts with the weekday order before plotting. Previously the counts
# were sorted by size and drawn against the fixed Monday..Sunday labels, so each
# bar carried the wrong label. Weekdays with no plays are filled with 0 so the
# chart always has seven bars.
plays_by_day = song_counts_lenguage.set_index('diaSemana')['n_reproducciones']
scl = pd.DataFrame({
    'diaSemana': x_ticks_labels,
    'n_reproducciones': plays_by_day.reindex(x_ticks_labels, fill_value=0).to_numpy(),
})

fig, ax = plt.subplots(figsize=(14, 6))

ax.bar(scl['diaSemana'], scl['n_reproducciones'], color='orange')

ax.set_ylabel('n reproducciones')
ax.set_title('Reproducciones por día de la semana')

plt.grid(True)
plt.savefig('./processed/reproducciones_por_dia_semana.png', bbox_inches='tight')
plt.show()

In [None]:
from PIL import Image, ImageDraw, ImageFont

# Blank white canvas for the summary image.
width, height = 1080, 600
background_color = (255, 255, 255)
image = Image.new('RGB', (width, height), background_color)

draw = ImageDraw.Draw(image)

# Prefer Arial at two sizes; if the font file cannot be opened, both texts fall
# back to the single default font loaded here.
font = ImageFont.load_default()
try:
    font1 = ImageFont.truetype("arial.ttf", 20)
    font2 = ImageFont.truetype("arial.ttf", 10)
except IOError:
    font1 = font2 = font

# Headline figures on the left, the date-range caption to their right.
text = f'Artistas únicos: {unique_artists}\nCanciones únicas: {unique_songs}'
draw.text((45, 45), text, font=font1, fill='black')
draw.text((295, 45), fechas_datos, font=font2, fill='black')

# Paste the two charts saved by the earlier plotting cells side by side.
# This stays best-effort: if either file cannot be read, the image is still
# saved with the text only.
try:
    # The context managers close the source files once the resized copies exist.
    with Image.open('./processed/reproducciones_por_artista.png') as artist_chart, \
            Image.open('./processed/reproducciones_por_cancion.png') as song_chart:
        additional_image1 = artist_chart.resize((500, 300), resample=Image.LANCZOS)
        additional_image2 = song_chart.resize((500, 300), resample=Image.LANCZOS)

    image.paste(additional_image1, (20, 150))
    image.paste(additional_image2, (540, 150))
except IOError:
    print("Una o más imágenes no se pudieron cargar. Verifica las rutas.")


image.save('./processed/stats_visualization_with_text.png', quality=95)
