In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from google.colab import drive
import os
import torch

In [2]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Mount Google Drive only when the mount point is not already present.
if os.path.exists('/content/drive'):
    print("Drive ya está montado")
else:
    drive.mount('/content/drive')

# Base folder in Google Drive where the model is stored.
BASE_PATH = "/content/drive/MyDrive/ProyectoFinal"
# Folder of the fine-tuned model, located under BASE_PATH.
MODEL_PATH = f"{BASE_PATH}/FineTuning/DeepESP_gpt2-spanish/full_fine_tuning_v4"
# Folder with the per-artist JSON files, located under MODEL_PATH.
DATA_FOLDER = f"{MODEL_PATH}/Data"

Mounted at /content/drive


In [4]:
import nltk
# Download the NLTK "stopwords" corpus; stopwords.words("spanish") is called
# later in this notebook to build the Spanish stopword set.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import io
import json
import os
import re
from collections import Counter

import gradio as gr
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from PIL import Image
from wordcloud import WordCloud

# Maps artist name -> {"filename": first JSON file seen for that artist,
#                      "lyrics": all non-empty lyrics found for that artist}.
artist_data_dict = {}

for filename in os.listdir(DATA_FOLDER):
    # Only the JSON files are read; everything else in the folder is skipped.
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(DATA_FOLDER, filename), "r", encoding="utf-8") as json_file:
        artist_record = json.load(json_file)

    name = artist_record.get("name", "Unknown")

    # Keep only songs that actually carry a non-empty "lyric" value.
    song_lyrics = []
    for song in artist_record.get("songs", []):
        text = song.get("lyric")
        if text:
            song_lyrics.append(text)

    # Several files may share an artist name: merge their lyrics under the
    # entry created by the first file.
    known_entry = artist_data_dict.get(name)
    if known_entry is None:
        artist_data_dict[name] = {"filename": filename, "lyrics": song_lyrics}
    else:
        known_entry["lyrics"].extend(song_lyrics)

# Alphabetically sorted artist names (used as the dropdown choices below).
artist_names = sorted(artist_data_dict)

# --- VISUALIZATION FUNCTIONS ---
# Very common Spanish words ("el", "la", "de", "y", "que", etc.) are ignored.
spanish_stopwords = set(stopwords.words("spanish"))

def generar_visualizaciones(artist_name, top_n=30):
    """Build a word-frequency bar chart and a word cloud for one artist.

    The artist's lyrics (from the module-level ``artist_data_dict``) are
    lower-cased and split into runs of letters, so punctuation attached to a
    word does not create a separate token. Words of three letters or fewer and
    Spanish stopwords are discarded before counting.

    Args:
        artist_name: Key into ``artist_data_dict``.
        top_n: How many of the most frequent words the bar chart shows.
            Coerced to ``int`` in case the UI delivers a float.

    Returns:
        A ``(histogram, wordcloud)`` tuple of PIL images. When no word
        survives filtering, the histogram has no bars and the word cloud is a
        blank white 600x400 image.

    Raises:
        gr.Error: If ``artist_name`` is not a key of ``artist_data_dict``
            (e.g. ``None`` when nothing is selected in the dropdown).
    """
    if artist_name not in artist_data_dict:
        # gr.Error is shown to the user by Gradio instead of a bare KeyError.
        raise gr.Error("Selecciona un artista de la lista.")

    top_n = int(top_n)
    cloud_width, cloud_height = 600, 400

    lyrics = " ".join(artist_data_dict[artist_name]["lyrics"]).lower()

    # Tokenize on runs of Unicode letters: "amor," and "amor" count as the
    # same word, and stopwords followed by punctuation are still filtered.
    words = re.findall(r"[^\W\d_]+", lyrics)

    # Filter: drop short words and stopwords.
    filtered_words = [w for w in words if len(w) > 3 and w not in spanish_stopwords]

    most_common = Counter(filtered_words).most_common(top_n)
    if most_common:
        palabras, frecuencias = zip(*most_common)
    else:
        palabras, frecuencias = [], []

    # --- Histogram ---
    # Use an explicit Figure/Axes instead of pyplot's implicit "current
    # figure", and always close the figure, even if plotting fails.
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.bar(palabras, frecuencias, color="skyblue")
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title(f"Top {top_n} palabras más frecuentes de {artist_name}")
        fig.tight_layout()

        hist_buf = io.BytesIO()
        fig.savefig(hist_buf, format='png')
    finally:
        plt.close(fig)
    hist_buf.seek(0)

    # --- Word cloud ---
    if filtered_words:
        cloud_img = WordCloud(
            width=cloud_width, height=cloud_height,
            background_color='white',
            stopwords=spanish_stopwords
        ).generate(" ".join(filtered_words)).to_image()
    else:
        # WordCloud.generate raises ValueError on text without words, so
        # return a blank canvas of the same size instead.
        cloud_img = Image.new("RGB", (cloud_width, cloud_height), "white")

    return Image.open(hist_buf), cloud_img

# Input widgets: which artist to analyse and how many words the histogram shows.
artist_selector = gr.Dropdown(choices=artist_names, label="Seleccionar artista")
top_n_slider = gr.Slider(
    minimum=10,
    maximum=100,
    step=5,
    value=30,
    label="Cantidad de palabras en el histograma",
)

# Output widgets: both are rendered from the PIL images returned by
# generar_visualizaciones.
histogram_output = gr.Image(type="pil", label="Histograma de palabras")
wordcloud_output = gr.Image(type="pil", label="Nube de palabras")

# Wire the widgets to the visualization function.
interface = gr.Interface(
    fn=generar_visualizaciones,
    inputs=[artist_selector, top_n_slider],
    outputs=[histogram_output, wordcloud_output],
    title="Análisis de letras por artista",
    description="Visualiza las palabras más frecuentes de las letras en el dataset por artista (histograma y nube de palabras).",
)

# share=True asks Gradio to also create a public share link for the app.
interface.launch(share=True)