In [None]:
"""
Note: Gemini was used as autocomplete for this notebook
"""

"""
This notebook downloads the album cover images and assigns genre labels to them.
"""

# Download datasets
# !gdown https://drive.google.com/uc?id=1myz_mN01Uv8WGUk_Wtag5fiZw8v-WVos
# !gdown https://drive.google.com/uc?id=1qsStd4onpGFLLMPEdjB4DDb8negqNLdz

In [2]:
import numpy as np
import pandas as pd
from time import sleep
# from google.colab import drive
# drive.mount('/content/drive')

# albums.csv is tab-separated; each row describes one album, including a
# dict-string of its artists and the URL of its cover image.
albums = pd.read_csv('albums.csv', delimiter="\t")
albums.head(5)

Unnamed: 0,album_id,name,billboard,artists,popularity,total_tracks,album_type,image_url
0,5n1GSzC1Reao29ScnpLYqp,Dying To Live,Dying To Live,{'46SHBwWsqBkxI7EeeBEQG7': 'Kodak Black'},83,16,album,https://i.scdn.co/image/db2133234d458f432ca207...
1,6UYZEYjpN1DYRW0kqFy9ZE,Championships,Championships,{'20sxb77xiYeusSH8cVdatc': 'Meek Mill'},85,19,album,https://i.scdn.co/image/77eb7c17cafe5503c58661...
2,7uVimUILdzSZG4KKKWToq0,Christmas (Deluxe Special Edition),Christmas,{'1GxkXlMwML1oSg5eLPiAz3': 'Michael Bublé'},60,20,album,https://i.scdn.co/image/2d6ee8d4fb5a45abf35cd3...
3,35s58BRTGAEWztPo9WqCIs,Spider-Man: Into the Spider-Verse (Soundtrack ...,Spider-Man: Into The Spider-Verse,{'0LyfQWJT6nXafLPZqxe9Of': 'Various Artists'},92,13,compilation,https://i.scdn.co/image/3aa37254a41cf96e815725...
4,41GuZcammIkupMPKH2OJ6I,ASTROWORLD,ASTROWORLD,{'0Y5tJX1MQlPlqiwlOH1tJY': 'Travis Scott'},91,17,album,https://i.scdn.co/image/cdca7dc20c778ada42fb18...


In [3]:
# artists.csv is tab-separated; each row describes one artist, including a
# list-string of the genres associated with them.
artists = pd.read_csv('artists.csv', delimiter="\t")
artists.head(5)

Unnamed: 0,artist_id,name,followers,popularity,artist_type,main_genre,genres,image_url
0,66CXWjxzNUsdJxJ2JdwvnR,Ariana Grande,34554242.0,96,singer,dance pop,"['dance pop', 'pop', 'post-teen pop']",https://i.scdn.co/image/b1dfbe843b0b9f54ab2e58...
1,26VFTg2z8YR0cCuwLzESi2,Halsey,7368242.0,90,singer,dance pop,"['dance pop', 'electropop', 'etherpop', 'indie...",https://i.scdn.co/image/22a5f3d8c42bc7cb55215e...
2,0Y5tJX1MQlPlqiwlOH1tJY,Travis Scott,6313709.0,94,rapper,pop,"['pop', 'pop rap', 'rap']",https://i.scdn.co/image/dc5eba5e032c2e5bc4d42c...
3,246dkjvS1zLTtiykXe5h60,Post Malone,16737002.0,96,rapper,dfw rap,"['dfw rap', 'pop', 'rap']",https://i.scdn.co/image/f9d8b742b66609f12da023...
4,1zNqQNIdeOUZHb8zbZRFMX,Swae Lee,483032.0,89,singer,trap music,['trap music'],https://i.scdn.co/image/a177469870b41f7e17e3b5...


In [None]:
# For each album, do a few things:
# 1. Download the associated image from the albums.csv file
# 2. For each artist associated with the album, look up their ID in
# artists.csv and label the album with that artist's genres

In [None]:
import requests
from io import BytesIO
from PIL import Image

def save_image_from_url(image_url, file_path, timeout=30):
  """
  Downloads an image from a URL and saves it to a local file.

  Args:
    image_url: The URL of the image to save.
    file_path: The local file path to save the image to. Pillow picks the
      output format from the file extension.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, a single stalled connection would block the caller forever.

  Raises:
    requests.exceptions.RequestException: On connection errors, timeouts, or
      a bad (4xx or 5xx) response.
    OSError: If the downloaded bytes are not an image Pillow can read, or the
      file cannot be written.
  """
  # The whole body is read right away, so streaming the response buys nothing.
  response = requests.get(image_url, timeout=timeout)
  response.raise_for_status()  # Raise an exception for bad responses (4xx or 5xx)

  # Modes Pillow's JPEG writer accepts as-is; anything else (e.g. RGBA or
  # palette images) makes image.save() fail for a .jpg target.
  jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
  saving_as_jpeg = str(file_path).lower().endswith(('.jpg', '.jpeg'))

  with Image.open(BytesIO(response.content)) as image:
    if saving_as_jpeg and image.mode not in jpeg_modes:
      image = image.convert('RGB')
    image.save(file_path)

In [None]:
import os

# exist_ok=True keeps this cell re-runnable: a plain `mkdir` reports an error
# when the directory is already there.
os.makedirs('album_covers', exist_ok=True)

In [None]:
import time
import ast
import shutil
import os
import traceback  # Import for traceback information

# Label every album with its artists' genres and download its cover image.
#
# Uses the `albums` and `artists` frames loaded earlier in the notebook and
# writes:
#   * album_covers/<albums row index>.jpg  - the cover of each labelled album
#   * album_genres.csv                     - album_index, album_id, genres
# `genres` holds one list of genre strings per matched artist.

# Raw `genres` string per artist_id, built once so the loop does a dict lookup
# instead of filtering the whole `artists` frame for every artist.
# keep='first' matches taking the first matching row.
genres_by_artist = (
    artists.drop_duplicates('artist_id', keep='first')
    .set_index('artist_id')['genres']
    .to_dict()
)

# Rows are collected in a list and turned into a DataFrame once at the end;
# appending to a DataFrame row by row gets slower as it grows.
records = []
for index, row in albums.iterrows():
    try:
        # `artists` is a dict-string of the form {artist_id: artist_name}
        album_artists = ast.literal_eval(row.artists)
        genres = []
        for artist_id in album_artists:
            # An artist missing from artists.csv contributes no genres; it
            # must not discard the album's other artists.
            if artist_id not in genres_by_artist:
                continue
            genres.append(ast.literal_eval(genres_by_artist[artist_id]))
        if len(genres) == 0:
            continue
        # Download the image of the album associated with these genres
        save_image_from_url(row.image_url, "album_covers/" + str(index) + ".jpg")

        # album_index is the albums row index, which is also the cover's file
        # name, so labels and images stay aligned when albums are skipped.
        records.append({'album_index': index, 'album_id': row.album_id, 'genres': genres})
        time.sleep(0.25)  # Optional: Add a delay to avoid overwhelming the server
    except Exception as e:
        # Best effort: report the failure and carry on with the next album.
        print(f"Error processing album at index {index}: {e}")
        traceback.print_exc()  # Print traceback for debugging

# Save results regardless of errors
output_df = pd.DataFrame(records, columns=['album_index', 'album_id', 'genres'])
output_df.to_csv('album_genres.csv', index=True)

# Copy to Google Drive only when it is mounted; otherwise the copy would raise
# after the whole download has already finished.
drive_copy_path = '/content/drive/MyDrive/Colab Notebooks/qac239/album_genres.csv'
if os.path.isdir(os.path.dirname(drive_copy_path)):
    shutil.copy('album_genres.csv', drive_copy_path)
else:
    print(f"Google Drive folder not found; skipped copy to {drive_copy_path}")

# # Check if the destination directory exists before attempting to copy
# if not os.path.exists('/content/drive/MyDrive/Colab Notebooks/qac239/album_covers'):
#     shutil.copytree('album_covers', '/content/drive/MyDrive/Colab Notebooks/qac239/album_covers')
# else:
#     print("Destination directory already exists. Overwriting...")
#     shutil.rmtree('/content/drive/MyDrive/Colab Notebooks/qac239/album_covers')  # Remove existing directory
#     shutil.copytree('album_covers', '/content/drive/MyDrive/Colab Notebooks/qac239/album_covers')  # Copy new directory

In [None]:
# Cell used to correct the dataset
#
# Rebuilds the genre labels for every row of albums.csv (no images are
# downloaded) and writes them to fix.csv. Artists that are missing from
# `artists` are skipped, so an album may end up with an empty genre list.

import ast

albums = pd.read_csv('albums.csv', sep="\t")

# Raw `genres` string per artist_id, built once so the loop does a dict lookup
# instead of filtering the whole `artists` frame for every artist.
# keep='first' matches taking the first matching row.
genres_by_artist = (
    artists.drop_duplicates('artist_id', keep='first')
    .set_index('artist_id')['genres']
    .to_dict()
)

# Rows are collected in a list and turned into a DataFrame once at the end;
# appending to a DataFrame row by row gets slower as it grows.
records = []
for _, row in albums.iterrows():
    # `artists` is a dict-string of the form {artist_id: artist_name}
    album_artists = ast.literal_eval(row.artists)
    genres = [
        ast.literal_eval(genres_by_artist[artist_id])
        for artist_id in album_artists
        if artist_id in genres_by_artist
    ]
    # album_index is the running row count (no album is skipped here)
    records.append({'album_index': len(records), 'album_id': row.album_id, 'genres': genres})

output_df = pd.DataFrame(records, columns=['album_index', 'album_id', 'genres'])
output_df.to_csv('fix.csv', index=True)
