In [37]:
import ast
import os
from collections import Counter
from itertools import chain

import pandas as pd

from utils import load_json_config

In [38]:
# Load the notebook settings from config.json. Later cells read the download
# targets and the output file name from this object by key.
config = load_json_config("config.json")

## Downloading a small or a large dataset

### SMALL (~2k records)

In [None]:
# Run the gdown command-line tool on the value stored under
# config["small_dataset_csv"]; IPython substitutes the {...} expression
# before handing the line to the shell.
!gdown {config["small_dataset_csv"]}

### LARGE (~26k records)

In [6]:
# Run the gdown command-line tool on the value stored under
# config["large_dataset_csv"]. The recorded output shows the file being
# saved as large_dataset.csv.
!gdown {config["large_dataset_csv"]}

Downloading...
From: https://drive.google.com/uc?id=1Up0kFTtyaz-CoiHIKoYAmfdLK7voK6xl
To: /home/yehor/Desktop/Genre_based_on_album_cover/large_dataset.csv
100%|██████████████████████████████████████| 3.10M/3.10M [00:00<00:00, 10.9MB/s]


## Downloading a small or large set of imgs associated with the dataset

### SMALL (~180mb)

In [None]:
# Run the gdown command-line tool on the value stored under
# config["small_dataset_imgs"].
# NOTE(review): presumably this fetches a zip archive of cover images - confirm.
!gdown {config["small_dataset_imgs"]}

### LARGE (~1.5gb)

In [None]:
# Run the gdown command-line tool on the value stored under
# config["large_dataset_imgs"].
# NOTE(review): presumably this fetches the large_dataset_imgs.zip archive
# that the unzip cell below extracts - confirm.
!gdown {config["large_dataset_imgs"]}

NOTE: YOU HAVE TO ALTER THE PATH BELOW BASED ON WHEREVER YOU DOWNLOADED THE DATA

In [None]:
# Extract the image archive; -o overwrites existing files without prompting.
# NOTE(review): absolute, machine-specific path - point it at wherever the
# archive was actually downloaded.
!unzip -o /home/yehor/Desktop/Genre_based_on_album_cover/large_dataset_imgs.zip

NOTE: YOU HAVE TO ALTER THE PATH BELOW BASED ON WHEREVER YOU DOWNLOADED THE DATA

In [98]:
# Directory searched for the "<album_index>.jpg" cover images when filtering
# the dataset below.
# NOTE(review): absolute, machine-specific path; the directory name
# ("large_cleaned_dataset_imgs") also differs from the archive name
# ("large_dataset_imgs.zip") - confirm this is what the archive extracts to.
images_location = "/home/yehor/Desktop/Genre_based_on_album_cover/large_cleaned_dataset_imgs"

## Organizing the data

In [99]:
# Read the downloaded dataset CSV. In the recorded output it has the columns
# "Unnamed: 0.1", "Unnamed: 0", album_index, album_id and genres, where genres
# is text holding a nested list of genre names.
# NOTE(review): absolute, machine-specific path - adjust to the download location.
df = pd.read_csv("/home/yehor/Desktop/Genre_based_on_album_cover/large_dataset.csv")
df

Unnamed: 0.1,Unnamed: 0,album_index,album_id,genres
0,0,0,5n1GSzC1Reao29ScnpLYqp,"[['florida rap', 'hip hop', 'miami hip hop', '..."
1,1,1,6UYZEYjpN1DYRW0kqFy9ZE,"[['hip hop', 'philly rap', 'pop', 'pop rap', '..."
2,2,2,7uVimUILdzSZG4KKKWToq0,"[['adult standards', 'canadian pop', 'lounge',..."
3,3,3,35s58BRTGAEWztPo9WqCIs,[[]]
4,4,4,41GuZcammIkupMPKH2OJ6I,"[['pop', 'pop rap', 'rap']]"
...,...,...,...,...
26172,26172,26172,5lXF6AFeXG8UsIUfBJNsNH,"[['adult standards', 'easy listening', 'lounge..."
26173,26173,26173,4ceeeNqKJ9OtF33DaHQuum,"[['adult standards', 'bebop', 'bossa nova', 'c..."
26174,26174,26174,1G8AfOjrE0FO9w1gfemIy1,"[['baroque pop', 'brill building pop', 'folk r..."
26175,26175,26175,62usLEsQho4s5TCfa6Ks4s,"[['adult standards', 'bebop', 'big band', 'coo..."


Dropping the records whose images are not present in the images folder (if any)

In [100]:
# Keep only the rows whose cover image "<album_index>.jpg" exists under
# images_location; rows without a matching file are dropped.
df = df.loc[
    df["album_index"].apply(
        lambda x: os.path.exists(f"{images_location}/{x}.jpg")
    )
]

Now merging the sub-genres into their parent genres

In [101]:
# Load the genre-mapping definitions; the next cell reads its
# "indexed_categories" and "keyword_to_category" entries.
maps = load_json_config("country_classical.json")

In [102]:
# Parent-genre name -> label stored in the output's "genre" column
# (looked up as indexed_categories[genre] when the cleaned frame is built).
indexed_categories = maps["indexed_categories"]
# Keyword -> parent-genre name; parent_genre() searches each sub-genre name
# for these keywords.
keyword_to_category = maps["keyword_to_category"]

In [103]:
def extract_genres(genres):
    """Flatten the string-encoded nested genre lists of one CSV cell.

    Parameters
    ----------
    genres : str
        Text of a Python literal holding a list of lists of genre names,
        e.g. "[['pop', 'pop rap'], ['rap']]".

    Returns
    -------
    list
        Every genre name from the inner lists, in their original order.
        An empty list is returned when `genres` is not a string (for
        example the NaN pandas produces for an empty cell).

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    """
    if not isinstance(genres, str):
        return []
    # literal_eval only accepts literals, so a crafted CSV cell cannot run
    # arbitrary code the way eval() would.
    return list(chain.from_iterable(ast.literal_eval(genres)))

In [104]:
def parent_genre(sub_genre):
    """Map a sub-genre name to its parent category.

    The name is lower-cased and tested against the keys of the module-level
    `keyword_to_category` mapping in iteration order; the category of the
    first key that occurs as a substring of the name is returned.

    Falsy or non-string input, and names containing none of the keywords,
    yield "unknown".
    """
    if sub_genre and isinstance(sub_genre, str):
        lowered = sub_genre.lower()
        matching_categories = (
            category
            for keyword, category in keyword_to_category.items()
            if keyword in lowered
        )
        # First hit wins; fall back to "unknown" when nothing matches.
        return next(matching_categories, "unknown")
    return "unknown"

In [105]:
def assign_genre(sub_genres):
    """Pick the parent genre that occurs most often among `sub_genres`.

    Each sub-genre is mapped through parent_genre() and the results are
    tallied, "unknown" included. An empty input yields "unknown". When
    several parent genres share the highest count, the one encountered
    first is returned.
    """
    tally = Counter(parent_genre(name) for name in sub_genres)
    if not tally:
        return "unknown"
    # most_common keeps first-encountered order among equal counts.
    winner, _count = tally.most_common(1)[0]
    return winner

In [106]:
# Build the cleaned dataset: one row per album whose sub-genres resolve to a
# known parent genre.
# Rows are collected as plain dicts and turned into a frame once at the end;
# concatenating a one-row frame per iteration copies the whole frame each
# time and gets slow as the dataset grows.
# NOTE: the output column is called "album_id" but stores the CSV's
# album_index value (the number used in the image file name), not the CSV's
# album_id string.
records = []
for row in df.itertuples(index=True, name="Row"):
    genre = assign_genre(extract_genres(row.genres))
    if genre == "unknown":
        continue  # albums without a recognised parent genre are left out
    records.append({"album_id": int(row.album_index), "genre": indexed_categories[genre]})

# Explicit columns and dtypes keep the schema stable even when no row qualifies.
cleaned_df = pd.DataFrame(records, columns=["album_id", "genre"]).astype(
    {"album_id": "int", "genre": "int"}
)

cleaned_df

Unnamed: 0,album_id,genre
0,13,0
1,25,1
2,33,1
3,44,0
4,45,1
...,...,...
3094,26148,1
3095,26149,1
3096,26172,0
3097,26173,0


In [107]:
# Count how many albums ended up under each genre label, to check how
# balanced the classes are.
cleaned_df["genre"].value_counts()

genre
1    1689
0    1410
Name: count, dtype: int64

In [108]:
# Write the cleaned frame to the file named by config["organized_dataset_name"],
# without the row index column.
cleaned_df.to_csv(config["organized_dataset_name"], index=False)