In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder


In [9]:
# --- Load, scale and one-hot encode the raw track table ---------------------
# Read the raw data and drop the 'Unnamed: 0' column (presumably an index
# column saved by an earlier export -- confirm against the data source).
# A reassignment is used instead of inplace=True so the cell stays re-runnable.
df = pd.read_csv('../data.csv')
df = df.drop(columns=['Unnamed: 0'])

# Rescale every numeric feature listed below to the [0, 1] range.
col = ['year', 'key', 'popularity', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'time_signature']
scaler = MinMaxScaler()
df[col] = scaler.fit_transform(df[col])

# One-hot encode 'genre'.
# The encoder emits its output columns in the order of `encoder.categories_[0]`,
# NOT in the order genres first appear in the data. Labelling the columns with
# `df["genre"].unique()` would therefore attach the wrong genre name to a
# column whenever the two orders differ, so the labels are taken from the
# fitted encoder itself.
# `index=df.index` keeps the encoded rows aligned with `df` during the
# column assignment below, regardless of the index `df` carries.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
enc = pd.DataFrame(
    encoder.fit_transform(np.array(df["genre"]).reshape(-1, 1)),
    columns=encoder.categories_[0],
    index=df.index,
)

# Append the indicator columns, then remove the original categorical column.
df[enc.columns] = enc
df = df.drop("genre", axis=1)

# Baseline memory footprint, reported before any rows are filtered or dtypes
# are downcast.
memory_usage = df.memory_usage(index=True).sum()
print("Memory Usage (Before): ", memory_usage)

# --- Keep only artists with enough songs, capped at their most popular ones --
# Requires `df` to provide the 'artist_name' and 'popularity' columns.
min_songs = 2  # Minimum number of songs for an artist to be retained
cap = 125  # Maximum number of most popular songs to keep for each artist

# Step 1: count the songs of every artist and flag those reaching the minimum.
artist_song_counts = df['artist_name'].value_counts()
has_min_songs = artist_song_counts >= min_songs

print("Number of artists with at least", min_songs, "songs:", int(has_min_songs.sum()))
print("Number of artists with less than", min_songs, "songs:", int((~has_min_songs).sum()))

# Step 2: names of the artists that satisfy the minimum-song requirement.
artists_with_min_songs = artist_song_counts.index[has_min_songs]

# Step 3: restrict the table to the songs of those artists.
df_filtered = df[df['artist_name'].isin(artists_with_min_songs)]

# Step 4: order each artist's songs from most to least popular.
df_sorted = df_filtered.sort_values(by=['artist_name', 'popularity'], ascending=[True, False])

# Step 5: keep at most `cap` rows per artist; because of the ordering above
# these are the artist's most popular songs.
df_most_popular = df_sorted.groupby('artist_name').head(cap)

# Step 6: number of songs dropped by the two filters combined.
total_songs_removed = len(df) - len(df_most_popular)

# Step 7: report the outcome.
print("Number of artists with at least", min_songs, "songs and their", cap, "most popular songs:")
print("Number of songs removed:", total_songs_removed)
print("Number of songs left:", len(df_most_popular))

# Restore the original row order. `df_most_popular` is a selection taken from
# `df_sorted`, so sorting it with inplace=True raises SettingWithCopyWarning;
# rebinding the name to the sorted result gives the same frame without
# mutating a derived object.
df_most_popular = df_most_popular.sort_index()

# --- Remove remixes ---------------------------------------------------------
# Rows without a track name cannot be inspected, so they are discarded first.
df = df_most_popular.dropna(subset=['track_name'])

# A track counts as a remix when its name contains ' remix' (leading space
# included), compared case-insensitively. The mask is built once and reused
# for both the count and the filter.
is_remix = df['track_name'].str.contains(' remix', case=False)
remix_count = int(is_remix.sum())
print(f"There are {remix_count} songs that are a remix.")
df = df[~is_remix]

# --- Downcast dtypes to shrink the frame ------------------------------------
# Columns are addressed by 0-based position, so this step depends on the
# column order produced by the earlier cells.
# NOTE(review): positions 3..16 are presumably the numeric audio features --
# confirm against df.columns. Position 17 is left untouched by this step.
float_cols = df.columns[3:17]
df[float_cols] = df[float_cols].astype('float32')

# The column at position 9 is narrowed further to int8 (it was part of the
# float32 slice above). Presumably it only holds small whole numbers -- verify.
small_int_col = df.columns[9]
df[small_int_col] = df[small_int_col].astype('int8')

# Everything from position 18 onward is stored as int8.
# NOTE(review): these look like the one-hot genre indicators -- confirm.
trailing_cols = df.columns[18:]
df[trailing_cols] = df[trailing_cols].astype('int8')

# Memory footprint after row filtering and dtype downcasting.
memory_usage = df.memory_usage(index=True).sum()
print("Memory Usage (After): ", memory_usage)

# --- Persist the processed frame and verify the round trip ------------------
# The frame is written as Parquet and immediately read back so the memory
# footprint of the stored data can be compared with the in-memory one.
# (A CSV round trip via "data_encoded.csv" was the earlier alternative.)
parquet_path = "data_encoded.parquet"
df.to_parquet(parquet_path)

df = pd.read_parquet(parquet_path)
memory_usage = df.memory_usage(index=True).sum()
print("Memory Usage (From File): ", memory_usage)

# Final dataset size: distinct artists (missing values counted as a value,
# matching len(unique())) and total number of songs.
print("Number of artists: ", df['artist_name'].nunique(dropna=False))
print("Number of songs: ", df.shape[0])

Memory Usage (Before):  927811328
Number of artists with at least 2 songs: 50359
Number of artists with less than 2 songs: 13800
Number of artists with at least 2 songs and their 125 most popular songs:
Number of songs removed: 122658
Number of songs left: 1037106


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_most_popular.sort_index(inplace=True)


There are 31808 songs that are a remix.
Memory Usage (After):  175927150
Memory Usage (From File):  175927150
Number of artists:  50297
Number of songs:  1005298


In [10]:
# Row count of the processed frame, i.e. the number of songs kept.
print("Number of songs: ", df.shape[0])

Number of songs:  1005298
