# Data Preparation

**Authors:** Abderrahmane Salmi, Ricardo Talarico, Lorenzo Allegrini

In [111]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from utils import drop_columns_if_exists

In [112]:
tracks_df = pd.read_csv("../datasets/tracks_cleaned.csv")
artists_df = pd.read_csv("../datasets/artists_cleaned.csv")

### Check current features list

In [113]:
# Print all feature names
tracks_original_features = tracks_df.columns.tolist()
print(tracks_original_features)

# Print total number of features
print(f"\nTotal number of features: {len(tracks_original_features)}")

['id_artist', 'name_artist', 'title', 'featured_artists', 'language', 'stats_pageviews', 'swear_IT', 'swear_EN', 'swear_IT_words', 'swear_EN_words', 'year', 'month', 'day', 'n_tokens', 'tokens_per_sent', 'char_per_tok', 'lexical_density', 'avg_token_per_clause', 'bpm', 'centroid', 'rolloff', 'flux', 'flatness', 'spectral_complexity', 'pitch', 'loudness', 'album_name', 'album_release_date', 'album_type', 'disc_number', 'track_number', 'duration_ms', 'explicit', 'popularity', 'lyrics']

Total number of features: 35


In [114]:
# Print all feature names
artists_original_features = artists_df.columns.tolist()
print(artists_original_features)

# Print total number of features
print(f"\nTotal number of features: {len(artists_original_features)}")

['id_author', 'name', 'gender', 'birth_date', 'birth_place', 'nationality', 'description', 'active_start', 'active_end', 'province', 'region', 'country', 'latitude', 'longitude']

Total number of features: 14


### Features Created
- swear_ratio: fraction of words in the songs that are swear words
- aggressiveness: measure of the aggressiveness of the song. Computed based on swear_ratio and musical features like bpm, loudness and flux.
- relative_popularity: ranking of the popularity of the song out of all the songs of the artist
- starting_age: the age at which the artist started making music

In [115]:
tracks_df['swear_ratio'] = (tracks_df['swear_IT'] + tracks_df['swear_EN']) / tracks_df['n_tokens']


from sklearn.preprocessing import StandardScaler

features_for_angriness = ['bpm','centroid','flux','rolloff','loudness','swear_ratio']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(tracks_df[features_for_angriness])
scaled_features = pd.DataFrame(scaled_features, columns=features_for_angriness)


tracks_df['aggressiveness'] = (
    (1/6)*scaled_features['centroid']+
    (1/6)*scaled_features['bpm']+
    (1/6)*scaled_features['flux']+
    (1/6)*scaled_features['swear_ratio']+
    (1/6)*scaled_features['loudness']+
    (1/6)*scaled_features['rolloff']
)
# starting age TODO: check if the code works
artists_df['starting_age'] = math.floor((artists_df['active_start'] - artists_df['birth_date']).days / 365)

# relative popularity
tracks_df['relative_popularity'] = (
    tracks_df
    .groupby('name_artist')['popularity']
    .rank(ascending=False, method='dense')
)


TypeError: unsupported operand type(s) for -: 'str' and 'str'

**artist_age**: Older artists may have more experience, different styles, musical inspirations, lyrics richness, etc.

In [116]:
# Ensure datetime parsing for birth dates
# TODO; we already did this in data understand, delete after check
artists_df['birth_date'] = pd.to_datetime(artists_df['birth_date'], errors='coerce')

# Get the birth year
artists_df['birth_year'] = artists_df['birth_date'].dt.year

# Compute artist age
curr_year = datetime.now().year
artists_df['artist_age'] = curr_year - artists_df['birth_year']

# Optional: handle impossible or missing ages (e.g. negative or NaN)
# artists_df.loc[artists_df['artist_age'] < 10, 'artist_age'] = None  # filter unrealistic ages

# Drop temporary columns we created
artists_df = drop_columns_if_exists(artists_df, ['birth_year'])

# Check that the column exists and contains valid values
artists_df[['id_author', 'artist_age']].sample(10)

Unnamed: 0,id_author,artist_age
52,ART88423027,55.0
71,ART07629990,45.0
9,ART02666525,
101,ART57587384,40.0
8,ART19605256,
72,ART19060721,28.0
76,ART12092805,24.0
16,ART71969350,43.0
57,ART16868977,33.0
77,ART66932389,52.0


**artist_age_at_release**: How old the artist was when they released each song, it allows us to detect age-related patterns in lyrics, energy, popularity, etc. Maybe know if successfull songs are mostly made by younger or older artists, which we can be interpreted as hype vs experience.

In [117]:
# Ensure birth_date is datetime
artists_df['birth_date'] = pd.to_datetime(artists_df['birth_date'], errors='coerce')

# Get birth year directly from birth_date
artists_df['birth_year'] = artists_df['birth_date'].dt.year

# Merge birth year from artists_df into tracks_df
tracks_df = tracks_df.merge(
    artists_df[['id_author', 'birth_year']],
    left_on='id_artist',
    right_on='id_author',
    how='left'
)

# Compute artist age at the time of song release
tracks_df['artist_age_at_release'] = tracks_df['year'] - tracks_df['birth_year']

# Optional: filter unrealistic ages
# tracks_df.loc[(tracks_df['artist_age_at_release'] < 10) | (tracks_df['artist_age_at_release'] > 100), 'artist_age_at_release'] = None

# Drop temporary columns we created
tracks_df = drop_columns_if_exists(tracks_df, ['birth_year'])

# Quick check
tracks_df[['title', 'year', 'name_artist', 'artist_age_at_release']].sample(10)



Unnamed: 0,title,year,name_artist,artist_age_at_release
5394,Ma è un sogno (Variazione sul tema ”Arrivi sta...,2068.0,Dargen D’Amico,
5280,Vibe,2018.0,Baby K,35.0
8886,Tengo Il Respiro,2016.0,Rancore,27.0
6284,Napoli,1929.0,99 Posse,
7139,Essere vero,2064.0,Gemitaiz,76.0
4504,DENG DENG,1906.0,Ensi,-79.0
5732,Serie TV Freestyle,2016.0,Shade,29.0
8053,Mi sono perso,2009.0,Coez,26.0
9610,IL CIELO NELLA STANZA,2018.0,Salmo,34.0
9187,War (Gengis Khan RMX),2008.0,Colle Der Fomento,


### Check all new features

In [118]:
# Find the new features
artists_current_features = artists_df.columns.tolist()
new_artist_features = [col for col in artists_current_features if col not in artists_original_features]

# Print results
print("New Features Added to artists_df:")
for feature in new_artist_features:
    print(f"- {feature}")

print(f"\nTotal new features: {len(new_artist_features)}")

New Features Added to artists_df:
- artist_age
- birth_year

Total new features: 2


In [119]:
# Find the new features
tracks_current_features = tracks_df.columns.tolist()
new_tracks_features = [col for col in tracks_current_features if col not in tracks_original_features]

# Print results
print("New Features Added to artists_df:")
for feature in new_tracks_features:
    print(f"- {feature}")

print(f"\nTotal new features: {len(new_tracks_features)}")

New Features Added to artists_df:
- swear_ratio
- aggressiveness
- id_author
- artist_age_at_release

Total new features: 4
