# Data Preparation

**Authors:** Abderrahmane Salmi, Ricardo Talarico, Lorenzo Allegrini

In [19]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

In [20]:
tracks_df = pd.read_csv("../datasets/tracks_cleaned.csv")
artists_df = pd.read_csv("../datasets/artists_cleaned.csv")

### Features Created
- swear_ratio: fraction of words in the songs that are swear words
- aggressiveness: measure of the aggressiveness of the song. Computed based on swear_ratio and musical features like bpm, loudness and flux.
- relative_popularity: ranking of the popularity of the song out of all the songs of the artist
- starting_age: the age at which the artist started making music

In [14]:
tracks_df['swear_ratio'] = (tracks_df['swear_IT'] + tracks_df['swear_EN']) / tracks_df['n_tokens']


from sklearn.preprocessing import StandardScaler

features_for_angriness = ['bpm','centroid','flux','rolloff','loudness','swear_ratio']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(tracks_df[features_for_angriness])
scaled_features = pd.DataFrame(scaled_features, columns=features_for_angriness)


tracks_df['aggressiveness'] = (
    (1/6)*scaled_features['centroid']+
    (1/6)*scaled_features['bpm']+
    (1/6)*scaled_features['flux']+
    (1/6)*scaled_features['swear_ratio']+
    (1/6)*scaled_features['loudness']+
    (1/6)*scaled_features['rolloff']
)
# starting age TODO: check if the code works
artists_df['starting_age'] = math.floor((artists_df['active_start'] - artists_df['birth_date']).days / 365)

# relative popularity
tracks_df['relative_popularity'] = (
    tracks_df
    .groupby('name_artist')['popularity']
    .rank(ascending=False, method='dense')
)


TypeError: unsupported operand type(s) for -: 'str' and 'str'

Artist Age:
Older artists may have more experience, different styles, musical inspirations, lyrics richness, etc.

In [None]:
# Ensure datetime parsing for birth dates
# TODO; we already did this in data understand, delete after check
# artists_df['birth_date'] = pd.to_datetime(artists_df['birth_date'], errors='coerce')

# Get the birth year
artists_df['birth_year'] = artists_df['birth_date'].dt.year

# Compute artist age
curr_year = datetime.now().year
artists_df['artist_age'] = curr_year - artists_df['birth_year']

# Optional: handle impossible or missing ages (e.g. negative or NaN)
# artists_df.loc[artists_df['artist_age'] < 10, 'artist_age'] = None  # filter unrealistic ages

# Check that the column exists and contains valid values
artists_df[['id_author', 'birth_year', 'artist_age']].sample(10)

Unnamed: 0,id_author,birth_year,artist_age
26,ART09119396,1997.0,28.0
77,ART66932389,1973.0,52.0
81,ART17240256,1994.0,31.0
0,ART82291002,,
55,ART20729624,2002.0,23.0
9,ART02666525,,
63,ART51628788,,
64,ART48537029,1976.0,49.0
20,ART59609037,,
71,ART07629990,1980.0,45.0
