In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore
from sklearn.preprocessing import scale
import numpy as np

In [3]:
# Load the raw train / test splits; paths are relative to the notebook's working directory.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Spotify_train_dataset.csv', 'data/Spotify_test_dataset.csv')
)

def preprocess_data(df, features_columns, label = None, z_score = False, standardize = False, z_threshold = 4) :
    """Print a summary of ``df`` and return its (optionally cleaned / scaled) features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is only read; the returned objects are new.
    features_columns : list of str
        Columns of ``df`` kept as model inputs.
    label : str or one-element list of str, optional
        Column holding the class labels. When given, that column is encoded
        to integer codes with ``LabelEncoder`` and returned with the features.
    z_score : bool, default False
        Drop every row in which at least one feature satisfies
        ``|z-score| >= z_threshold``.
    standardize : bool, default False
        Centre every feature on 0 and scale it to unit variance.
    z_threshold : float, default 4
        Cut-off used by the ``z_score`` filter.

    Returns
    -------
    pandas.DataFrame
        The features, when ``label`` is None.
    tuple (pandas.DataFrame, array)
        ``(features, encoded_labels)`` when ``label`` is given; both are
        filtered with the same row mask so they stay aligned by position.

    Raises
    ------
    ValueError
        If ``label`` is a collection that does not name exactly one column.

    Notes
    -----
    With ``standardize=True`` the returned frame has a fresh 0..n-1 index;
    otherwise rows surviving the outlier filter keep their original index.
    """
    print("--------------------------------")
    print("Data infos before transformation")
    print("--------------------------------")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()
    print("--------------------------------")
    print(df.describe())
    print("--------------------------------")
    print(f"Find column with NaN values : \n{df.isna().sum()}")
    print("--------------------------------")

    # Labels: honour the `label` argument instead of a hard-coded column name.
    df_labels = None
    if label is not None :
        label_names = [label] if isinstance(label, str) else list(label)
        if len(label_names) != 1 :
            raise ValueError(
                f"label must name exactly one column, got {label_names!r}"
            )
        df_labels = LabelEncoder().fit_transform(df[label_names[0]])

    # Inputs
    features = df[features_columns]

    # Remove outliers
    if z_score :
        # Constant columns and missing values give NaN z-scores. A NaN is not
        # evidence of an outlier, so only rows with a finite |z| at or above
        # the threshold are dropped (NaN >= threshold evaluates to False).
        with np.errstate(divide='ignore', invalid='ignore') :
            abs_z_scores = np.abs(
                zscore(features.to_numpy(dtype=float), nan_policy='omit')
            )
            is_outlier = np.asarray(abs_z_scores >= z_threshold)
        keep_rows = ~is_outlier.any(axis=1)
        features = features.loc[keep_rows]
        if df_labels is not None :
            df_labels = df_labels[keep_rows]

    # Standardize: centre and reduce each column
    if standardize :
        scaled = scale(features, axis=0, with_mean=True, with_std=True)
        features = pd.DataFrame(scaled, columns=features_columns)

    print("--------------------------------")
    print("Data infos after transformation")
    print("--------------------------------")
    features.info()
    print("--------------------------------")

    if df_labels is not None :
        return features, df_labels
    return features

In [4]:
# Run the preprocessing on both datasets.
features_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]
label = ['genre']

# Training split: encode the labels, drop outlier rows, then standardize the features.
df1_preprocessed, df1_labels_preprocessed = preprocess_data(
    df1,
    features_columns,
    label,
    z_score=True,
    standardize=True,
)

# Second split: only the feature columns are selected (no label, no filtering, no scaling).
# NOTE(review): these features stay on their raw scale while the training features
# are standardized - confirm this asymmetry is intended before fitting a model.
df2 = preprocess_data(df2, features_columns)

--------------------------------
Data infos before transformation
--------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31728 entries, 0 to 31727
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      31728 non-null  float64
 1   energy            31728 non-null  float64
 2   key               31728 non-null  int64  
 3   loudness          31728 non-null  float64
 4   mode              31728 non-null  int64  
 5   speechiness       31728 non-null  float64
 6   acousticness      31728 non-null  float64
 7   instrumentalness  31728 non-null  float64
 8   liveness          31728 non-null  float64
 9   valence           31728 non-null  float64
 10  tempo             31728 non-null  float64
 11  type              31728 non-null  object 
 12  id                31728 non-null  object 
 13  uri               31728 non-null  object 
 14  track_href        31728 non-null  o

(31728, 20)