# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
import random
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import  LinearRegression

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import initializers
from sklearn.utils import compute_class_weight

from sklearn.decomposition import PCA

# Load Dataset

In [2]:
# Read the raw tracks table from disk and preview its first rows.
raw_csv_path = '../../Dataset/tracks.csv'
tracks = pd.read_csv(raw_csv_path)
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


# Convert the categorical `key` and `mode` columns to one-hot encoding

In [3]:
# Isolate `key` as a one-column frame (the encoder expects 2-D input) and
# expand it into a dense int8 indicator matrix.
key_encoder = OneHotEncoder(dtype=np.int8)
df_key = tracks[['key']]
df_key_onehot = key_encoder.fit_transform(df_key).toarray()

In [4]:
# The raw `key` column is no longer needed once it has been encoded.
tracks.drop(columns=['key'], inplace=True)

# Column labels for the encoded matrix: key_0 ... key_11.
# NOTE(review): assumes all twelve key values occur in the data - confirm.
key_indices = ['key_' + str(key_number) for key_number in range(12)]
print(key_indices)

['key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11']


In [5]:
# Label the encoded key matrix and attach it to the frame; `join` aligns the
# two frames on their row index.
key_onehot_frame = pd.DataFrame(data=df_key_onehot, columns=key_indices)
tracks = tracks.join(key_onehot_frame)

In [6]:
# Preview the frame to confirm the key_* indicator columns were joined on.
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,1,0,0,0,0,0,0,0,0,0,0,0
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,1,0,0,0,0,0,0,0,0,0,0,0
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,0,1,0,0,0,0,0,0,0,0,0,0
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,0,0,0,0,0,0,0,1,0,0,0,0
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,0,0,0,1,0,0,0,0,0,0,0,0


In [7]:
# Same treatment for `mode`: pull it out as a one-column frame and expand it
# into a dense int8 indicator matrix.
mode_encoder = OneHotEncoder(dtype=np.int8)
df_mode = tracks[['mode']]
df_mode_onehot = mode_encoder.fit_transform(df_mode).toarray()

In [8]:
# Replace the raw `mode` column with its two indicator columns.
# The first encoded column is labelled 'minor' and the second 'major'.
# NOTE(review): assumes `mode` only takes the values 0 and 1 - confirm.
mode_onehot_frame = pd.DataFrame(data=df_mode_onehot, columns=['minor', 'major'])
tracks = tracks.drop(columns=['mode']).join(mode_onehot_frame)

In [9]:
# Preview the frame after `mode` was swapped for the minor/major indicators.
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,minor,major
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,-13.338,0.451,0.674,0.744,0.151,0.127,104.851,3,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,-22.136,0.957,0.797,0.0,0.148,0.655,102.009,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,-21.18,0.0512,0.994,0.0218,0.212,0.457,130.418,5,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,-27.961,0.0504,0.995,0.918,0.104,0.397,169.98,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,-16.9,0.039,0.989,0.13,0.311,0.196,103.22,4,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [10]:
# List the column names after the one-hot encoding steps.
tracks.columns

Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'id_artists', 'release_date', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'key_0', 'key_1', 'key_2',
       'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'minor', 'major'],
      dtype='object')

In [11]:
# Show the last rows of the frame.
tracks.tail()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,minor,major
586667,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,['阿YueYue'],['1QLBXKM5GCpyQQSVMNZqrZ'],2020-09-26,0.56,0.518,-7.471,0.0292,0.785,0.0,0.0648,0.211,131.896,4,1,0,0,0,0,0,0,0,0,0,0,0,1,0
586668,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020-10-21,0.765,0.663,-5.223,0.0652,0.141,0.000297,0.0924,0.686,150.091,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1
586669,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,['FINNEAS'],['37M5pPGs6V1fchFJSgCguX'],2020-09-02,0.535,0.314,-12.823,0.0408,0.895,0.00015,0.0874,0.0663,145.095,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0
586670,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","['4jGPdu95icCKVF31CcFKbS', '5ebPSE9YI5aLeZ1Z2g...",2021-03-05,0.696,0.615,-6.212,0.0345,0.206,3e-06,0.305,0.438,90.029,4,0,0,0,0,0,0,0,0,0,0,1,0,0,1
586671,5Ocn6dZ3BJFPWh4ylwFXtn,Mar de Emociones,38,214360,0,['Afrosound'],['0i4Qda0k4nf7jnNHmSNpYv'],2015-07-01,0.686,0.723,-7.067,0.0363,0.105,0.0,0.264,0.975,112.204,4,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [12]:
# Reduce `release_date` to its four-digit year. The column mixes full dates
# ('1922-02-22') with bare years ('1922'); taking the first four characters
# handles both, so no length check is needed.
RELEASE_YEAR_CUTOFF = 1980
tracks['release_date'] = tracks['release_date'].str.slice(start=0, stop=4).astype(int)

# Keep only tracks released in or after the cutoff year, then renumber rows.
tracks = tracks[tracks['release_date'] >= RELEASE_YEAR_CUTOFF].reset_index(drop=True)
tracks.tail()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,minor,major
403496,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,['阿YueYue'],['1QLBXKM5GCpyQQSVMNZqrZ'],2020,0.56,0.518,-7.471,0.0292,0.785,0.0,0.0648,0.211,131.896,4,1,0,0,0,0,0,0,0,0,0,0,0,1,0
403497,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020,0.765,0.663,-5.223,0.0652,0.141,0.000297,0.0924,0.686,150.091,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1
403498,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,['FINNEAS'],['37M5pPGs6V1fchFJSgCguX'],2020,0.535,0.314,-12.823,0.0408,0.895,0.00015,0.0874,0.0663,145.095,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0
403499,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","['4jGPdu95icCKVF31CcFKbS', '5ebPSE9YI5aLeZ1Z2g...",2021,0.696,0.615,-6.212,0.0345,0.206,3e-06,0.305,0.438,90.029,4,0,0,0,0,0,0,0,0,0,0,1,0,0,1
403500,5Ocn6dZ3BJFPWh4ylwFXtn,Mar de Emociones,38,214360,0,['Afrosound'],['0i4Qda0k4nf7jnNHmSNpYv'],2015,0.686,0.723,-7.067,0.0363,0.105,0.0,0.264,0.975,112.204,4,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [13]:
# Sanity check: count missing values in the column at position 19 and print
# it next to the current row count.
probe_column = tracks.iloc[:, 19]
count_nan = probe_column.isna().sum()
print(len(tracks), count_nan)

403501 0


In [14]:
# Remove every track whose popularity is exactly zero, renumber the rows and
# print a summary of the remaining frame.
is_zero_popularity = tracks['popularity'].eq(0)
zeroPopularity = tracks.index[is_zero_popularity]
tracks = tracks.drop(index=zeroPopularity).reset_index(drop=True)
tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394390 entries, 0 to 394389
Data columns (total 32 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                394390 non-null  object 
 1   name              394387 non-null  object 
 2   popularity        394390 non-null  int64  
 3   duration_ms       394390 non-null  int64  
 4   explicit          394390 non-null  int64  
 5   artists           394390 non-null  object 
 6   id_artists        394390 non-null  object 
 7   release_date      394390 non-null  int32  
 8   danceability      394390 non-null  float64
 9   energy            394390 non-null  float64
 10  loudness          394390 non-null  float64
 11  speechiness       394390 non-null  float64
 12  acousticness      394390 non-null  float64
 13  instrumentalness  394390 non-null  float64
 14  liveness          394390 non-null  float64
 15  valence           394390 non-null  float64
 16  tempo             39

In [15]:
# Convert the track length from milliseconds to whole seconds with a
# vectorised expression instead of a per-row Python lambda. Series.round()
# rounds halves to the nearest even integer, the same rule as the built-in
# round(); the explicit int64 cast keeps the dtype platform-independent.
tracks['duration'] = (tracks['duration_ms'] / 1000).round().astype('int64')

# The millisecond column is superseded by `duration`.
tracks = tracks.drop(columns='duration_ms')
tracks.duration.head()

0    160
1    160
2    142
3    157
4    187
Name: duration, dtype: int64

In [17]:
# Preview the frame now that `duration` has replaced `duration_ms`.
tracks.head()

Unnamed: 0,id,name,popularity,explicit,artists,id_artists,release_date,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,minor,major,duration
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,0,['Gerry & The Pacemakers'],['3UmBeGyNwr4iDWi1vTxWi8'],2008,0.484,0.265,-11.101,0.0322,0.394,0.0,0.149,0.285,113.564,3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,160
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,0,['The Toys'],['6lH5PpuiMa5SpfjoIOlwCS'],2020,0.671,0.867,-2.706,0.0571,0.436,0.0,0.139,0.839,120.689,4,0,0,1,0,0,0,0,0,0,0,0,0,0,1,160
2,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,0,['Gerry & The Pacemakers'],['3UmBeGyNwr4iDWi1vTxWi8'],2008,0.405,0.365,-10.226,0.0289,0.255,5e-06,0.163,0.588,104.536,4,0,0,0,0,0,0,1,0,0,0,0,0,1,0,142
3,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,0,['Gerry & The Pacemakers'],['3UmBeGyNwr4iDWi1vTxWi8'],2008,0.477,0.352,-14.165,0.03,0.406,0.0,0.122,0.478,106.773,4,0,1,0,0,0,0,0,0,0,0,0,0,0,1,157
4,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],2018,0.319,0.201,-17.796,0.0623,0.887,0.0,0.904,0.239,117.153,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1,187


In [18]:
# Discard the track-name and artist columns.
tracks.drop(columns=['name', 'artists', 'id_artists'], inplace=True)

In [20]:
# Discard the track identifier column.
tracks.drop(columns=['id'], inplace=True)

In [21]:
# Preview the columns that remain after the identifier and text columns were dropped.
tracks.head()

Unnamed: 0,popularity,explicit,release_date,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,minor,major,duration
0,56,0,2008,0.484,0.265,-11.101,0.0322,0.394,0.0,0.149,0.285,113.564,3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,160
1,41,0,2020,0.671,0.867,-2.706,0.0571,0.436,0.0,0.139,0.839,120.689,4,0,0,1,0,0,0,0,0,0,0,0,0,0,1,160
2,40,0,2008,0.405,0.365,-10.226,0.0289,0.255,5e-06,0.163,0.588,104.536,4,0,0,0,0,0,0,1,0,0,0,0,0,1,0,142
3,34,0,2008,0.477,0.352,-14.165,0.03,0.406,0.0,0.122,0.478,106.773,4,0,1,0,0,0,0,0,0,0,0,0,0,0,1,157
4,26,0,2018,0.319,0.201,-17.796,0.0623,0.887,0.0,0.904,0.239,117.153,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1,187


In [22]:
# Drop tracks whose tempo truncates to zero (the int cast discards the
# fractional part, so any tempo strictly between -1 and 1 is included).
zeroTempo = tracks[tracks['tempo'].astype(int) == 0].index
tracks.drop(zeroTempo, inplace=True)

# Renumber the rows of `tracks` itself. The previous code assigned the result
# to a misspelled name (`track`), leaving `tracks` with a gapped index.
tracks = tracks.reset_index(drop=True)

In [26]:
def get_outlier_counts(df, treshold):
    df = df.copy()
    #Get z-score for specified treshold. shitft and scale, ne kadar mean den uzaklar ı hesaplarız.
    treshold_z_score = stats.norm.ppf(treshold) #norm distribution. ppf: percent point funct. scipy içindfe bir istatik func. cdf nin tersi

    #get the z-scores for each value in track 
    z_score_df = pd.DataFrame(np.abs(stats.zscore(df)), columns=df.columns)
    
    #compare df z-scores to the treshold, Return the count of outliers in each column
    return (z_score_df > treshold_z_score).sum(axis=0)

In [27]:
# Count the values in each column that lie beyond the 0.999999995 quantile
# of the standard normal (roughly 5.7 standard deviations from the mean).
get_outlier_counts(tracks, treshold=0.999999995)

popularity              0
explicit                0
release_date            0
danceability            0
energy                  0
loudness              403
speechiness             0
acousticness            0
instrumentalness        0
liveness                0
valence                 0
tempo                   0
time_signature       3225
key_0                   0
key_1                   0
key_2                   0
key_3               11617
key_4                   0
key_5                   0
key_6                   0
key_7                   0
key_8                   0
key_9                   0
key_10                  0
key_11                  0
minor                   0
major                   0
duration              932
dtype: int64

In [28]:
def remove_outliers(df, treshold, ignore_columns=('time_signature',)):
    """Drop every row that holds a z-score outlier in any checked column.

    Two groups of columns are excluded from the check:

    * the columns named in ``ignore_columns`` (``time_signature`` by default,
      matching the previous hard-coded exemption);
    * 0/1 indicator columns such as one-hot encodings. A z-score is not a
      meaningful outlier test for a binary flag: any category rarer than a
      few percent exceeds the cutoff, so every row of that category would be
      deleted.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; it is not modified.
    treshold : float
        Probability passed to the standard normal percent-point function
        (inverse CDF) to obtain the z-score cutoff.
    ignore_columns : iterable of str, optional
        Column names that never count towards the outlier decision.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the outlier rows, renumbered from 0.
    """
    # z-score at the requested quantile of the standard normal distribution.
    treshold_z_score = stats.norm.ppf(treshold)

    # Absolute z-score of every value, computed column by column.
    z_score_df = pd.DataFrame(np.abs(stats.zscore(df)), columns=df.columns)

    # Columns holding nothing but 0/1 (missing values aside) are indicators.
    binary_columns = [
        col for col in df.columns if df[col].dropna().isin([0, 1]).all()
    ]
    skipped = set(ignore_columns) | set(binary_columns)
    checked_columns = [col for col in z_score_df.columns if col not in skipped]

    # A row is an outlier when any checked column exceeds the cutoff. The flag
    # is converted to a plain array so it masks rows by position, whatever
    # index labels `df` carries.
    is_outlier_row = (
        (z_score_df[checked_columns] > treshold_z_score).any(axis=1).to_numpy()
    )

    return df.loc[~is_outlier_row].reset_index(drop=True)

In [29]:
# Drop the rows flagged as outliers at the 0.999999995 quantile of the
# standard normal and keep the result as a separate frame.
tracks2 = remove_outliers(df=tracks, treshold=0.999999995)

In [32]:
# Write the cleaned frame to disk without the row index.
processed_csv_path = '../../Dataset/data_processed.csv'
tracks2.to_csv(processed_csv_path, index=False)