In [3]:
import ast
import pandas as pd
import numpy as np
import sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn import preprocessing, decomposition, manifold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#plotting
import matplotlib
import matplotlib.pyplot as plt

#3d plotting
from mpl_toolkits.mplot3d import Axes3D
#%matplotlib qt
#plt.ion()

# Load dataset

In [7]:
df_toptracks_features_analysis = pd.read_csv("./dataset/toptracks_ratings_features_analysis.csv")
df_toptracks_features_analysis

Unnamed: 0,id,name,range,rating,id_copy,energy,danceability,loudness,valence,tempo,speechiness,acousticness,instrumentalness,duration_ms,time_signature,pitch_avg,timbre_avg,key_change_percentage,mode_avg,id_copy_2
0,7L5IwfKB6W0tadcSh9wlyH,Ouverture,short_term,5.393939,7L5IwfKB6W0tadcSh9wlyH,0.336,0.496,-19.44,0.185,129.006,0.0513,0.955,0.912,322173,4,"[0.27, 0.58, 0.21, 0.27, 0.18, 0.34, 0.26, 0.0...","[36.4, -140.17, -20.32, -18.11, 13.38, -37.86,...",75.0,0.14,7L5IwfKB6W0tadcSh9wlyH
1,0tAZi3X7dUdd7m8OXB8pMA,Shadow,short_term,5.272727,0tAZi3X7dUdd7m8OXB8pMA,0.275,0.0832,-15.256,0.0334,170.316,0.0347,0.887,0.853,558267,1,"[0.18, 0.26, 0.64, 0.5, 0.16, 0.26, 0.16, 0.21...","[40.29, -65.03, 29.6, -16.94, 3.88, -26.49, -1...",93.333333,0.86,0tAZi3X7dUdd7m8OXB8pMA
2,1XMDIKQbV30WJPKLMN6MKv,INSTRUCTION,short_term,4.969697,1XMDIKQbV30WJPKLMN6MKv,0.355,0.491,-12.48,0.0369,145.405,0.0354,0.382,0.887,262733,4,"[0.51, 0.26, 0.22, 0.12, 0.12, 0.2, 0.21, 0.39...","[38.23, -65.25, -37.01, -15.68, 9.7, -36.31, 2...",75.0,0.36,1XMDIKQbV30WJPKLMN6MKv
3,1XZdwzd8DTDvkjVc0eJ9BI,Wildlife Analysis,short_term,4.909091,1XZdwzd8DTDvkjVc0eJ9BI,0.0204,0.155,-31.212,0.278,79.755,0.0462,0.995,0.936,75627,4,"[0.12, 0.06, 0.43, 0.1, 0.56, 0.08, 0.08, 0.16...","[28.95, -181.05, 81.47, 3.86, 94.46, -38.57, -...",0.0,0.33,1XZdwzd8DTDvkjVc0eJ9BI
4,1f4cKwcKfNiLbQr8x2tZ3C,Melt!,short_term,4.848485,1f4cKwcKfNiLbQr8x2tZ3C,0.919,0.785,-13.059,0.443,131.037,0.0552,0.00772,0.916,214307,4,"[0.63, 0.68, 0.49, 0.41, 0.46, 0.5, 0.56, 0.53...","[36.82, 22.73, -62.74, 44.28, 36.1, -56.15, -1...",85.714286,0.56,1f4cKwcKfNiLbQr8x2tZ3C


# Convert dataset to numpy array and clean/prepare for training

In [14]:
# convert dataframe rows into arrays. Every array in the 2d array is a row
features = df_toptracks_features_analysis[
    ["name", "energy", "danceability", "loudness", "valence", "tempo", 
     "speechiness", "acousticness", "instrumentalness", "duration_ms", "time_signature", 
     "pitch_avg", "timbre_avg", "key_change_percentage", "mode_avg"]].to_numpy()

target = df_toptracks_features_analysis["rating"]

print(features.shape)
print(target.shape)

(238, 15)
(238,)


In [15]:
# cleaned features
features_cleaned = features

# Substitute titles with integers
for i in range(features_cleaned.shape[0]):
    features_cleaned[i][0] = i

features_cleaned[0]

array([0, 0.336, 0.496, -19.44, 0.185, 129.006, 0.0513, 0.955, 0.912,
       322173, 4,
       '[0.27, 0.58, 0.21, 0.27, 0.18, 0.34, 0.26, 0.07, 0.08, 0.27, 0.57, 0.19]',
       '[36.4, -140.17, -20.32, -18.11, 13.38, -37.86, 3.36, 1.33, -6.62, 3.97, -0.23, 9.65]',
       75.0, 0.14], dtype=object)

In [17]:
# convert the timbre and pitch vectors, which are actually strings in the dataframe imported, to lists again
for row in range(features_cleaned.shape[0]):
    for col in range(features_cleaned.shape[1]):
        if type(features_cleaned[row][col]) == str:
            features_cleaned[row][col] = ast.literal_eval(features_cleaned[row][col])

print(features_cleaned[0])
print(features_cleaned[1])

[0 0.336 0.496 -19.44 0.185 129.006 0.0513 0.955 0.912 322173 4
 list([0.27, 0.58, 0.21, 0.27, 0.18, 0.34, 0.26, 0.07, 0.08, 0.27, 0.57, 0.19])
 list([36.4, -140.17, -20.32, -18.11, 13.38, -37.86, 3.36, 1.33, -6.62, 3.97, -0.23, 9.65])
 75.0 0.14]
[1 0.275 0.0832 -15.256 0.0334 170.316 0.0347 0.887 0.853 558267 1
 list([0.18, 0.26, 0.64, 0.5, 0.16, 0.26, 0.16, 0.21, 0.18, 0.24, 0.43, 0.27])
 list([40.29, -65.03, 29.6, -16.94, 3.88, -26.49, -13.53, -4.79, -7.63, -6.14, -3.13, -4.81])
 93.33333333333331 0.86]


In [19]:
# unpack the timbre and pitch lists within the 2d features_cleaned object
# and subesequently extent the coloumn size of the feature array.

def flatten(x):
    for item in x:
        try:
            #if x has a member (item) it means its a a list or array, therefore we feed the item back into the function.
            yield from flatten(item)
        #so if x has no members to iterate on (i.e its a float or integer), we return it (yield)
        except TypeError:
            yield item

temp_features = np.empty([])
for i in range(features_cleaned.shape[0]):
    
    # flatten row
    row = list(flatten(features_cleaned[i]))
    
    # round all values in row to 2 decimals max
    #row = [round(elem, 2) for elem in row]
    
    # make numpy array of row
    row = np.array(row)
    
    # add them together
    if i == 0:
        temp_features = row
    else:
        temp_features = np.vstack((temp_features, row))

# add to final variable
features_cleaned = temp_features

print(features_cleaned.shape)
print(features_cleaned[0])
print(features_cleaned[1])

(238, 37)
[ 0.00000e+00  3.36000e-01  4.96000e-01 -1.94400e+01  1.85000e-01
  1.29006e+02  5.13000e-02  9.55000e-01  9.12000e-01  3.22173e+05
  4.00000e+00  2.70000e-01  5.80000e-01  2.10000e-01  2.70000e-01
  1.80000e-01  3.40000e-01  2.60000e-01  7.00000e-02  8.00000e-02
  2.70000e-01  5.70000e-01  1.90000e-01  3.64000e+01 -1.40170e+02
 -2.03200e+01 -1.81100e+01  1.33800e+01 -3.78600e+01  3.36000e+00
  1.33000e+00 -6.62000e+00  3.97000e+00 -2.30000e-01  9.65000e+00
  7.50000e+01  1.40000e-01]
[ 1.00000000e+00  2.75000000e-01  8.32000000e-02 -1.52560000e+01
  3.34000000e-02  1.70316000e+02  3.47000000e-02  8.87000000e-01
  8.53000000e-01  5.58267000e+05  1.00000000e+00  1.80000000e-01
  2.60000000e-01  6.40000000e-01  5.00000000e-01  1.60000000e-01
  2.60000000e-01  1.60000000e-01  2.10000000e-01  1.80000000e-01
  2.40000000e-01  4.30000000e-01  2.70000000e-01  4.02900000e+01
 -6.50300000e+01  2.96000000e+01 -1.69400000e+01  3.88000000e+00
 -2.64900000e+01 -1.35300000e+01 -4.79000000e

# Dimensionality reduction

In [20]:
# First, find how many components I need for the dimentionalty reducction
# to keep 99% of the variance in the dataset

feat_variance = np.var(features_cleaned, axis=0).sum()

for i in range(features_cleaned.shape[1]):
        temp = np.var(features_cleaned[:,0:i+1], axis=0).sum()
        percentage = temp/feat_variance
        if percentage > 0.99:
            print("components needed: ", i+1)
            print("reached: ", percentage, "%")
            break

componenets needed:  10
reached:  0.999999468782655 %


### LDA

In [None]:
components_needed = 10

# lda instance with n-number of components
lda = LinearDiscriminantAnalysis(n_components=components_needed)

# pass/fit the lda model with the features and our target
lda.fit(features_cleaned, target)

features_projected = lda.transform(features_cleaned)

print(projected_features.shape)

In [None]:
# save dimred to file
#joblib_file = "dimred.pkl"
#joblib.dump(lda, joblib_file)