In [29]:
import pandas as pd
import numpy as np

#### Reading the dataset

In [30]:
df = pd.read_csv('onewayoranother3.csv')

In [31]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,acousticness,name,danceability,tempo,loudness,popularity,key,energy,id,instrumentalness,liveness,speechiness,valence,year,artists,genres
0,0,0.995,Singende Bataillone 1. Teil Carl Woitschach,0.708,0.002377,-0.002373,0.0,0.003865,0.195,6KbQ3uYMLKb5jDxLF7wYDD,0.563,0.151,0.0506,0.779,1928,Carl Woitschach,[]


In [32]:
df = df.drop(['Unnamed: 0'],axis=1)

#### Standardising the numerical values for a more unbaised approach

In [33]:
from sklearn.preprocessing import StandardScaler
X = np.array(df.drop(['name','id','artists','genres','year'],axis=1))
X_scaled = StandardScaler().fit_transform(X)

#### Finding the covariance of the features

In [34]:
features = X_scaled.T
cov_matrix = np.cov(features)

In [35]:
cov_matrix

array([[ 1.00000639e+00, -2.62426141e-01, -2.04865516e-01,
        -5.61818118e-01, -5.88664084e-01, -2.22541927e-02,
        -7.47472741e-01,  3.28837313e-01, -2.34791169e-02,
        -5.53834532e-02, -1.76985562e-01],
       [-2.62426141e-01,  1.00000639e+00, -4.51420577e-03,
         2.90164490e-01,  2.12220088e-01,  2.31194907e-02,
         2.15910019e-01, -2.78621571e-01, -1.05078912e-01,
         2.29489867e-01,  5.58772789e-01],
       [-2.04865516e-01, -4.51420577e-03,  1.00000639e+00,
         2.11019272e-01,  1.34036019e-01,  2.78923160e-03,
         2.49380911e-01, -1.05347589e-01,  7.11777064e-03,
        -1.35763178e-02,  1.68807304e-01],
       [-5.61818118e-01,  2.90164490e-01,  2.11019272e-01,
         1.00000639e+00,  4.59373882e-01,  2.23796702e-02,
         7.81317823e-01, -4.08004702e-01,  5.10116811e-02,
        -1.15519789e-01,  3.04951475e-01],
       [-5.88664084e-01,  2.12220088e-01,  1.34036019e-01,
         4.59373882e-01,  1.00000639e+00,  1.14767983e-02,
  

#### calculating eigenvalues and eigenvectors

In [36]:
values, vectors = np.linalg.eig(cov_matrix)

In [37]:
values

array([3.4111866 , 1.47299421, 0.12397608, 0.28831681, 0.36346246,
       0.54558115, 1.15931538, 0.75016941, 0.8753232 , 1.01167171,
       0.99807323])

In [38]:
vectors

array([[-4.41062784e-01,  1.53519034e-01, -3.93870283e-01,
        -4.35260237e-01, -5.35829201e-01, -7.37003338e-02,
        -9.38965600e-02,  3.14365180e-01, -1.01190576e-01,
         1.63954155e-01, -1.74758798e-03],
       [ 2.67589375e-01,  5.48817259e-01, -1.74521782e-01,
         4.83230741e-01, -3.05990059e-01, -3.99450328e-01,
        -2.79967754e-01, -9.51425145e-02, -6.67076007e-02,
        -1.29485927e-01, -3.72116841e-02],
       [ 1.75536867e-01, -9.47967051e-02, -1.51798418e-02,
         1.06349243e-01, -1.03040671e-01, -2.00464525e-01,
         8.23870308e-02,  6.73899981e-02,  6.38308758e-01,
         6.91618644e-01, -2.74398328e-02],
       [ 4.52473001e-01, -1.13787512e-01,  4.56994220e-01,
        -5.03302084e-02, -6.30068168e-01,  3.46802308e-01,
         1.23776352e-02,  1.10056667e-01, -1.94177832e-01,
         6.09338557e-02, -1.53656784e-03],
       [ 3.56308026e-01, -3.17148192e-01,  6.76277516e-03,
        -5.18496066e-01, -5.96476788e-02, -5.89642111e-01,
  

#### calculating the explained variances and choosing the higher values

In [39]:
explained_variances = []
for i in range(len(values)):
    explained_variances.append(values[i] / np.sum(values))
 
print(np.sum(explained_variances), '\n', explained_variances)

1.0 
 [0.310105891891705, 0.13390770964004015, 0.01127048112166851, 0.026210451827858625, 0.033041830347397376, 0.04959796987187985, 0.10539163440482817, 0.06819678328469223, 0.07957432839063136, 0.09196956835848713, 0.09073335086081152]


#### Approximately 86% of the values have been incorporated

In [40]:
projected = list()
projected.append(X_scaled.dot(vectors.T[0]))
projected.append(X_scaled.dot(vectors.T[1]))
projected.append(X_scaled.dot(vectors.T[6]))
projected.append(X_scaled.dot(vectors.T[7]))
projected.append(X_scaled.dot(vectors.T[9]))
projected.append(X_scaled.dot(vectors.T[10]))

In [41]:
res = pd.DataFrame(projected)

In [42]:
res = res.transpose()

In [43]:
res.head()

Unnamed: 0,0,1,2,3,4,5
0,-1.54146,1.561465,-1.091242,-0.244909,1.274373,1.348374
1,-4.85588,-0.493071,-0.667723,-1.258564,-0.355632,0.966655
2,-1.030287,4.954395,1.328649,-1.29425,-1.225028,-0.283366
3,-2.21572,1.6448,-1.406495,-1.351455,1.029483,-1.174713
4,-3.944471,-1.295857,-0.258271,-1.128535,-0.513041,1.883192


#### Song feature measures are replaced by the values found after conducting PCA 

In [44]:
df = df.join(res)

In [45]:
df.head(2)

Unnamed: 0,acousticness,name,danceability,tempo,loudness,popularity,key,energy,id,instrumentalness,...,valence,year,artists,genres,0,1,2,3,4,5
0,0.995,Singende Bataillone 1. Teil Carl Woitschach,0.708,0.002377,-0.002373,0.0,0.003865,0.195,6KbQ3uYMLKb5jDxLF7wYDD,0.563,...,0.779,1928,Carl Woitschach,[],-1.54146,1.561465,-1.091242,-0.244909,1.274373,1.348374
1,0.994,"Fantasiestücke, Op. 111: Più tosto lento Rober...",0.379,0.001685,-0.005434,0.0,0.003092,0.0135,6KuQTIu1KoTTkLXKrwlLPV,0.901,...,0.0767,1928,Robert Schumann,"['classical', 'early romantic era']",-4.85588,-0.493071,-0.667723,-1.258564,-0.355632,0.966655


In [46]:
df = df[['name','id','artists','genres','year',0,1,2,3,4,5]]

In [47]:
df.head()

Unnamed: 0,name,id,artists,genres,year,0,1,2,3,4,5
0,Singende Bataillone 1. Teil Carl Woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,[],1928,-1.54146,1.561465,-1.091242,-0.244909,1.274373,1.348374
1,"Fantasiestücke, Op. 111: Più tosto lento Rober...",6KuQTIu1KoTTkLXKrwlLPV,Robert Schumann,"['classical', 'early romantic era']",1928,-4.85588,-0.493071,-0.667723,-1.258564,-0.355632,0.966655
2,Chapter 1.18 - Zamek kaniowski Seweryn Goszczy...,6L63VW0PibdM1HDSBoqnoM,Seweryn Goszczyński,[],1928,-1.030287,4.954395,1.328649,-1.29425,-1.225028,-0.283366
3,Bebamos Juntos - Instrumental (Remasterizado) ...,6M94FkXd15sOAOQYRnWPN8,Francisco Canaro,"['tango', 'vintage tango']",1928,-2.21572,1.6448,-1.406495,-1.351455,1.029483,-1.174713
4,"Polonaise-Fantaisie in A-Flat Major, Op. 61 Fr...",6N6tiFZ9vLTSOIxkj8qKrd,Frédéric Chopin,"['classical', 'early romantic era', 'polish cl...",1928,-3.944471,-1.295857,-0.258271,-1.128535,-0.513041,1.883192


In [48]:
df.shape

(156581, 11)

In [49]:
#df.to_csv('PCADWG.csv')
newdf = pd.read_csv('Kmeansfromscratch2.csv')
dfnew = pd.merge(df, newdf, left_on ='name', right_on = 'name')

In [50]:
dfnew.head(1)

Unnamed: 0,name,id_x,artists_x,genres_x,year_x,0,1,2,3,4,...,id_y,instrumentalness,liveness,speechiness,sub_cluster,valence,year_y,artists_y,genres_y,cluster_no
0,Singende Bataillone 1. Teil Carl Woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,[],1928,-1.54146,1.561465,-1.091242,-0.244909,1.274373,...,6KbQ3uYMLKb5jDxLF7wYDD,0.563,0.151,0.0506,3.0,0.779,0.08,Carl Woitschach,[],0.0


In [51]:
dfnew = dfnew.drop(['Unnamed: 0',"id_y",'year_y','acousticness','danceability','instrumentalness','key','liveness','popularity','valence','tempo','loudness','energy','speechiness','artists_y','genres_x'],axis=1)
dfnew = dfnew.rename(columns= {'id_x':"id", 'year_x':'year','artists_x':'artists','genres_y':'genres'})
X = np.array(dfnew['name'])
for i in range(0,len(X)):
    X[i] = X[i].lower()
dfnew = dfnew.drop(['name'],axis=1)
dfnew.insert(loc = 0, column = 'name',value = X)

In [52]:
dfnew.head(1)

Unnamed: 0,name,id,artists,year,0,1,2,3,4,5,sub_cluster,genres,cluster_no
0,singende bataillone 1. teil carl woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,1928,-1.54146,1.561465,-1.091242,-0.244909,1.274373,1.348374,3.0,[],0.0


In [53]:
dfnew = dfnew[['name','id','artists','genres','year',0,1,2,3,4,5,'cluster_no','sub_cluster']]

In [54]:
dfnew.head(1)

Unnamed: 0,name,id,artists,genres,year,0,1,2,3,4,5,cluster_no,sub_cluster
0,singende bataillone 1. teil carl woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,[],1928,-1.54146,1.561465,-1.091242,-0.244909,1.274373,1.348374,0.0,3.0


In [55]:
dfnew.shape

(156581, 13)

In [56]:
dfnew.to_csv("PCAClustering2.csv")