In [1]:
import pandas as pd
import numpy as np

#### Reading the dataset

In [2]:
# Load the song-feature table. The CSV also carries an 'Unnamed: 0' column
# (visible in the preview below), which a later cell drops.
df = pd.read_csv('onewayoranother3.csv')

In [3]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,acousticness,name,danceability,tempo,loudness,popularity,year,key,energy,id,instrumentalness,liveness,speechiness,valence,artists,genres
0,0,0.995,Singende Bataillone 1. Teil Carl Woitschach,0.708,0.002377,-0.002373,0.0,0.003865,0.002365,0.195,6KbQ3uYMLKb5jDxLF7wYDD,0.563,0.151,0.0506,0.779,Carl Woitschach,[]


In [4]:
# 'Unnamed: 0' is not a song feature (it looks like a saved row index -- TODO confirm),
# so it is removed before any numeric processing.
df = df.drop(columns=['Unnamed: 0'])

#### Standardising the numerical values for a more unbiased approach

In [5]:
from sklearn.preprocessing import StandardScaler

# Identifier/text columns and 'year' are excluded; everything that remains
# is used as the PCA input matrix (one row per song).
non_feature_cols = ['name','id','artists','genres','year']
X = np.array(df.drop(columns=non_feature_cols))

# Rescale each feature column to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#### Finding the covariance of the features

In [6]:
# np.cov expects one variable per row, so the samples-by-features matrix is
# transposed first. The diagonal comes out marginally above 1 because np.cov
# normalises by N-1 by default while StandardScaler scales with the
# population standard deviation.
features = np.transpose(X_scaled)
cov_matrix = np.cov(features)

In [7]:
cov_matrix

array([[ 1.00000639e+00, -2.62426141e-01, -2.04865516e-01,
        -5.61818118e-01, -5.88664084e-01, -6.19251766e-01,
        -7.47472741e-01,  3.28837313e-01, -2.34791169e-02,
        -5.53834532e-02, -1.76985562e-01],
       [-2.62426141e-01,  1.00000639e+00, -4.51420577e-03,
         2.90164490e-01,  2.12220088e-01,  1.94293968e-01,
         2.15910019e-01, -2.78621571e-01, -1.05078912e-01,
         2.29489867e-01,  5.58772789e-01],
       [-2.04865516e-01, -4.51420577e-03,  1.00000639e+00,
         2.11019272e-01,  1.34036019e-01,  1.37315799e-01,
         2.49380911e-01, -1.05347589e-01,  7.11777064e-03,
        -1.35763178e-02,  1.68807304e-01],
       [-5.61818118e-01,  2.90164490e-01,  2.11019272e-01,
         1.00000639e+00,  4.59373882e-01,  4.82042809e-01,
         7.81317823e-01, -4.08004702e-01,  5.10116811e-02,
        -1.15519789e-01,  3.04951475e-01],
       [-5.88664084e-01,  2.12220088e-01,  1.34036019e-01,
         4.59373882e-01,  1.00000639e+00,  8.87268135e-01,
  

#### Calculating the eigenvalues and eigenvectors of the covariance matrix

In [8]:
# Eigen-decomposition of the covariance matrix: `values` holds the eigenvalues
# and column i of `vectors` is the eigenvector paired with values[i].
# NOTE: the eigenvalues are NOT returned in sorted order (see the printed
# `values` below), and later cells select components by hard-coded position,
# so they depend on this exact ordering.
values, vectors = np.linalg.eig(cov_matrix)

In [9]:
values

array([3.92777602, 1.64862769, 0.12473964, 0.1085166 , 0.30346638,
       0.36473344, 1.16753202, 1.09029926, 0.62003997, 0.76101603,
       0.88332319])

In [10]:
vectors

array([[ 0.41843389,  0.05489964,  0.38954104,  0.04880205,  0.43322318,
         0.57099959,  0.10259995, -0.0842922 ,  0.29083426,  0.22140229,
         0.05307468],
       [-0.22006006,  0.4933516 ,  0.1685712 ,  0.05346585, -0.55062466,
         0.25467011,  0.37504233,  0.1905048 ,  0.33098101, -0.1416916 ,
         0.04779934],
       [-0.15066139,  0.03383708,  0.01449397,  0.00616745, -0.12660213,
         0.08955769, -0.20978111, -0.548454  ,  0.23337008,  0.02618381,
        -0.74290128],
       [-0.4081917 ,  0.04675854, -0.45140403, -0.07412507,  0.03522903,
         0.64508311, -0.08499743, -0.18995383, -0.30066568,  0.14655401,
         0.22220442],
       [-0.38805589, -0.32223697, -0.15673321,  0.65983133,  0.28800799,
         0.01992935,  0.14002663,  0.22057635,  0.34664843, -0.08917394,
        -0.07686996],
       [-0.39453667, -0.33796213,  0.15544739, -0.71339374,  0.19319371,
         0.05204587,  0.09915443,  0.22518035,  0.28063711, -0.10997048,
        -0.074

#### Calculating the explained variance ratio of each component and choosing the components with the largest ratios

In [11]:
# Explained-variance ratio of every component: each eigenvalue divided by the
# sum of all eigenvalues. The total is computed once (it is loop-invariant)
# and the division is vectorised; .tolist() keeps `explained_variances` a
# plain Python list, in the same order as `values`.
total_variance = np.sum(values)
explained_variances = (values / total_variance).tolist()

# The ratios should add up to 1.0; print them for manual component selection.
print(np.sum(explained_variances), '\n', explained_variances)

1.0 
 [0.3570682667253339, 0.14987428768017866, 0.011339894708089774, 0.009865082842647541, 0.027587676727564528, 0.03315737343515563, 0.10613859660116848, 0.09911748156683897, 0.05636691037642541, 0.06918283369094722, 0.08030159564564984]


#### Approximately 86% of the total variance is retained by the six components with the largest explained-variance ratios

In [12]:
# Positions (within `values`) of the six eigenvalues with the largest
# explained-variance ratios, as read off the printed output above. They are
# hard-coded, so they are only valid for this exact eigenvalue ordering.
component_ids = (0, 1, 6, 7, 9, 10)

# Row k of vectors.T is the k-th eigenvector; projecting the scaled data onto
# it yields one score per song for that component.
projected = [X_scaled.dot(vectors.T[k]) for k in component_ids]

In [13]:
# Each entry of `projected` becomes one ROW here (one row per selected
# component); the frame still has to be transposed to get one row per song.
res = pd.DataFrame(projected)

In [14]:
# Flip to one row per song and one column per component (columns 0-5).
# Re-running this cell on its own would flip the frame back again.
res = res.T

In [15]:
res.head()

Unnamed: 0,0,1,2,3,4,5
0,2.351675,1.779346,0.910064,-1.281001,-0.472533,0.425612
1,5.135041,-1.101633,0.733296,0.222134,-1.181466,-0.069818
2,2.011972,4.620201,-0.344635,2.715108,-0.962605,-2.328331
3,2.897644,1.71422,1.348373,-0.943884,-1.515001,0.436541
4,4.298736,-1.524035,0.117903,-0.091247,-0.909719,1.037288


#### The original song feature columns are replaced by the six component scores obtained from PCA

In [16]:
# Index-aligned join: appends the component columns 0-5 from `res` to the
# rows of `df` that share the same index label.
# Not idempotent -- running this cell twice would try to add columns 0-5 again.
df = df.join(res)

In [17]:
df.head(2)

Unnamed: 0,acousticness,name,danceability,tempo,loudness,popularity,year,key,energy,id,...,speechiness,valence,artists,genres,0,1,2,3,4,5
0,0.995,Singende Bataillone 1. Teil Carl Woitschach,0.708,0.002377,-0.002373,0.0,0.003865,0.002365,0.195,6KbQ3uYMLKb5jDxLF7wYDD,...,0.0506,0.779,Carl Woitschach,[],2.351675,1.779346,0.910064,-1.281001,-0.472533,0.425612
1,0.994,"Fantasiestücke, Op. 111: Più tosto lento Rober...",0.379,0.001685,-0.005434,0.0,0.003092,0.002365,0.0135,6KuQTIu1KoTTkLXKrwlLPV,...,0.0462,0.0767,Robert Schumann,"['classical', 'early romantic era']",5.135041,-1.101633,0.733296,0.222134,-1.181466,-0.069818


In [18]:
# Keep only the descriptive columns plus the six PCA component columns
# (integer labels 0-5); the raw audio-feature columns are discarded here.
metadata_cols = ['name','id','artists','genres','year']
component_cols = [0, 1, 2, 3, 4, 5]
df = df[metadata_cols + component_cols]

In [19]:
df.head()

Unnamed: 0,name,id,artists,genres,year,0,1,2,3,4,5
0,Singende Bataillone 1. Teil Carl Woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,[],0.003865,2.351675,1.779346,0.910064,-1.281001,-0.472533,0.425612
1,"Fantasiestücke, Op. 111: Più tosto lento Rober...",6KuQTIu1KoTTkLXKrwlLPV,Robert Schumann,"['classical', 'early romantic era']",0.003092,5.135041,-1.101633,0.733296,0.222134,-1.181466,-0.069818
2,Chapter 1.18 - Zamek kaniowski Seweryn Goszczy...,6L63VW0PibdM1HDSBoqnoM,Seweryn Goszczyński,[],0.001932,2.011972,4.620201,-0.344635,2.715108,-0.962605,-2.328331
3,Bebamos Juntos - Instrumental (Remasterizado) ...,6M94FkXd15sOAOQYRnWPN8,Francisco Canaro,"['tango', 'vintage tango']",0.000386,2.897644,1.71422,1.348373,-0.943884,-1.515001,0.436541
4,"Polonaise-Fantaisie in A-Flat Major, Op. 61 Fr...",6N6tiFZ9vLTSOIxkj8qKrd,Frédéric Chopin,"['classical', 'early romantic era', 'polish cl...",0.004251,4.298736,-1.524035,0.117903,-0.091247,-0.909719,1.037288


In [20]:
df.shape

(156581, 11)

In [21]:
# Optional export of the PCA-only table, left disabled:
#df.to_csv('PCADWG.csv')

# Load the table that carries the cluster assignments and attach it to the
# PCA table by song name. Columns present in both frames come back with
# _x (from df) and _y (from newdf) suffixes.
# NOTE(review): this assumes 'name' is unique in both frames; duplicated
# names would multiply rows -- confirm against the data.
newdf = pd.read_csv('Kmeansfromscratch2.csv')
dfnew = df.merge(newdf, on='name')

In [22]:
dfnew.head(1)

Unnamed: 0,name,id_x,artists_x,genres_x,year_x,0,1,2,3,4,...,energy,id_y,instrumentalness,liveness,sub_cluster,speechiness,valence,artists_y,genres_y,cluster_no
0,Singende Bataillone 1. Teil Carl Woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,[],0.003865,2.351675,1.779346,0.910064,-1.281001,-0.472533,...,0.195,6KbQ3uYMLKb5jDxLF7wYDD,0.563,0.151,0.0,0.0506,0.779,Carl Woitschach,[],2.0


In [23]:
# Columns no longer needed after the merge: the saved index, the raw audio
# features (superseded by the PCA components) and one copy of each
# duplicated metadata column.
redundant_cols = [
    'Unnamed: 0', "id_y", 'year_y',
    'acousticness', 'danceability', 'instrumentalness', 'key', 'liveness',
    'popularity', 'valence', 'tempo', 'loudness', 'energy', 'speechiness',
    'artists_y', 'genres_x',
]
dfnew = dfnew.drop(columns=redundant_cols)

# Strip the merge suffixes from the surviving metadata columns.
dfnew = dfnew.rename(columns={
    'id_x': "id",
    'year_x': 'year',
    'artists_x': 'artists',
    'genres_y': 'genres',
})

# Lower-case every song name. Note that `X` is reused here and no longer
# holds the feature matrix built earlier in the notebook.
X = np.array([title.lower() for title in dfnew['name']], dtype=object)

# Swap the original 'name' column for the lower-cased one, kept as the first column.
dfnew = dfnew.drop(columns=['name'])
dfnew.insert(loc=0, column='name', value=X)

In [24]:
dfnew.head(1)

Unnamed: 0,name,id,artists,year,0,1,2,3,4,5,sub_cluster,genres,cluster_no
0,singende bataillone 1. teil carl woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,0.003865,2.351675,1.779346,0.910064,-1.281001,-0.472533,0.425612,0.0,[],2.0


In [25]:
# Final column order: song metadata, the six PCA components, then the
# cluster labels.
metadata_cols = ['name','id','artists','genres','year']
component_cols = [0, 1, 2, 3, 4, 5]
cluster_cols = ['cluster_no','sub_cluster']
dfnew = dfnew[metadata_cols + component_cols + cluster_cols]

In [26]:
dfnew.head(1)

Unnamed: 0,name,id,artists,genres,year,0,1,2,3,4,5,cluster_no,sub_cluster
0,singende bataillone 1. teil carl woitschach,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,[],0.003865,2.351675,1.779346,0.910064,-1.281001,-0.472533,0.425612,2.0,0.0


In [27]:
dfnew.shape

(156581, 13)

In [28]:
# Persist the final table. With the default settings the row index is written
# too, as an unnamed first column of the CSV.
dfnew.to_csv("PCAClustering2.csv")