# Open data Metrics

In [1]:
# Mount Google Drive inside the Colab runtime so that files stored on Drive
# are reachable through ordinary filesystem paths under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd

In [3]:
# Directory on the mounted Drive that holds the data, and the CSV to load.
folder = "/content/gdrive/My Drive/Colab/Thesis/data/"
metrics_file_name = "Metrics.csv"
# `folder` already ends with a slash, so joining the two pieces directly
# yields the full path to the file.
path_metrics = f"{folder}{metrics_file_name}"

In [4]:
# Read the metrics table from the CSV on Drive into a DataFrame.
metrics = pd.read_csv(filepath_or_buffer=path_metrics)

In [5]:
# Preview the first rows to check that the file loaded with the expected columns.
metrics.head()

Unnamed: 0,C1_metrics,C2_metrics,C3_metrics,C4_metrics,L1_metrics,L2_metrics,L3_metrics,S1_metrics,S2_metrics,S3_metrics,S4_metrics,T2_metrics
0,0.831126,0.339699,0.703,0.7442,0.226134,0.079115,0.061708,0.061706,0.35233,0.0606,0.0545096,2000
1,0.82926,0.041134,0.7458,0.7457,0.226652,0.079481,0.060633,0.181418,1.288584,0.1796,0.00120525,200
2,0.835254,0.024082,0.7019,0.7419,0.223276,0.076781,0.058414,0.234123,1.829934,0.2317,4.539511e-08,100
3,0.422846,0.172853,0.7476,0.9764,0.411699,0.205707,0.193441,0.377738,0.41319,0.3752,0.3406492,2000
4,0.427914,0.027137,0.7033,0.9758,0.407773,0.203658,0.191667,0.409241,1.307032,0.4086,0.00420248,200


In [6]:
# Remember the column labels so they can be re-attached to the scaled values,
# which are wrapped in a new DataFrame in the standardization step.
columns = metrics.columns

# Standardize the Data

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Standardize every metric (zero mean, unit variance) so that no metric
# dominates the PCA merely because of its scale (e.g. T2 is in the hundreds
# or thousands while the other metrics are mostly below 2).
# The row index of `metrics` is passed through explicitly: without it the
# scaled frame would get a fresh RangeIndex and could no longer be aligned
# row-by-row with `metrics` if that frame ever carries a non-default index.
standard_metrics = pd.DataFrame(
    data=StandardScaler().fit_transform(metrics),
    columns=columns,
    index=metrics.index,
)

In [9]:
# Preview the standardized values; every column should now be on a comparable scale.
standard_metrics.head()

Unnamed: 0,C1_metrics,C2_metrics,C3_metrics,C4_metrics,L1_metrics,L2_metrics,L3_metrics,S1_metrics,S2_metrics,S3_metrics,S4_metrics,T2_metrics
0,1.803166,3.175816,0.907475,-0.860778,-1.059745,-1.168259,-1.22177,-1.706258,-1.365168,-1.712257,-0.238251,1.412667
1,1.795837,-0.256336,1.734492,-0.85137,-1.055575,-1.163466,-1.235118,-0.960036,0.19404,-0.97113,-0.56683,-0.649063
2,1.819371,-0.452356,0.88622,-0.875203,-1.082741,-1.198838,-1.262657,-0.631498,1.095586,-0.646653,-0.574259,-0.763604
3,0.200387,1.257838,1.769273,0.595518,0.433552,0.489812,0.413223,0.26372,-1.263814,0.24706,1.525573,1.412667
4,0.220284,-0.417232,0.913272,0.591755,0.401966,0.462981,0.391209,0.460095,0.224762,0.455074,-0.548355,-0.649063


# Principal Component Analysis (PCA)

In [10]:
from sklearn.decomposition import PCA
import plotly.express as px

In [11]:
# Number of principal components to keep. Set this to
# standard_metrics.shape[1] to keep every component.
n_components = 3
pca = PCA(n_components=n_components)

# Column names p1..pN are derived from n_components so the two can never get
# out of sync when the number of components is changed.
pca_columns = [f"p{i + 1}" for i in range(n_components)]

# Fit the PCA on the standardized metrics and project every row onto the
# retained components.
principalComponents = pca.fit_transform(standard_metrics)

# Labelled view of the projection; the row index is carried over so each
# projected row can be matched back to its source row.
pca_metrics = pd.DataFrame(
    data=principalComponents,
    columns=pca_columns,
    index=standard_metrics.index,
)

In [12]:
# Scatter of the data in the plane of the first two principal components.
# Plotting the labelled `pca_metrics` frame (instead of the raw array with
# positional columns 0 and 1) gives meaningful axis and hover names; each
# component label also reports its share of explained variance.
fig = px.scatter(
    pca_metrics,
    x=pca_columns[0],
    y=pca_columns[1],
    labels={
        name: f"PC {i + 1} ({ratio:.1f}%)"
        for i, (name, ratio) in enumerate(
            zip(pca_columns, pca.explained_variance_ratio_ * 100)
        )
    },
    title="Standardized metrics projected onto the first two principal components",
)
fig.show()

In [13]:
# Map each component's positional index (as a string) to a readable label
# that includes the percentage of variance the component explains.
labels = dict(
    (str(idx), f"PC {idx+1} ({pct:.1f}%)")
    for idx, pct in enumerate(pca.explained_variance_ratio_ * 100)
)

In [14]:
# Show the generated component labels with their explained-variance percentages.
labels

{'0': 'PC 1 (61.2%)', '1': 'PC 2 (26.2%)', '2': 'PC 3 (7.2%)'}

In [15]:
import numpy as np

In [16]:
# Loadings: each principal axis scaled by the square root of the variance
# explained along that axis. Rows are the original metrics, columns the
# retained components.
# NOTE(review): a few magnitudes come out slightly above 1 — presumably
# because StandardScaler divides by the population std (ddof=0) while PCA's
# explained_variance_ uses n-1; confirm against the scikit-learn docs.
loadings = np.sqrt(pca.explained_variance_) * pca.components_.T

loading_matrix = pd.DataFrame(
    loadings,
    index=standard_metrics.columns,
    columns=pca_columns,
)
loading_matrix

Unnamed: 0,p1,p2,p3
C1_metrics,0.884291,-0.010512,0.442613
C2_metrics,0.494245,0.731862,0.21769
C3_metrics,-0.608854,0.17617,0.770728
C4_metrics,-0.98017,0.065852,0.119329
L1_metrics,-1.005059,0.072497,0.019556
L2_metrics,-1.005855,0.068257,-0.013563
L3_metrics,-1.004368,0.065805,-0.052276
S1_metrics,-1.006243,-0.097859,0.005264
S2_metrics,-0.075701,-0.979126,0.08257
S3_metrics,-1.006551,-0.09599,-0.000179


In [17]:
# Relative contribution of each metric to each component: squared loadings
# divided by their column total, so every component's column sums to 1.
norm_loading_matrix = loading_matrix.pow(2).pipe(
    lambda squared: squared / squared.sum(axis=0)
)

In [21]:
# The three metrics contributing most to p1 (ascending, strongest last).
norm_loading_matrix.sort_values(by='p1').iloc[-3:]

Unnamed: 0,p1,p2,p3
L2_metrics,0.133971,0.001438,0.0002058482
S1_metrics,0.134075,0.002956,3.100627e-05
S3_metrics,0.134157,0.002844,3.584451e-08


In [22]:
# The three metrics contributing most to p2 (ascending, strongest last).
norm_loading_matrix.sort_values(by='p2').iloc[-3:]

Unnamed: 0,p1,p2,p3
S4_metrics,0.017143,0.207065,0.027647
S2_metrics,0.000759,0.295911,0.007629
T2_metrics,0.000363,0.310549,0.008087


In [23]:
# The three metrics contributing most to p3 (ascending, strongest last).
norm_loading_matrix.sort_values(by='p3').iloc[-3:]

Unnamed: 0,p1,p2,p3
C2_metrics,0.032346,0.165326,0.053029
C1_metrics,0.103546,3.4e-05,0.219225
C3_metrics,0.049087,0.00958,0.664726


In [27]:
# For each retained component, keep the names of the three metrics with the
# largest normalized contribution (ordered weakest to strongest of the three).
p1_metrics, p2_metrics, p3_metrics = (
    norm_loading_matrix.sort_values(by=component).iloc[-3:].index
    for component in ('p1', 'p2', 'p3')
)

In [28]:
# Ordered union of the per-component selections: concatenate p1, p2 and p3
# picks, then drop repeats while keeping the first occurrence of each name
# (dict keys preserve insertion order).
# NOTE(review): this rebinds `metrics`, which earlier held the DataFrame read
# from the CSV; cells above that treat `metrics` as a DataFrame will fail if
# they are re-run after this cell. Consider a distinct name once all later
# uses have been checked.
metrics = list(dict.fromkeys([*p1_metrics, *p2_metrics, *p3_metrics]))

In [31]:
# Show the final, de-duplicated list of selected metric names.
metrics

['L2_metrics',
 'S1_metrics',
 'S3_metrics',
 'S4_metrics',
 'S2_metrics',
 'T2_metrics',
 'C2_metrics',
 'C1_metrics',
 'C3_metrics']