# Análisis de Componentes Principales - Paso a Paso

* Normalizar los datos
* Obtener los eigenvectors y eigenvalues a partir de la matriz de covarianzas o correlaciones (singular vector decomposition también funciona)
* Ordenar los valores propios en orden descendente y quedarnos con los "p" mayores y así disminuir el número de variables del dataset (p<m)
* Construir la matriz de proyección W a partir de los p eigenvectors 
* Transformar el dataset original X a través de W para obtener datos en el subespacio dimensional de dimensión "P", que será Y

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../datasets/iris/iris.csv")
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
X = df.iloc[:,0:4].values
Y = df.iloc[:,-1].values

In [4]:
X.shape

(150, 4)

In [67]:
import chart_studio.plotly as py
from plotly.graph_objs import *
import chart_studio.tools as tls
import plotly.graph_objects as go

## Hay que iniciar sesión y usar la clave que da el API Key de Plotly

In [6]:
tls.set_credentials_file(username="camilogo1016", api_key="7uUuLxKozSxCd8etbzgl")

In [7]:
traces = []
legend ={0: True, 1:False, 2:False, 3:False}

colors = {'setosa': 'rgb(255,127,20)',
          'versicolor': 'rgb(31, 220, 120)',
          'virginica': 'rgb(44, 50, 180)'}

for col in range(4):
    for key in colors:
        
        traces.append(go.Histogram(x=X[Y==key, col], opacity=0.7, xaxis='x%s'%(col+1), marker_color=colors[key],
        name= key, showlegend=legend[col]))
        
data = go.Data(traces)
layout = go.Layout(barmode="overlay", 
                xaxis=go.XAxis(domain=[0, 0.25], title="Longitud Sepalos (cm)"),
                xaxis2=go.XAxis(domain=[0.3, 0.5], title="Anchura Sepalos (cm)"),
                xaxis3=go.XAxis(domain=[0.55, 0.75], title="Longitud de Petalos (cm)"),
                xaxis4=go.XAxis(domain=[0.8, 1.0], title="Anchura de Petalos (cm)"),
                yaxis =go.YAxis(title="Número de ejemplares"),
                title="Distribución de los rasgos de las diferentes flores Iris")
fig = go.Figure(data = data, layout = layout)
py.iplot(fig)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
X_std = StandardScaler().fit_transform(X)

### Para comprobar que quedaron los valores estandarizados vamos a hacer la gráfica y todos los datos deberían estar centrados en 0

In [10]:
traces = []
legend ={0: True, 1:False, 2:False, 3:False}

colors = {'setosa': 'rgb(255,127,20)',
          'versicolor': 'rgb(31, 220, 120)',
          'virginica': 'rgb(44, 50, 180)'}

for col in range(4):
    for key in colors:
        
        traces.append(go.Histogram(x=X_std[Y==key, col], opacity=0.7, xaxis='x%s'%(col+1), marker_color=colors[key],
        name= key, showlegend=legend[col]))
        
data = go.Data(traces)
layout = go.Layout(barmode="overlay", 
                xaxis=go.XAxis(domain=[0, 0.25], title="Longitud Sepalos (cm)"),
                xaxis2=go.XAxis(domain=[0.3, 0.5], title="Anchura Sepalos (cm)"),
                xaxis3=go.XAxis(domain=[0.55, 0.75], title="Longitud de Petalos (cm)"),
                xaxis4=go.XAxis(domain=[0.8, 1.0], title="Anchura de Petalos (cm)"),
                yaxis =go.YAxis(title="Número de ejemplares"),
                title="Distribución de los rasgos de las diferentes flores Iris")
fig = go.Figure(data = data, layout = layout)
py.iplot(fig)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




### 1. Calculamos la descomposición de Eigenvalores y Eigenvectores
##### a) Usando la matriz de covarianza

In [11]:
from IPython.display import display, Math, Latex

In [12]:
display(Math(r'\sigma_(jk) = \frac{1}{n-1}\sum_{i=1}^m (x_{ij} - \overline{x_j})(x_{ik} - \overline{x_k})'))

<IPython.core.display.Math object>

In [13]:
display(Math(r'\Sigma = \frac{1}{n-1} ((X-\overline{x})^T(X-\overline{x}))'))

<IPython.core.display.Math object>

#### Este es Sigma mayúscula y representa la covarianza

In [14]:
display(Math(r'\overline{x} = \sum_{i=1}^n x_i \in \mathbb R^m'))

<IPython.core.display.Math object>

In [15]:
import numpy as np

In [16]:
mean_vect = np.mean(X_std, axis=0)
mean_vect

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [17]:
cov_matrix = (X_std-mean_vect).T.dot(X_std-mean_vect) / (X_std.shape[0]-1)
print("La matriz de covarianzas es: \n%s"%cov_matrix)

La matriz de covarianzas es: 
[[ 1.00671141 -0.11835884  0.87760447  0.82343066]
 [-0.11835884  1.00671141 -0.43131554 -0.36858315]
 [ 0.87760447 -0.43131554  1.00671141  0.96932762]
 [ 0.82343066 -0.36858315  0.96932762  1.00671141]]


### También podemos usar la función directa de Numpy

In [18]:
np.cov(X_std.T)

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [19]:
eig_vals, eig_vect = np.linalg.eig(cov_matrix)
print("Los valores propios son:\n\n%s \n" %(eig_vals))
print("Los vectores propios son:\n\n%s" %(eig_vect))

Los valores propios son:

[2.93808505 0.9201649  0.14774182 0.02085386] 

Los vectores propios son:

[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


##### b) Usando la matriz de Correlación

In [20]:
corr_matrix = np.corrcoef(X_std.T)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [21]:
eig_vals_corr, eig_vect_corr = np.linalg.eig(corr_matrix)

eig_vals, eig_vect = np.linalg.eig(cov_matrix)
print("Los valores propios son:\n\n%s \n" %(eig_vals_corr))
print("Los vectores propios son:\n\n%s" %(eig_vect_corr))

Los valores propios son:

[2.91849782 0.91403047 0.14675688 0.02071484] 

Los vectores propios son:

[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


##### Acá estamos demostrando que la matriz de correlaciones es la misma con o sin estandarización (de los valores)

In [22]:
corr_matrix = np.corrcoef(X.T)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

##### c) Singular Value Decomposition (esta es la más eficaz computacional)

In [23]:
u, s, v = np.linalg.svd(X_std.T)
u

array([[-0.52106591, -0.37741762,  0.71956635,  0.26128628],
       [ 0.26934744, -0.92329566, -0.24438178, -0.12350962],
       [-0.5804131 , -0.02449161, -0.14212637, -0.80144925],
       [-0.56485654, -0.06694199, -0.63427274,  0.52359713]])

In [26]:
for ev in eig_vect:
    print("La longitud del Eigenvector es: %0.3f" %np.linalg.norm(ev))

La longitud del Eigenvector es: 1.000
La longitud del Eigenvector es: 1.000
La longitud del Eigenvector es: 1.000
La longitud del Eigenvector es: 1.000


#### Esto significa que los eigenvectors son de magnitud 1 y cumplen con la condición

In [30]:
eigen_pairs = [(np.abs(eig_vals[i]), eig_vect[:, i]) for i in range (len(eig_vals))]
eigen_pairs

[(2.938085050199993,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9201649041624873,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.1477418210449481,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.020853862176462803,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

#### Para ordenar de mayor a menor se hace lo siguiente:

In [31]:
eigen_pairs.sort()
eigen_pairs.reverse()
eigen_pairs

[(2.938085050199993,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9201649041624873,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.1477418210449481,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.020853862176462803,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [32]:
print("Valores propios en orden descendente")
for ep in eigen_pairs:
    print(ep[0])

Valores propios en orden descendente
2.938085050199993
0.9201649041624873
0.1477418210449481
0.020853862176462803


In [37]:
total_sum_eivalues = sum(eig_vals)
var_exp = [(i/total_sum_eivalues)*100 for i in sorted(eig_vals, reverse=True)]

var_exp

[72.96244541329986, 22.850761786701774, 3.6689218892828794, 0.5178709107154932]

In [39]:
# La varianza acumulada la mostramos como :
acum_var_exp = np.cumsum(var_exp)
acum_var_exp

array([ 72.96244541,  95.8132072 ,  99.48212909, 100.        ])

In [42]:
plot1 = go.Bar(x=["CP %s" %i for i in range(1,5)], y = var_exp, showlegend=False)
plot2 = go.Scatter(x=["CP %s" %i for i in range(1,5)], y = acum_var_exp, showlegend=True, name="% de Varianza acumulada")
data = go.Data([plot1, plot2])

layout = go.Layout(xaxis = go.XAxis(title= "Componentes Principales"),
                    yaxis = go.YAxis(title="Porcentaje de Varianza explicada"),
                  title="Porcentaje de Variabilidad Explicada por cada componente principal")

fig = go.Figure(data, layout)
py.iplot(fig)

In [43]:
W = np.hstack((eigen_pairs[0][1].reshape(4,1), 
              eigen_pairs[1][1].reshape(4,1)))
W

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

In [44]:
X[0]

array([5.1, 3.5, 1.4, 0.2])

### 3. Proyectar las variables en el nuevo Subespacio Vectorial
* Reducimos las dimensiones de 4 a 2 porque esas 2 dimensiones aportan muy poca covarianza

In [49]:
display(Math(r'Y = X \cdot W, X \in M(\mathbb R)_ {150,4}, W \in M(\mathbb R)_{4,2}, Y \in M(\mathbb R)_{150,2}'))

<IPython.core.display.Math object>

In [52]:
proyecc = X_std.dot(W)

In [88]:
import chart_studio.plotly as py
from plotly.graph_objs import *
import chart_studio.tools as tls
import plotly.graph_objects as go

In [100]:
results = []
colors = {'setosa': 'rgb(255,127,20)',
          'versicolor': 'rgb(31, 220, 120)',
          'virginica': 'rgb(44, 50, 180)'}

for name in ("setosa", "versicolor", "virginica"):
    result = go.Scatter(x=proyecc[Y==name, 0], y = proyecc[Y==name, 1],
                       mode = "markers", name = name, marker_color = colors[name], marker_size=12, line_width=0.5, opacity=0.8)
                        
    results.append(result)
    
    #marker_color=colors[key]

data = go.Data(results)
layout = go.Layout(showlegend=True, scene= go.Scene(xaxis = go.XAxis(title="Componente Principal 1", zeroline=True),
                                                   yaxis = go.YAxis(title="Componente Principal 2", zeroline=True)))
fig = go.Figure(data, layout)
fig.update_xaxes(zeroline=True, zerolinewidth=1, zerolinecolor='grey')
fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='grey')
py.iplot(fig)