---
## PCA
Die Daten werden auf orthogonale Achsen mit maximaler Varianz (die Hauptkomponenten) in einen neuen Merkmalsunterraum projiziert; dadurch bleiben die wichtigsten Informationen der Daten erhalten.

---
### IMPORTS

In [7]:
import numpy as np
import pandas as pd
from sklearn import datasets

---
### LOAD DATA

In [9]:
# Load the wine data set; return_X_y=True returns the feature matrix and
# the class-label vector directly instead of a container object.
X, y = datasets.load_wine(return_X_y=True)
X  # show the feature matrix as the cell output

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [10]:
y  # show the class-label vector as the cell output

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

---
## TRAIN TEST SPLIT

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the samples for testing. stratify=y keeps the class
# proportions the same in both parts; random_state=0 makes the split repeatable.
X_Train, X_Test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

---
## Merkmale MÜSSEN für die PCA standardisiert werden!

In [12]:
from sklearn.preprocessing import StandardScaler

# PCA directions depend on feature scale, so bring every feature to
# zero mean and unit variance first.
sc = StandardScaler()

# Learn mean / standard deviation from the training data only and scale it.
X_Train_std = sc.fit_transform(X_Train)

# Scale the test data with the statistics learned from the training data.
# Use transform(), not fit(): fit() would re-estimate the statistics on the
# test set and return the scaler object instead of the scaled array.
X_Test_std = sc.transform(X_Test)

### np.cov = Kovarianzmatrix berechnen
### np.linalg.eig = Zerlegung der Kovarianzmatrix in Eigenwerte und Eigenvektoren

In [13]:
# Covariance matrix of the standardized training features. rowvar=False tells
# np.cov that each column (not each row) is one variable, which is the same
# as passing the transposed matrix.
cov_matrix = np.cov(X_Train_std, rowvar=False)

# Decompose the covariance matrix into eigenvalues and eigenvectors.
eigen_vals, eigen_vecs = np.linalg.eig(cov_matrix)

print("\nEigenwerte: \n%s" % (eigen_vals,))


Eigenwerte: 
[4.84274532 2.41602459 1.54845825 0.96120438 0.84166161 0.6620634
 0.51828472 0.34650377 0.3131368  0.10754642 0.21357215 0.15362835
 0.1808613 ]


In [21]:
# Sum of all eigenvalues, used as the denominator below.
tot = sum(eigen_vals)

# Eigenvalues in descending order.
sorted_vals = list(eigen_vals)
sorted_vals.sort(reverse=True)

# Dividing each sorted eigenvalue by the sum gives its share of the variance.
var_exp = [eigen_val / tot for eigen_val in sorted_vals]

# Running total of the shares.
cum_var_exp = np.cumsum(var_exp)

In [22]:
import matplotlib.pyplot as plt

# One x position per principal component, numbered from 1. Derived from the
# data instead of a hard-coded 13 so the cell keeps working if the number of
# features changes.
component_ids = range(1, len(var_exp) + 1)

fig, ax = plt.subplots(figsize=(12, 9), dpi=120)

# Bars: share of variance explained by each individual component.
ax.bar(component_ids, var_exp, alpha=.5, align="center", label="Individuelle erklärte Varianz")
# Step line: cumulative share of explained variance.
ax.step(component_ids, cum_var_exp, where="mid", label="Kumulierte erklärte Varianz")

ax.set_ylabel("Anteil der erklärten Varianz")
ax.set_xlabel("Hauptkomponenten")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

<Figure size 1440x1080 with 1 Axes>