In [1]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import pickle
import pandas as pd

In [2]:
# method used for saving object as pickle
def save_object_as_pickle(obj, filename):
    with open(filename, 'wb') as file:
        pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL)

### Load the data

In [3]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer()

In [4]:
data_breast_cancer['data']

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [5]:
from sklearn.datasets import load_iris 
data_iris = load_iris()

In [6]:
data_iris['data']

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

## PCA

### Data breast cancer data

#### Not scaled

In [7]:
pca = PCA(n_components=0.9)
dbc_p = pca.fit_transform(data_breast_cancer['data'])

In [8]:
print(pca.explained_variance_ratio_)

[0.98204467]


#### Scaled

In [9]:
std_scaler = StandardScaler()
scaled_dbc = std_scaler.fit_transform(data_breast_cancer['data'])
pca_scaled = PCA(n_components=0.9)
dbc_p_scaled = pca_scaled.fit_transform(scaled_dbc)

In [10]:
# git jest, 7 kolumn powinno wyjsc
print(pca_scaled.explained_variance_ratio_)
dbc_scaled_expl_var = list(pca_scaled.explained_variance_ratio_)
dbc_scaled_expl_var

[0.44272026 0.18971182 0.09393163 0.06602135 0.05495768 0.04024522
 0.02250734]


[0.44272025607526305,
 0.189711820440331,
 0.09393163257431378,
 0.06602134915470166,
 0.054957684923462695,
 0.04024522039883345,
 0.022507337129825056]

#### Save object as pickle

In [11]:
dbc_scaled_filename = "pca_bc.pkl"
save_object_as_pickle(dbc_scaled_expl_var, dbc_scaled_filename)

### Iris data

#### Not scaled

In [12]:
pca2 = PCA(n_components=0.9)
iris_p = pca2.fit_transform(data_iris['data'])

In [13]:
print(pca2.explained_variance_ratio_)

[0.92461872]


#### Scaled

In [14]:
std_scaler2 = StandardScaler()
scaled_iris = std_scaler2.fit_transform(data_iris['data'])
pca_iris_scaled = PCA(n_components=0.9)
iris_p_scaled = pca_iris_scaled.fit_transform(scaled_iris)

In [15]:
data_iris['data']

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [16]:
scaled_iris

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

In [17]:
print(pca_iris_scaled.explained_variance_ratio_)
ir_scaled_expl_var = list(pca_iris_scaled.explained_variance_ratio_)
ir_scaled_expl_var

[0.72962445 0.22850762]


[0.7296244541329985, 0.22850761786701793]

#### Save object as pickle

In [18]:
ir_scaled_filename = "pca_ir.pkl"
save_object_as_pickle(ir_scaled_expl_var, ir_scaled_filename)

## ex. 4

#### Breast cancer

In [19]:
bc_comp = pca_scaled.components_

In [20]:
bc_comp

array([[ 2.18902444e-01,  1.03724578e-01,  2.27537293e-01,
         2.20994985e-01,  1.42589694e-01,  2.39285354e-01,
         2.58400481e-01,  2.60853758e-01,  1.38166959e-01,
         6.43633464e-02,  2.05978776e-01,  1.74280281e-02,
         2.11325916e-01,  2.02869635e-01,  1.45314521e-02,
         1.70393451e-01,  1.53589790e-01,  1.83417397e-01,
         4.24984216e-02,  1.02568322e-01,  2.27996634e-01,
         1.04469325e-01,  2.36639681e-01,  2.24870533e-01,
         1.27952561e-01,  2.10095880e-01,  2.28767533e-01,
         2.50885971e-01,  1.22904556e-01,  1.31783943e-01],
       [-2.33857132e-01, -5.97060883e-02, -2.15181361e-01,
        -2.31076711e-01,  1.86113023e-01,  1.51891610e-01,
         6.01653628e-02, -3.47675005e-02,  1.90348770e-01,
         3.66575471e-01, -1.05552152e-01,  8.99796818e-02,
        -8.94572342e-02, -1.52292628e-01,  2.04430453e-01,
         2.32715896e-01,  1.97207283e-01,  1.30321560e-01,
         1.83848000e-01,  2.80092027e-01, -2.19866379e-

In [21]:
l_bc = []
for row in bc_comp:
    idx = row.argmax()
    l_bc.append(idx)
    
l_bc

[7, 9, 11, 4, 16, 28, 29]

#### Save object as pickle

In [22]:
idx_bc_filename = "idx_bc.pkl"
save_object_as_pickle(l_bc, idx_bc_filename)

#### Iris

In [23]:
iris_comp = pca_iris_scaled.components_

In [24]:
l_iris = []
for row in iris_comp:
    idx = row.argmax()
    l_iris.append(idx)
    
l_iris

[2, 1]

#### Save object as pickle

In [25]:
idx_ir_filename = "idx_ir.pkl"
save_object_as_pickle(l_iris, idx_ir_filename)

### Check saved Pickles contents

In [26]:
# check if pickles' contents are saved correctly

print("pca_bc.pkl\n", pd.read_pickle("pca_bc.pkl"), "\n")
print("pca_ir.pkl\n", pd.read_pickle("pca_ir.pkl"), "\n")
print("idx_bc.pkl\n", pd.read_pickle("idx_bc.pkl"), "\n")
print("idx_ir.pkl\n", pd.read_pickle("idx_ir.pkl"))

pca_bc.pkl
 [0.44272025607526305, 0.189711820440331, 0.09393163257431378, 0.06602134915470166, 0.054957684923462695, 0.04024522039883345, 0.022507337129825056] 

pca_ir.pkl
 [0.7296244541329985, 0.22850761786701793] 

idx_bc.pkl
 [7, 9, 11, 4, 16, 28, 29] 

idx_ir.pkl
 [2, 1]
