# Principal Component Analysis & Feature Selection

## PCA

In [1]:
import pandas as pd
import dataframe_image as dfi
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Data source: https://www.kaggle.com/datasets/saurabh00007/iriscsv
# The first CSV column is used as the row index.
filename = 'Iris.csv'
df = pd.read_csv(filename, index_col=0)

# By position: the first four columns are the inputs, the fifth is the label.
cols_x = df.columns[:4].tolist()
col_y = df.columns[4]

# Pipeline: scale the inputs with StandardScaler, then run PCA with its
# default settings (no component limit is passed).
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('PCA', PCA())])
features = df[cols_x]

# Fit on the feature columns, then project those same rows to get the scores.
scores = pipe.fit(features).transform(features)

# One "PC<k>" column per score column, sharing the row index of `df`.
cols_pc = [f"PC{k}" for k in range(1, scores.shape[1] + 1)]
df_pca = pd.DataFrame(scores, columns=cols_pc, index=df.index)

# Put the label column first so each row of scores carries its class.
df_pca.insert(0, col_y, df[col_y].copy())

# Save the preview table as an image, then show it as the cell output.
dfi.export(df_pca.head(), 'table_031_iris_PCA.png')
df_pca.head()

[0104/171822.058644:INFO:headless_shell.cc(623)] Written to file /tmp/tmpceyvoqq1/temp.png.


Unnamed: 0_level_0,Species,PC1,PC2,PC3,PC4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Iris-setosa,-2.264542,0.505704,-0.121943,-0.023073
2,Iris-setosa,-2.086426,-0.655405,-0.227251,-0.103208
3,Iris-setosa,-2.36795,-0.318477,0.05148,-0.027825
4,Iris-setosa,-2.304197,-0.575368,0.09886,0.066311
5,Iris-setosa,-2.388777,0.674767,0.021428,0.037397


## ANOVA

In [168]:
import numpy as np
from scipy import stats

# One-way ANOVA of every principal-component column across the species
# groups. The result, `df_pca_anova_1`, has one row per component and the
# columns 'F-value' and 'p-value'.

# Group once up front; groupby yields the groups in sorted label order, so
# the samples are passed to the test in the same order on every run.
groups_by_species = df_pca.groupby('Species')

# Collect one (F, p) pair per component and build the frame a single time
# afterwards. Growing a DataFrame with pd.concat inside the loop is quadratic,
# and concatenating onto an initially empty frame is deprecated in recent
# pandas (it can change the resulting column dtypes).
anova_rows = []
for pc in cols_pc:
    samples = [values for _, values in groups_by_species[pc]]
    f_value, p_value = stats.f_oneway(*samples)
    anova_rows.append((f_value, p_value))

df_pca_anova_1 = pd.DataFrame(
    anova_rows,
    columns=['F-value', 'p-value'],
    index=list(cols_pc),
)

print(df_pca_anova_1)

         F-value       p-value
PC1  1043.159859  1.412319e-87
PC2    14.429384  1.897877e-06
PC3     5.290448  6.043825e-03
PC4     1.636495  1.981874e-01
