In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory holding the playground-series-s3e24 files. The default is the original
# author's local download folder; set the S3E24_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "S3E24_DATA_DIR",
        r"C:\Users\basde\Downloads\playground-series-s3e24",
    )
)

# Raw training table; later cells drop the 'id' and 'smoking' columns from it.
df = pd.read_csv(DATA_DIR / "train.csv")

In [3]:
# Input matrix for PCA: every column of `df` except 'id' and 'smoking'
# (presumably the row identifier and the label -- 'smoking' is re-attached later).
features = df.drop(labels=["id", "smoking"], axis="columns")

In [4]:
# PCA is driven by variance, so put every feature on a comparable scale first.
# Fit the scaler on the features, then apply it to the same data.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [11]:
# Number of principal components to keep. Set to None to keep all of them: PCA then
# retains min(n_samples, n_features) components, i.e. one per feature whenever there
# are more rows than features.
N_COMPONENTS = 12

# Fixed seed for reproducibility: with the default svd_solver='auto', some scikit-learn
# versions pick the randomized SVD solver for a reduction of this shape, and that solver
# is stochastic unless random_state is set. The seed is ignored by deterministic solvers.
RANDOM_STATE = 42

pca = PCA(n_components=N_COMPONENTS, random_state=RANDOM_STATE)

# Fit on the standardized features and project them onto the retained components.
principal_components = pca.fit_transform(scaled_features)

In [6]:
# Wrap the projected data in a DataFrame (columns are the integer component numbers).
# Reuse the row index of `features` instead of a fresh RangeIndex: later index-aligned
# operations such as pd.concat(..., axis=1) with columns of `df` then pair every
# projected row with its source row even if `df` does not carry a default index.
pca_df = pd.DataFrame(data=principal_components, index=features.index)

In [7]:
# Put the principal components and the 'smoking' column side by side in one frame
# (rows are matched on the index).
final_df = pd.concat(
    objs=[pca_df, df.loc[:, ["smoking"]]],
    axis="columns",
)

In [8]:
# Fraction of the total variance captured by each retained component,
# followed by the overall fraction retained.
explained_variance = pca.explained_variance_ratio_
print(
    f"Explained variance by each component: {explained_variance}",
    f"Total explained variance by PCA: {sum(explained_variance):.2f}",
    sep="\n",
)

# Preview of the projected data (first rows only).
print("\nDataFrame with Principal Components:", pca_df.head(), sep="\n")

# explained_variance_ratio_: This shows the percentage of variance each principal component accounts for. For example, if the output is [0.75, 0.15], it means:

#    * Principal Component 1 (PC1) captures 75% of the total variance in the original data.
#    * Principal Component 2 (PC2) captures an additional 15% of the variance.

# Total Explained Variance: By summing the ratios, you get the cumulative variance explained. In the example above, the total is 90% (0.75 + 0.15). This means your new 2-component dataset retains 90% of the information from the original features.

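# Your Goal: Aim for a high total explained variance (e.g., 85-95%) with the fewest components possible. This indicates that the reduced dataset keeps most of the variance of the original features. Note that PCA never looks at the target column, so a high explained variance alone does not guarantee that the information needed to predict it was preserved.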



Explained variance by each component: [0.20445065 0.10619703 0.08149305 0.06827215 0.06585984 0.05953544
 0.04920546 0.04593811 0.04443891 0.04012636 0.03663176 0.03385772
 0.02763552 0.02503728 0.02311558 0.02041342 0.02027721 0.01588655
 0.01397479 0.01077018 0.00410782 0.00277516]
Total explained variance by 2 components: 1.00

DataFrame with Principal Components:
         0         1         2         3         4         5         6   \
0  0.960143  0.976923 -1.956298 -1.071300 -0.563011 -1.561063  0.428796   
1  0.210887  5.274566 -5.520988  1.706747  4.835730  3.137934 -1.676118   
2  1.263916 -1.114100 -0.646922  0.636022 -0.274310 -1.621167 -1.240356   
3  3.950327 -1.182368 -0.866905 -1.698953  0.724084  0.162398 -0.474352   
4 -0.591412 -1.983237 -0.888322 -0.829224 -0.001653  0.348551  0.475405   

         7         8         9   ...        12        13        14        15  \
0 -0.283878 -0.809164 -0.527654  ... -0.967674  0.126539  0.990202  0.506664   
1 -0.393582  1.9249

In [9]:
# Dump the raw loadings array of the fitted PCA: one row per principal component,
# one weight per input feature (the columns follow the column order of `features`,
# which is how the labelled loadings table further down names them).
print(pca.components_)

[[-1.49673434e-01  3.31144848e-01  4.05061810e-01  3.60159699e-01
   1.01890122e-01  1.07980639e-01 -4.80747882e-02 -4.79695381e-02
   1.69844257e-01  1.93092338e-01  9.84328137e-02  3.76350061e-02
   2.67953266e-01 -2.64314638e-01  4.69125224e-02  3.40384544e-01
  -2.20820150e-02  2.49925194e-01  1.55660387e-01  2.57461024e-01
   2.32389123e-01  5.53642600e-02]
 [ 4.36030180e-01 -2.71372322e-01 -5.91439638e-02  1.21567729e-01
  -2.65001674e-01 -2.64868948e-01  2.03465691e-01  2.04508130e-01
   3.27799148e-01  2.84193163e-01  2.13210637e-01  3.18840039e-01
   1.15679587e-01 -2.22413006e-02  2.96824540e-01 -7.65079847e-02
  -2.54717358e-02 -8.97343222e-02  1.34978427e-01  7.96147583e-02
   1.13839027e-01 -7.04708690e-02]
 [-1.37610154e-01 -5.22016417e-02 -3.83878093e-02 -7.68971943e-02
   1.48222200e-01  1.53149266e-01 -3.30024850e-01 -3.30810091e-01
  -1.45264293e-01 -8.56897062e-02 -1.30628086e-01  5.76877866e-01
   2.19458563e-02  1.55629879e-01  5.38986932e-01 -1.23008930e-02
  -2.3

In [10]:
# Same loadings as pca.components_, but with each column labelled by the original
# feature name; row i holds the weights that define principal component i.
loadings_df = pd.DataFrame(pca.components_, columns=features.columns)

print("PCA Component Loadings:", loadings_df, sep="\n")

# Look for large absolute values: A feature with a high positive or high negative loading score is a strong driver of that component. 
# For example, if feature3 has a loading of 0.68 for Principal Component 1, it means feature3 contributes significantly to it.

# Look at the signs (positive or negative): The sign tells you the direction of the correlation. 
# Features with the same sign (both positive or both negative) on a component vary in the same direction. Features with opposite signs vary in opposite directions.


PCA Component Loadings:
         age  height(cm)  weight(kg)  waist(cm)  eyesight(left)  \
0  -0.149673    0.331145    0.405062   0.360160        0.101890   
1   0.436030   -0.271372   -0.059144   0.121568       -0.265002   
2  -0.137610   -0.052202   -0.038388  -0.076897        0.148222   
3   0.022524   -0.043614   -0.054428  -0.035741       -0.068591   
4  -0.007586    0.129774    0.092223   0.078754        0.083209   
5  -0.078760   -0.023028   -0.067517  -0.109394        0.473667   
6   0.220720   -0.209069   -0.085151   0.028078        0.355430   
7   0.021462    0.057012    0.013918  -0.001600        0.009267   
8  -0.014353   -0.082953   -0.039049  -0.014224        0.072896   
9   0.066223    0.115975   -0.120407  -0.178683        0.013923   
10  0.008510   -0.055719   -0.337476  -0.395054       -0.032216   
11  0.353436   -0.134627   -0.200544  -0.142967        0.147618   
12  0.448692   -0.084651    0.197400   0.395955        0.113299   
13  0.009970   -0.002809   -0.007805  