In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Path of the training CSV; kept in a named variable so it is easy to spot and change.
train_csv_path = r"C:\Users\basde\Downloads\playground-series-s3e24\train.csv"
df = pd.read_csv(train_csv_path)

In [8]:
# Build the PCA input: every column of df except 'id' and 'smoking'.
features = df.drop(['id', 'smoking'], axis=1)

In [9]:
# Learn the scaling parameters from the features, then apply them to the same data.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [28]:
# Keep the 8 leading principal components of the standardized features.
# If n_components is not set, scikit-learn keeps min(n_samples, n_features) components.
# random_state is pinned because svd_solver="auto" may select the randomized
# solver for this data shape (depending on the scikit-learn version); without a
# seed the components could differ slightly between runs.
pca = PCA(n_components=8, random_state=42)
principal_components = pca.fit_transform(scaled_features)

In [29]:
# Wrap the projected data in a DataFrame (default integer row and column labels).
pca_df = pd.DataFrame(principal_components)

In [30]:
# Place the 'smoking' column of df next to the principal components, aligned on the row index.
final_df = pd.concat((pca_df, df.loc[:, ['smoking']]), axis='columns')

In [31]:
# Explained variance ratio: fraction of the total variance captured by each
# component that the fitted PCA kept.
explained_variance = pca.explained_variance_ratio_
n_retained = len(explained_variance)
print(f"Explained variance by each component: {explained_variance}")
# Report the actual number of components rather than a hard-coded "2", so the
# label always matches the fitted model.
print(f"Total explained variance by {n_retained} components: {explained_variance.sum():.2f}")

# Display the resulting DataFrame
print("\nDataFrame with Principal Components:")
print(pca_df.head())

# explained_variance_ratio_: This shows the percentage of variance each principal component accounts for. For example, if the output is [0.75, 0.15], it means:

#    * Principal Component 1 (PC1) captures 75% of the total variance in the original data.
#    * Principal Component 2 (PC2) captures an additional 15% of the variance.

# Total Explained Variance: By summing the ratios, you get the cumulative variance explained. In the example above, the total is 90% (0.75 + 0.15). This means your new 2-component dataset retains 90% of the information from the original features.

# Your Goal: Aim for a high total explained variance (e.g., 85-95%) with the fewest components possible. This confirms your dimensionality reduction was successful and didn't lose too much critical information.



Explained variance by each component: [0.20445065 0.10619703 0.08149305 0.06827215 0.06585984 0.05953544
 0.04920546 0.04593811]
Total explained variance by 2 components: 0.68

DataFrame with Principal Components:
          0         1         2         3         4         5         6  \
0  0.960143  0.976923 -1.956298 -1.071300 -0.563011 -1.561063  0.428796   
1  0.210887  5.274566 -5.520988  1.706747  4.835730  3.137934 -1.676118   
2  1.263916 -1.114100 -0.646922  0.636022 -0.274310 -1.621167 -1.240356   
3  3.950327 -1.182368 -0.866905 -1.698953  0.724084  0.162398 -0.474352   
4 -0.591412 -1.983237 -0.888322 -0.829224 -0.001653  0.348551  0.475405   

          7  
0 -0.283878  
1 -0.393582  
2 -0.307351  
3 -1.085726  
4 -0.107668  


In [32]:
# Dump the raw component matrix of the fitted PCA: one row per principal
# component, one value per input feature (unlabeled; the next cell attaches names).
print(pca.components_)

[[-0.14967343  0.33114485  0.40506181  0.3601597   0.10189012  0.10798064
  -0.04807479 -0.04796954  0.16984426  0.19309234  0.09843281  0.03763501
   0.26795327 -0.26431464  0.04691252  0.34038454 -0.02208202  0.24992519
   0.15566039  0.25746102  0.23238912  0.05536426]
 [ 0.43603018 -0.27137232 -0.05914396  0.12156773 -0.26500167 -0.26486895
   0.20346569  0.20450813  0.32779915  0.28419316  0.21321064  0.31884004
   0.11567959 -0.0222413   0.29682454 -0.07650798 -0.02547174 -0.08973432
   0.13497843  0.07961476  0.11383903 -0.07047087]
 [-0.13761015 -0.05220164 -0.03838781 -0.07689719  0.1482222   0.15314927
  -0.33002485 -0.33081009 -0.14526429 -0.08568971 -0.13062809  0.57687787
   0.02194586  0.15562988  0.53898693 -0.01230089 -0.02371937 -0.0502339
   0.05736717  0.07777121  0.02988171 -0.01415478]
 [ 0.02252439 -0.04361403 -0.05442803 -0.03574081 -0.06859101 -0.06610204
   0.21608803  0.22022018 -0.39423693 -0.410678   -0.08171746 -0.02001354
  -0.0503011   0.03268788 -0.01373

In [33]:
# Label each column of the component matrix with the feature it corresponds to.
loadings_df = pd.DataFrame(pca.components_, columns=features.columns)

# Heading and table are emitted on separate lines by a single print call.
print("PCA Component Loadings:", loadings_df, sep="\n")

# Look for large absolute values: A feature with a high positive or high negative loading score is a strong driver of that component. 
# For example, if feature3 has a loading of 0.68 for Principal Component 1, it means feature3 contributes significantly to it.

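# Look at the signs (positive or negative): Within a single component, the relative signs tell you how features move together.
# Features with the same sign (both positive or both negative) on a component vary in the same direction along it. Features with opposite signs vary in opposite directions.
# Note that the overall sign of a component is arbitrary (flipping every sign gives an equivalent component), so only compare signs within the same component.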


PCA Component Loadings:
        age  height(cm)  weight(kg)  waist(cm)  eyesight(left)  \
0 -0.149673    0.331145    0.405062   0.360160        0.101890   
1  0.436030   -0.271372   -0.059144   0.121568       -0.265002   
2 -0.137610   -0.052202   -0.038388  -0.076897        0.148222   
3  0.022524   -0.043614   -0.054428  -0.035741       -0.068591   
4 -0.007586    0.129774    0.092223   0.078754        0.083209   
5 -0.078760   -0.023028   -0.067517  -0.109394        0.473667   
6  0.220720   -0.209069   -0.085151   0.028078        0.355430   
7  0.021462    0.057012    0.013918  -0.001600        0.009267   

   eyesight(right)  hearing(left)  hearing(right)  systolic  relaxation  ...  \
0         0.107981      -0.048075       -0.047970  0.169844    0.193092  ...   
1        -0.264869       0.203466        0.204508  0.327799    0.284193  ...   
2         0.153149      -0.330025       -0.330810 -0.145264   -0.085690  ...   
3        -0.066102       0.216088        0.220220 -0.394237  