In [54]:
import json
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [55]:
x_features=pd.read_csv('../processed/x_features.csv')
with open("../processed/feature_selection.json", "r") as f:
    feature_names = json.load(f)

 Dimensionality Reduction by PCA

In [56]:
#reducing number of feature(columns) with minimal loss
x=x_features[feature_names]

pca=PCA(n_components=2)
x_pca=pca.fit_transform(x)

x_pca_df = pd.DataFrame(x_pca,columns=['PC1', 'PC2'])

print(x_pca_df)




          PC1       PC2
0    1.230517  0.190117
1   -0.767382  2.666396
2    3.053663  1.697289
3   -0.991935  1.073267
4   -1.876149  0.579699
..        ...       ...
279  1.773828 -2.051658
280  3.080810  0.745618
281  2.559431 -0.573585
282 -1.932211  0.451198
283 -1.948868 -1.014972

[284 rows x 2 columns]


Explained Variance Ratio

In [57]:
#each new col in PCA called PC , captures some of the original information
cumulative_sum= np.cumsum(pca.explained_variance_ratio_)
no_components= np.argmax(cumulative_sum >=0.95)+1 #to get the acual values(index)

print(f"You need {no_components} components to keep 95% of the data.")

You need 1 components to keep 95% of the data.


Data Visualization

In [58]:
fig1=make_subplots(rows=1,cols=2,subplot_titles=('Scatter plot (PC1 vs PC2)','cumulative variance plot'))

# Scatter Plot
fig1.add_trace(
    go.Scatter(
        x=x_pca[:, 0],
        y=x_pca[:, 1],
        mode='markers',
        name='PC1 vs PC2',
        marker=dict(size=5, color='blue')
    ),
    row=1, col=1
)

# Cumulative Variance Plot
fig1.add_trace(
    go.Scatter(
        x=list(range(1, len(cumulative_sum)+1)),
        y=cumulative_sum,
        mode='lines+markers',
        name='Cumulative Variance',
        line=dict(color='green')
    ),
    row=1, col=2
)

#reference line at 95% to help see how many PCs are needed to reach that variance.
fig1.add_trace(
    go.Scatter(
        x=[1, len(cumulative_sum)],
        y=[0.95, 0.95],
        mode='lines',
        line=dict(color='red', dash='dash'),
        name='95% threshold'
    ),
    row=1, col=2
)

fig1.update_layout(
    title='PCA Visualization',
    height=500,
    width=1000,
    showlegend=True
)

fig1.show()

In [59]:
x_pca_df.to_csv('../processed/x_pca.csv',index=False)
