In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA

In [None]:
# Source CSV is fetched straight from the course repository on GitHub.
data_filepath = "https://raw.githubusercontent.com/aoguedao/gmu_casbbi_data_science/main/data/vact.csv"

# Keep only the columns at positions 5 through 17 (end of the slice is
# exclusive); everything else in the file is dropped.
data = (
    pd.read_csv(data_filepath)
    .iloc[:, slice(5, 18)]
)

# Preview the first rows of the selected columns.
data.head()

In [None]:
# Dimensions of the selected column subset as (rows, columns).
data.shape

## Correlation Matrix

In [None]:
# Pairwise correlation between every pair of selected columns
# (pandas' default correlation method is used).
corr = data.corr()

# Explicit figure/axes pair so every call below targets a known Axes.
fig, ax = plt.subplots(figsize=(10, 10))

# Diverging palette pinned to [-1, 1] and centred on 0, so colour intensity
# is comparable for positive and negative correlations.
sns.heatmap(
    corr,
    ax=ax,
    vmin=-1,
    vmax=1,
    center=0,
    cmap="vlag",
    annot=True,
    fmt=".2f",
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .3},
)

# Tilt both sets of tick labels so long column names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.setp(ax.get_yticklabels(), rotation=45, horizontalalignment="right")

ax.set_title("Correlation")
fig.tight_layout()
plt.show()
# Release the figure once it has been rendered.
plt.close(fig)

## PCA

In [None]:
# NOTE(review): PCA is already imported in the notebook's first cell, so this
# repeated import is redundant; imports are best kept together at the top.
from sklearn.decomposition import PCA

In [None]:
# PCA with default arguments (no n_components given), fitted on the selected
# columns. The fitted estimator is the cell's displayed value.
pca = PCA()
pca.fit(data)

In [None]:
# Amount of variance explained by each fitted component.
pca.explained_variance_

In [None]:
# Share of the total variance explained by each fitted component.
pca.explained_variance_ratio_ 

In [None]:
# Singular values corresponding to each fitted component.
pca.singular_values_

In [None]:
# Scree-style plot of the PCA singular values, one point per component.
# Components are numbered from 1 for readability.
component_idx = np.arange(1, pca.n_components_ + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_idx, pca.singular_values_, "-o", linewidth=2)
# Components are discrete, so place ticks exactly on the component numbers
# instead of letting matplotlib choose (possibly fractional) positions.
ax.set_xticks(component_idx)
ax.set_xlabel("Principal component")
ax.set_ylabel("PCA singular values")
# Use the pyplot-level show: Figure.show() does not manage a GUI event loop
# and emits a warning on non-GUI backends.
plt.show()

In [None]:
# Plot of the explained variance ratio, one point per component.
# Components are numbered from 1 for readability.
component_idx = np.arange(1, pca.n_components_ + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_idx, pca.explained_variance_ratio_, "-o", linewidth=2)
# Components are discrete, so place ticks exactly on the component numbers
# instead of letting matplotlib choose (possibly fractional) positions.
ax.set_xticks(component_idx)
ax.set_xlabel("Principal component")
ax.set_ylabel("PCA explained variance ratio")
# Use the pyplot-level show: Figure.show() does not manage a GUI event loop
# and emits a warning on non-GUI backends.
plt.show()

## PCA N-Components

In [None]:
# Refit PCA on the same data, this time keeping only `n` components.
# NOTE(review): 4 is hard-coded here with no stated justification — presumably
# chosen from the plots above; confirm.
n = 4
pcan = PCA(n_components=n)
pcan.fit(data)


In [None]:
# Tabulate the fitted components: one row per retained component (labelled
# PC_1, PC_2, ...) and one column per original variable.
pd.DataFrame(
    data=pcan.components_,
    index=[f"PC_{k}" for k in range(1, pcan.n_components_ + 1)],
    columns=data.columns,
)