<a href="https://colab.research.google.com/github/ath0217/hello-github/blob/main/Lab_Session_09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing libraries**


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Select seaborn's "darkgrid" plotting style.
sns.set_style("darkgrid")

In [None]:

!mkdir data

In [None]:
import gdown

# Google Drive direct-download links; the i-th link is saved under the i-th name in `outputs`.
urls = [
    'https://drive.google.com/uc?export=download&id=1EeB75LhKj1ofiXjm7Su9WEUh_u62YiCW',  # USArrests  https://drive.google.com/file/d/1EeB75LhKj1ofiXjm7Su9WEUh_u62YiCW/view?usp=sharing
    'https://drive.google.com/uc?export=download&id=1bqKHru3xACYpcDLqvjJlXM8FZHAwMC1f',  # residential  https://drive.google.com/file/d/1bqKHru3xACYpcDLqvjJlXM8FZHAwMC1f/view?usp=sharing
]
outputs = ['USArrests.csv', 'residential.csv']

# Fetch every file into the data/ directory, showing download progress.
for output, url in zip(outputs, urls):
    gdown.download(url, f'data/{output}', quiet=False)

**Principal component analysis**


In [None]:
# Load the USArrests table, using the first CSV column as the row index, then preview three rows.
df = pd.read_csv('data/USArrests.csv', index_col=0)
df.head(3)

In [None]:

# (rows, columns) of the loaded table.
df.shape

In [None]:

# Column dtypes and non-null counts.
df.info()

In [None]:

# Grid of pairwise plots for the columns of df.
sns.pairplot(df)

**We should normalize our data (zero mean and unit variance for each column) before running PCA**



In [None]:
# Standardize every column: subtract the column mean, then divide by the column standard deviation.
column_means = df.mean()
column_stds = df.std()
normalized_df = df.sub(column_means).div(column_stds)

In [None]:

# Sanity check: after centering, each column mean should round to zero.
normalized_df.mean().round(6)

In [None]:

from sklearn.decomposition import PCA

In [None]:

# Fit PCA on the standardized data. n_components is not given, so scikit-learn's default
# is used; random_state is fixed so repeated runs give the same fit.
pca = PCA(random_state=714)
pca.fit(normalized_df)

In [None]:
# Print, in order: the number of fitted components, the loading vectors
# (rows of components_), and the shape of that array.
for fitted_attribute in (pca.n_components_, pca.components_, pca.components_.shape):
    print(fitted_attribute)

In [None]:
# Variance explained by each component: first the raw array, then one value per line
# with six decimals. A plain loop is used so the cell does not also echo a list of None.
print(pca.explained_variance_)
for variance in pca.explained_variance_:
    print('{:.6f}'.format(variance))

In [None]:
# Proportion of variance explained by each component: first the raw array, then one value
# per line with six decimals. A plain loop is used so the cell does not also echo a list of None.
print(pca.explained_variance_ratio_)
for ratio in pca.explained_variance_ratio_:
    print('{:.6f}'.format(ratio))

In [None]:
# Cumulative proportion of variance explained, with a leading 0 so that
# entry k is the total explained by the first k components.
running_total = np.cumsum(pca.explained_variance_ratio_)
variance_ratio_cumsum = np.concatenate(([0.0], running_total))
variance_ratio_cumsum

In [None]:
# Proportion of variance explained by each principal component.
# Components are numbered from 1 so that the first point on the x axis is PC1, not "component 0".
component_numbers = np.arange(1, len(pca.explained_variance_ratio_) + 1)
ax = sns.lineplot(x=component_numbers, y=pca.explained_variance_ratio_, marker='o')
ax.set_xticks(component_numbers)
ax.set(xlabel='Principal component', ylabel='Proportion of variance explained');

In [None]:

# Cumulative proportion of variance explained.
# x is the number of components kept; the series starts at x = 0 because
# variance_ratio_cumsum carries a leading zero.
components_kept = np.arange(len(variance_ratio_cumsum))
ax = sns.lineplot(x=components_kept, y=variance_ratio_cumsum, marker='o')
ax.set_xticks(components_kept)
ax.set(xlabel='Number of principal components', ylabel='Cumulative proportion of variance explained');

In [None]:
# The loading vectors: one row per original variable, one column per principal component.
# Column names V1, V2, ... are derived from the fitted number of components rather than
# hard-coded, so the cell keeps working if the PCA is refit with a different n_components.
pca_loadings = pd.DataFrame(
    pca.components_.T,
    index=df.columns,
    columns=[f'V{k}' for k in range(1, pca.n_components_ + 1)],
)
pca_loadings

In [None]:
# Sum of squared loadings per component; expected to be 1 if each loading vector has unit length.
(pca_loadings**2).sum(axis=0)

In [None]:
# PC score vectors: project the standardized data onto the fitted components and
# wrap the result in a DataFrame with the original row labels and columns PC1, PC2, ...
pc_names = [f'PC{k}' for k in range(1, normalized_df.shape[1] + 1)]
projected_df = pd.DataFrame(
    pca.transform(normalized_df),
    index=normalized_df.index,
    columns=pc_names,
)
projected_df

In [None]:
# Column means of the scores; expected to round to zero because the input was centered.
projected_df.mean(axis = 0).round(6)

In [None]:
# Biplot: observations placed by their first two PC scores (grey reference frame),
# overlaid with the loading vectors of each original variable (orange, separate scale).
fig, ax1 = plt.subplots(figsize=(9,7))

ax1.set_xlim(-3.5,3.5)
ax1.set_ylim(-3.5,3.5)

# Plot Principal Components 1 and 2.
# The sign of PC2 is flipped for both scores and loadings; the sign of a principal
# component is arbitrary, so this only mirrors the picture vertically.
# NOTE(review): presumably done to match a reference figure — confirm.
for row_label in projected_df.index:
    ax1.annotate(
        row_label,
        (projected_df.loc[row_label, 'PC1'], -projected_df.loc[row_label, 'PC2']),
        ha='center',
    )

# Plot reference lines
ax1.hlines(0,-3.5,3.5, linestyles='dotted', colors='grey')
ax1.vlines(0,-3.5,3.5, linestyles='dotted', colors='grey')

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')

# Plot Principal Component loading vectors on a second pair of axes with their own [-1, 1] limits.
ax2 = ax1.twinx().twiny()

ax2.set_ylim(-1,1)
ax2.set_xlim(-1,1)
ax2.tick_params(axis='y', colors='orange')
ax2.set_xlabel('Principal Component 1 loading vectors', color='orange')

# Label each vector slightly beyond its arrow tip: the tip coordinates are scaled by
# `label_scale` so the text does not sit on top of the arrow head.
label_scale = 1.07
for variable in pca_loadings.index:
    ax2.annotate(
        variable,
        (pca_loadings.loc[variable, 'V1'] * label_scale, -pca_loadings.loc[variable, 'V2'] * label_scale),
        color='orange',
    )

# Plot one arrow per original variable. Label-based .loc lookup replaces integer keys
# such as V1[0], which pandas deprecates on a non-integer index, and the loop removes
# the hard-coded assumption of exactly four variables.
for variable in pca_loadings.index:
    ax2.arrow(0, 0, pca_loadings.loc[variable, 'V1'], -pca_loadings.loc[variable, 'V2'], color='orange')
plt.tight_layout()

**PCA for visualization: Hand-written digits**

In [None]:
# Load scikit-learn's bundled hand-written digits dataset and show the shape of its feature matrix.
from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape

In [None]:
# Center and scale the digit features. No axis is passed to mean()/std(), so (assuming
# digits.data is a NumPy array) a single scalar mean and std over all entries is used.
# NOTE(review): this differs from the per-column scaling applied to the USArrests frame — confirm it is intended.
normalized_digits=(digits.data-digits.data.mean())/digits.data.std()

In [None]:
# Project the digits from 64 to 2 dimensions.
# random_state is fixed, as in the other PCA calls in this notebook, so the projection is
# reproducible if scikit-learn selects a randomized SVD solver for this problem size.
# Note: this rebinds `pca`, replacing the USArrests fit from earlier cells.
pca = PCA(n_components=2, random_state=714)
projected = pca.fit_transform(normalized_digits)
print(normalized_digits.shape)
print(projected.shape)

In [None]:
# Proportion of variance explained by each component of the most recently fitted `pca`.
pca.explained_variance_ratio_

In [None]:
# Scatter the digits in the plane of the first two principal components, coloured by
# their true label with a 10-colour discretization of the 'Spectral' colormap.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated and later removed from Matplotlib.
plt.scatter(projected[:, 0], projected[:, 1],
            c=digits.target, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('Spectral', 10))
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar();

**PCA Regression**


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, make_scorer

In [None]:
# Load the residential dataset and preview three rows.
res = pd.read_csv('data/residential.csv')
res.head(3)

In [None]:

# (rows, columns) of the residential table.
res.shape

In [None]:
# Every column except the last is a feature; the last column is the regression target.
feature_columns = res.columns[:-1]
target_column = res.columns[-1]
X = res[feature_columns]
y = res[target_column]

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=714
)

In [None]:
# Baseline: 5-fold cross-validated mean squared error of plain linear regression
# on the training split.
mse_scorer = make_scorer(mean_squared_error)
lr = LinearRegression()
cross_val_lr = cross_val_score(lr, X_train, y_train, cv=5, scoring=mse_scorer)

lr_mse_mean = cross_val_lr.mean()
lr_mse_std = cross_val_lr.std()
print(cross_val_lr)
print('=================================================')
print('MSE of LR')
print(f'Mean: {lr_mse_mean}, Std: {lr_mse_std}')

In [None]:
def cross_validate_pcr(n_components, X, y, cv=5, random_state=714):
    """Cross-validate principal component regression (PCA followed by linear regression).

    Parameters
    ----------
    n_components : int
        Number of principal components kept by the PCA step.
    X, y
        Training features and target, passed straight to ``cross_val_score``.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int, default 714
        Seed handed to PCA so repeated runs give the same scores.

    Returns
    -------
    pipe : Pipeline
        The (unfitted) PCA + LinearRegression pipeline that was evaluated.
    scores : ndarray
        Mean squared error on each fold.

    NOTE(review): PCA is applied to X as-is here, without the standardization used
    earlier in the notebook — confirm that is intended.
    """
    pipe = Pipeline([('pca', PCA(n_components=n_components, random_state=random_state)),
                     ('lr', LinearRegression())])
    scores = cross_val_score(pipe, X, y, cv=cv, scoring=make_scorer(mean_squared_error))
    return pipe, scores


# Evaluate PCR for a decreasing number of components. One helper replaces three
# copy-pasted cells; after the loop, pca_lr_pipe and cross_val_pca refer to the
# last setting (5 components), exactly as they did after the original final cell.
for n_components in (50, 25, 5):
    pca_lr_pipe, cross_val_pca = cross_validate_pcr(n_components, X_train, y_train)
    print(f'n_components = {n_components}')
    print(cross_val_pca)
    print('=================================================')
    print('Result of PCR')
    print(f'Mean: {cross_val_pca.mean()}, Std: {cross_val_pca.std()}')